]> git.karo-electronics.de Git - karo-tx-linux.git/blob - tools/perf/builtin-trace.c
Merge tag 'perf-core-for-mingo-4.12-20170419' of git://git.kernel.org/pub/scm/linux...
[karo-tx-linux.git] / tools / perf / builtin-trace.c
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/path.h"
28 #include "util/session.h"
29 #include "util/thread.h"
30 #include <subcmd/parse-options.h>
31 #include "util/strlist.h"
32 #include "util/intlist.h"
33 #include "util/thread_map.h"
34 #include "util/stat.h"
35 #include "trace/beauty/beauty.h"
36 #include "trace-event.h"
37 #include "util/parse-events.h"
38 #include "util/bpf-loader.h"
39 #include "callchain.h"
40 #include "print_binary.h"
41 #include "string2.h"
42 #include "syscalltbl.h"
43 #include "rb_resort.h"
44
45 #include <errno.h>
46 #include <inttypes.h>
47 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
48 #include <stdlib.h>
49 #include <string.h>
50 #include <linux/err.h>
51 #include <linux/filter.h>
52 #include <linux/audit.h>
53 #include <linux/kernel.h>
54 #include <linux/random.h>
55 #include <linux/stringify.h>
56 #include <linux/time64.h>
57
58 #include "sane_ctype.h"
59
/* Fallback for libc headers that do not define O_CLOEXEC. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif
63
/*
 * Global state for one 'perf trace' run: the perf_tool callbacks, the
 * per-syscall table and the sys_enter/sys_exit evsels used to hook the
 * raw_syscalls tracepoints, plus the many command-line controlled knobs
 * (filters, summary modes, callchain limits, ...).
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall	*table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* presumably the last thread seen — confirm at use sites */
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall-name filter list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;	/* tool-internal pathname resolution counters */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
115
/*
 * Accessor for one tracepoint field inside a sample's raw_data: the
 * byte offset of the field plus a reader returning it either as an
 * integer (possibly byte-swapped) or as a pointer into the payload.
 * The union reflects that a field is read one way or the other.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
123
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned
 * integer of the given width from the sample payload at the field's
 * offset.  memcpy() is used because raw_data offsets need not be
 * naturally aligned for the access width.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
136
/*
 * Generate tp_field__swapped_u{16,32,64}(): like TP_UINT_FIELD() but
 * byte-swap the value, for samples recorded on a host with the
 * opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
148
149 static int tp_field__init_uint(struct tp_field *field,
150                                struct format_field *format_field,
151                                bool needs_swap)
152 {
153         field->offset = format_field->offset;
154
155         switch (format_field->size) {
156         case 1:
157                 field->integer = tp_field__u8;
158                 break;
159         case 2:
160                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
161                 break;
162         case 4:
163                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
164                 break;
165         case 8:
166                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
167                 break;
168         default:
169                 return -1;
170         }
171
172         return 0;
173 }
174
175 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
176 {
177         return sample->raw_data + field->offset;
178 }
179
180 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
181 {
182         field->offset = format_field->offset;
183         field->pointer = tp_field__ptr;
184         return 0;
185 }
186
/*
 * Per-evsel (->priv) accessors for the raw_syscalls tracepoints: the
 * syscall 'id' field plus either the entry 'args' or the exit 'ret'
 * field — a given evsel is one or the other, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
193
194 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
195                                           struct tp_field *field,
196                                           const char *name)
197 {
198         struct format_field *format_field = perf_evsel__field(evsel, name);
199
200         if (format_field == NULL)
201                 return -1;
202
203         return tp_field__init_uint(field, format_field, evsel->needs_swap);
204 }
205
/*
 * Initialize the syscall_tp member 'name' in evsel->priv from the
 * tracepoint format field of the same name (stringified); evaluates to
 * the init function's return value (0 on success).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
209
/*
 * Look up tracepoint field @name in @evsel's format and initialize
 * @field as a pointer accessor for it.
 *
 * Return: 0 on success, -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
221
/*
 * Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field():
 * binds syscall_tp member 'name' in evsel->priv to the format field of
 * the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
225
/*
 * Free the evsel together with its private syscall_tp state.  The priv
 * pointer must be released first, before perf_evsel__delete() tears
 * down the evsel itself.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
231
232 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
233 {
234         evsel->priv = malloc(sizeof(struct syscall_tp));
235         if (evsel->priv != NULL) {
236                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
237                         goto out_delete;
238
239                 evsel->handler = handler;
240                 return 0;
241         }
242
243         return -ENOMEM;
244
245 out_delete:
246         zfree(&evsel->priv);
247         return -ENOENT;
248 }
249
250 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
251 {
252         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
253
254         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
255         if (IS_ERR(evsel))
256                 evsel = perf_evsel__newtp("syscalls", direction);
257
258         if (IS_ERR(evsel))
259                 return NULL;
260
261         if (perf_evsel__init_syscall_tp(evsel, handler))
262                 goto out_delete;
263
264         return evsel;
265
266 out_delete:
267         perf_evsel__delete_priv(evsel);
268         return NULL;
269 }
270
/* Read the syscall_tp field 'name' from @sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read the syscall_tp field 'name' from @sample as a payload pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
278
/*
 * Table mapping integer values to human-readable names; 'offset' is
 * subtracted from the value before indexing, for tables whose first
 * entry does not correspond to zero.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
284
/* Define strarray__<array> wrapping a 0-based string table. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* As DEFINE_STRARRAY(), but index 0 maps to value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
295
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297                                                 const char *intfmt,
298                                                 struct syscall_arg *arg)
299 {
300         struct strarray *sa = arg->parm;
301         int idx = arg->val - sa->offset;
302
303         if (idx < 0 || idx >= sa->nr_entries)
304                 return scnprintf(bf, size, intfmt, arg->val);
305
306         return scnprintf(bf, size, "%s", sa->entries[idx]);
307 }
308
/* strarray beautifier with decimal fallback for out-of-table values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
316
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* strarray beautifier with hex fallback, used for ioctl cmd values. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
330
/* Forward declaration: fd beautifier, defined later in this file. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

/* Fallback for headers lacking the *at() "current directory" sentinel. */
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
339
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341                                            struct syscall_arg *arg)
342 {
343         int fd = arg->val;
344
345         if (fd == AT_FDCWD)
346                 return scnprintf(bf, size, "CWD");
347
348         return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352
/* Forward declaration: close(2) fd beautifier, defined later. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357
358 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359                                          struct syscall_arg *arg)
360 {
361         return scnprintf(bf, size, "%#lx", arg->val);
362 }
363
364 #define SCA_HEX syscall_arg__scnprintf_hex
365
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367                                          struct syscall_arg *arg)
368 {
369         return scnprintf(bf, size, "%d", arg->val);
370 }
371
372 #define SCA_INT syscall_arg__scnprintf_int
373
/* bpf(2) command names, BPF_* without the prefix. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) ops; offset 1 because EPOLL_CTL_ADD == 1. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer/setitimer(2) which values, ITIMER_* without the prefix. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation names, KEYCTL_* without the prefix. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence names; DATA/HOLE only when the headers define them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) command names, F_* without the prefix. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* get/setrlimit(2) resource names, RLIMIT_* without the prefix. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) how values, SIG_* without the prefix. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) clk_id names, CLOCK_* without the prefix. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2) address family names, AF_* without the prefix. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
439
440 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
441                                                  struct syscall_arg *arg)
442 {
443         size_t printed = 0;
444         int mode = arg->val;
445
446         if (mode == F_OK) /* 0 */
447                 return scnprintf(bf, size, "F");
448 #define P_MODE(n) \
449         if (mode & n##_OK) { \
450                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
451                 mode &= ~n##_OK; \
452         }
453
454         P_MODE(R);
455         P_MODE(W);
456         P_MODE(X);
457 #undef P_MODE
458
459         if (mode)
460                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
461
462         return printed;
463 }
464
465 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
466
/* Forward declaration: pathname beautifier, defined later. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
471
472 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
473                                                 struct syscall_arg *arg)
474 {
475         int printed = 0, flags = arg->val;
476
477 #define P_FLAG(n) \
478         if (flags & O_##n) { \
479                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
480                 flags &= ~O_##n; \
481         }
482
483         P_FLAG(CLOEXEC);
484         P_FLAG(NONBLOCK);
485 #undef P_FLAG
486
487         if (flags)
488                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
489
490         return printed;
491 }
492
493 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
494
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * tty ioctl request names, indexed from TCGETS (0x5401) via the
 * strarray offset below; designated initializers skip the gaps in the
 * request number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* Fallbacks for headers lacking the getrandom(2) flag bits. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
528
529 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
530                                                    struct syscall_arg *arg)
531 {
532         int printed = 0, flags = arg->val;
533
534 #define P_FLAG(n) \
535         if (flags & GRND_##n) { \
536                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
537                 flags &= ~GRND_##n; \
538         }
539
540         P_FLAG(RANDOM);
541         P_FLAG(NONBLOCK);
542 #undef P_FLAG
543
544         if (flags)
545                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
546
547         return printed;
548 }
549
550 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
551
/*
 * Initializer fragment for syscall_fmt entries: beautify argument
 * number 'arg' via strarray__<array> (the 'name' parameter is unused,
 * kept for readability at the use sites).
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
555
556 #include "trace/beauty/eventfd.c"
557 #include "trace/beauty/flock.c"
558 #include "trace/beauty/futex_op.c"
559 #include "trace/beauty/mmap.c"
560 #include "trace/beauty/mode_t.c"
561 #include "trace/beauty/msg_flags.c"
562 #include "trace/beauty/open_flags.c"
563 #include "trace/beauty/perf_event_open.c"
564 #include "trace/beauty/pid.c"
565 #include "trace/beauty/sched_policy.c"
566 #include "trace/beauty/seccomp.c"
567 #include "trace/beauty/signum.c"
568 #include "trace/beauty/socket_type.c"
569 #include "trace/beauty/waitid_options.c"
570
571 static struct syscall_fmt {
572         const char *name;
573         const char *alias;
574         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
575         void       *arg_parm[6];
576         bool       errmsg;
577         bool       errpid;
578         bool       timeout;
579         bool       hexret;
580 } syscall_fmts[] = {
581         { .name     = "access",     .errmsg = true,
582           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
583         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
584         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
585         { .name     = "brk",        .hexret = true,
586           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
587         { .name     = "chdir",      .errmsg = true, },
588         { .name     = "chmod",      .errmsg = true, },
589         { .name     = "chroot",     .errmsg = true, },
590         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
591         { .name     = "clone",      .errpid = true, },
592         { .name     = "close",      .errmsg = true,
593           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
594         { .name     = "connect",    .errmsg = true, },
595         { .name     = "creat",      .errmsg = true, },
596         { .name     = "dup",        .errmsg = true, },
597         { .name     = "dup2",       .errmsg = true, },
598         { .name     = "dup3",       .errmsg = true, },
599         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
600         { .name     = "eventfd2",   .errmsg = true,
601           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
602         { .name     = "faccessat",  .errmsg = true, },
603         { .name     = "fadvise64",  .errmsg = true, },
604         { .name     = "fallocate",  .errmsg = true, },
605         { .name     = "fchdir",     .errmsg = true, },
606         { .name     = "fchmod",     .errmsg = true, },
607         { .name     = "fchmodat",   .errmsg = true,
608           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
609         { .name     = "fchown",     .errmsg = true, },
610         { .name     = "fchownat",   .errmsg = true,
611           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612         { .name     = "fcntl",      .errmsg = true,
613           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
614           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
615         { .name     = "fdatasync",  .errmsg = true, },
616         { .name     = "flock",      .errmsg = true,
617           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
618         { .name     = "fsetxattr",  .errmsg = true, },
619         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
620         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
621         { .name     = "fstatfs",    .errmsg = true, },
622         { .name     = "fsync",    .errmsg = true, },
623         { .name     = "ftruncate", .errmsg = true, },
624         { .name     = "futex",      .errmsg = true,
625           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
626         { .name     = "futimesat", .errmsg = true,
627           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
628         { .name     = "getdents",   .errmsg = true, },
629         { .name     = "getdents64", .errmsg = true, },
630         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
631         { .name     = "getpid",     .errpid = true, },
632         { .name     = "getpgid",    .errpid = true, },
633         { .name     = "getppid",    .errpid = true, },
634         { .name     = "getrandom",  .errmsg = true,
635           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
636         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
637         { .name     = "getxattr",   .errmsg = true, },
638         { .name     = "inotify_add_watch",          .errmsg = true, },
639         { .name     = "ioctl",      .errmsg = true,
640           .arg_scnprintf = {
641 #if defined(__i386__) || defined(__x86_64__)
642 /*
643  * FIXME: Make this available to all arches.
644  */
645                              [1] = SCA_STRHEXARRAY, /* cmd */
646                              [2] = SCA_HEX, /* arg */ },
647           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
648 #else
649                              [2] = SCA_HEX, /* arg */ }, },
650 #endif
651         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
652         { .name     = "kill",       .errmsg = true,
653           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
654         { .name     = "lchown",    .errmsg = true, },
655         { .name     = "lgetxattr",  .errmsg = true, },
656         { .name     = "linkat",     .errmsg = true,
657           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
658         { .name     = "listxattr",  .errmsg = true, },
659         { .name     = "llistxattr", .errmsg = true, },
660         { .name     = "lremovexattr",  .errmsg = true, },
661         { .name     = "lseek",      .errmsg = true,
662           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
663           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
664         { .name     = "lsetxattr",  .errmsg = true, },
665         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
666         { .name     = "lsxattr",    .errmsg = true, },
667         { .name     = "madvise",    .errmsg = true,
668           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
669                              [2] = SCA_MADV_BHV, /* behavior */ }, },
670         { .name     = "mkdir",    .errmsg = true, },
671         { .name     = "mkdirat",    .errmsg = true,
672           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
673         { .name     = "mknod",      .errmsg = true, },
674         { .name     = "mknodat",    .errmsg = true,
675           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676         { .name     = "mlock",      .errmsg = true,
677           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
678         { .name     = "mlockall",   .errmsg = true,
679           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
680         { .name     = "mmap",       .hexret = true,
681           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
682                              [2] = SCA_MMAP_PROT, /* prot */
683                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
684         { .name     = "mprotect",   .errmsg = true,
685           .arg_scnprintf = { [0] = SCA_HEX, /* start */
686                              [2] = SCA_MMAP_PROT, /* prot */ }, },
687         { .name     = "mq_unlink", .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
689         { .name     = "mremap",     .hexret = true,
690           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
691                              [3] = SCA_MREMAP_FLAGS, /* flags */
692                              [4] = SCA_HEX, /* new_addr */ }, },
693         { .name     = "munlock",    .errmsg = true,
694           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
695         { .name     = "munmap",     .errmsg = true,
696           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
697         { .name     = "name_to_handle_at", .errmsg = true,
698           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
699         { .name     = "newfstatat", .errmsg = true,
700           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
701         { .name     = "open",       .errmsg = true,
702           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
703         { .name     = "open_by_handle_at", .errmsg = true,
704           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
705                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
706         { .name     = "openat",     .errmsg = true,
707           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
708                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
709         { .name     = "perf_event_open", .errmsg = true,
710           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
711                              [3] = SCA_FD,  /* group_fd */
712                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
713         { .name     = "pipe2",      .errmsg = true,
714           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
715         { .name     = "poll",       .errmsg = true, .timeout = true, },
716         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
717         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
718         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
719         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
720         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
721         { .name     = "pwritev",    .errmsg = true, },
722         { .name     = "read",       .errmsg = true, },
723         { .name     = "readlink",   .errmsg = true, },
724         { .name     = "readlinkat", .errmsg = true,
725           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
726         { .name     = "readv",      .errmsg = true, },
727         { .name     = "recvfrom",   .errmsg = true,
728           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
729         { .name     = "recvmmsg",   .errmsg = true,
730           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
731         { .name     = "recvmsg",    .errmsg = true,
732           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
733         { .name     = "removexattr", .errmsg = true, },
734         { .name     = "renameat",   .errmsg = true,
735           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
736         { .name     = "rmdir",    .errmsg = true, },
737         { .name     = "rt_sigaction", .errmsg = true,
738           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
739         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
740         { .name     = "rt_sigqueueinfo", .errmsg = true,
741           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
742         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
743           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
744         { .name     = "sched_getattr",        .errmsg = true, },
745         { .name     = "sched_setattr",        .errmsg = true, },
746         { .name     = "sched_setscheduler",   .errmsg = true,
747           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
748         { .name     = "seccomp", .errmsg = true,
749           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
750                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
751         { .name     = "select",     .errmsg = true, .timeout = true, },
752         { .name     = "sendmmsg",    .errmsg = true,
753           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
754         { .name     = "sendmsg",    .errmsg = true,
755           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
756         { .name     = "sendto",     .errmsg = true,
757           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
758         { .name     = "set_tid_address", .errpid = true, },
759         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
760         { .name     = "setpgid",    .errmsg = true, },
761         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
762         { .name     = "setxattr",   .errmsg = true, },
763         { .name     = "shutdown",   .errmsg = true, },
764         { .name     = "socket",     .errmsg = true,
765           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
766                              [1] = SCA_SK_TYPE, /* type */ },
767           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
768         { .name     = "socketpair", .errmsg = true,
769           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
770                              [1] = SCA_SK_TYPE, /* type */ },
771           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
772         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
773         { .name     = "statfs",     .errmsg = true, },
774         { .name     = "statx",      .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
776                              [2] = SCA_STATX_FLAGS, /* flags */
777                              [3] = SCA_STATX_MASK, /* mask */ }, },
778         { .name     = "swapoff",    .errmsg = true,
779           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
780         { .name     = "swapon",     .errmsg = true,
781           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
782         { .name     = "symlinkat",  .errmsg = true,
783           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
784         { .name     = "tgkill",     .errmsg = true,
785           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
786         { .name     = "tkill",      .errmsg = true,
787           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
788         { .name     = "truncate",   .errmsg = true, },
789         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
790         { .name     = "unlinkat",   .errmsg = true,
791           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
792         { .name     = "utime",  .errmsg = true, },
793         { .name     = "utimensat",  .errmsg = true,
794           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
795         { .name     = "utimes",  .errmsg = true, },
796         { .name     = "vmsplice",  .errmsg = true, },
797         { .name     = "wait4",      .errpid = true,
798           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
799         { .name     = "waitid",     .errpid = true,
800           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
801         { .name     = "write",      .errmsg = true, },
802         { .name     = "writev",     .errmsg = true, },
803 };
804
805 static int syscall_fmt__cmp(const void *name, const void *fmtp)
806 {
807         const struct syscall_fmt *fmt = fmtp;
808         return strcmp(name, fmt->name);
809 }
810
811 static struct syscall_fmt *syscall_fmt__find(const char *name)
812 {
813         const int nmemb = ARRAY_SIZE(syscall_fmts);
814         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
815 }
816
/*
 * Per-syscall descriptor, filled in lazily by trace__read_syscall_info()
 * from the syscalls:sys_enter_* tracepoint format and the syscall_fmts[]
 * table.
 */
struct syscall {
	struct event_format *tp_format;
	int                 nr_args;
	struct format_field *args;	/* tracepoint fields, minus the syscall nr field */
	const char          *name;
	bool                is_exit;	/* exit/exit_group: printed at sys_enter time */
	struct syscall_fmt  *fmt;	/* NULL when the name has no syscall_fmts[] entry */
	size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void                **arg_parm;
};
827
828 /*
829  * We need to have this 'calculated' boolean because in some cases we really
830  * don't know what is the duration of a syscall, for instance, when we start
831  * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for duration and for the
833  * start timestamp.
834  */
835 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
836 {
837         double duration = (double)t / NSEC_PER_MSEC;
838         size_t printed = fprintf(fp, "(");
839
840         if (!calculated)
841                 printed += fprintf(fp, "     ?   ");
842         else if (duration >= 1.0)
843                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
844         else if (duration >= 0.01)
845                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
846         else
847                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
848         return printed + fprintf(fp, "): ");
849 }
850
/**
 * Per-thread state hung off thread->priv.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64               entry_time;	/* timestamp of the last sys_enter */
	bool              entry_pending; /* sys_enter formatted, waiting for sys_exit */
	unsigned long     nr_events;
	unsigned long     pfmaj, pfmin;	/* major/minor page fault counts */
	char              *entry_str;	/* formatting buffer, trace__entry_str_size bytes */
	double            runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool          pending_open;
		unsigned int  namelen;
		char          *name;
	} filename;
	struct {
		int       max;		/* highest fd cached so far, -1 when empty */
		char      **table;	/* fd -> strdup'ed pathname */
	} paths;

	struct intlist *syscall_stats;	/* syscall id -> struct stats */
};
877
878 static struct thread_trace *thread_trace__new(void)
879 {
880         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
881
882         if (ttrace)
883                 ttrace->paths.max = -1;
884
885         ttrace->syscall_stats = intlist__new(NULL);
886
887         return ttrace;
888 }
889
/*
 * Get the thread_trace for 'thread', creating it on first use, and bump its
 * event counter.  On failure (NULL thread or no memory) warn on 'fp' and
 * return NULL, in which case the caller drops the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* First event for this thread: lazily attach a thread_trace. */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
912
/* Page fault selection bits: major/minor faults (cf. pfmaj/pfmin above). */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread buffer (thread_trace::entry_str) used to format syscall entries. */
static const size_t trace__entry_str_size = 2048;
917
918 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
919 {
920         struct thread_trace *ttrace = thread__priv(thread);
921
922         if (fd > ttrace->paths.max) {
923                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
924
925                 if (npath == NULL)
926                         return -1;
927
928                 if (ttrace->paths.max != -1) {
929                         memset(npath + ttrace->paths.max + 1, 0,
930                                (fd - ttrace->paths.max) * sizeof(char *));
931                 } else {
932                         memset(npath, 0, (fd + 1) * sizeof(char *));
933                 }
934
935                 ttrace->paths.table = npath;
936                 ttrace->paths.max   = fd;
937         }
938
939         ttrace->paths.table[fd] = strdup(pathname);
940
941         return ttrace->paths.table[fd] != NULL ? 0 : -1;
942 }
943
944 static int thread__read_fd_path(struct thread *thread, int fd)
945 {
946         char linkname[PATH_MAX], pathname[PATH_MAX];
947         struct stat st;
948         int ret;
949
950         if (thread->pid_ == thread->tid) {
951                 scnprintf(linkname, sizeof(linkname),
952                           "/proc/%d/fd/%d", thread->pid_, fd);
953         } else {
954                 scnprintf(linkname, sizeof(linkname),
955                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
956         }
957
958         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
959                 return -1;
960
961         ret = readlink(linkname, pathname, sizeof(pathname));
962
963         if (ret < 0 || ret > st.st_size)
964                 return -1;
965
966         pathname[ret] = '\0';
967         return trace__set_fd_pathname(thread, fd, pathname);
968 }
969
970 static const char *thread__fd_path(struct thread *thread, int fd,
971                                    struct trace *trace)
972 {
973         struct thread_trace *ttrace = thread__priv(thread);
974
975         if (ttrace == NULL)
976                 return NULL;
977
978         if (fd < 0)
979                 return NULL;
980
981         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
982                 if (!trace->live)
983                         return NULL;
984                 ++trace->stats.proc_getname;
985                 if (thread__read_fd_path(thread, fd))
986                         return NULL;
987         }
988
989         return ttrace->paths.table[fd];
990 }
991
992 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
993                                         struct syscall_arg *arg)
994 {
995         int fd = arg->val;
996         size_t printed = scnprintf(bf, size, "%d", fd);
997         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
998
999         if (path)
1000                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1001
1002         return printed;
1003 }
1004
1005 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1006                                               struct syscall_arg *arg)
1007 {
1008         int fd = arg->val;
1009         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1010         struct thread_trace *ttrace = thread__priv(arg->thread);
1011
1012         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1013                 zfree(&ttrace->paths.table[fd]);
1014
1015         return printed;
1016 }
1017
/*
 * Remember the filename pointer argument and where in entry_str it would be
 * printed, so the vfs_getname handler can splice in the resolved name there
 * (see the thread_trace comment above).
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1026
1027 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1028                                               struct syscall_arg *arg)
1029 {
1030         unsigned long ptr = arg->val;
1031
1032         if (!arg->trace->vfs_getname)
1033                 return scnprintf(bf, size, "%#x", ptr);
1034
1035         thread__set_filename_pos(arg->thread, bf, ptr);
1036         return 0;
1037 }
1038
1039 static bool trace__filter_duration(struct trace *trace, double t)
1040 {
1041         return t < (trace->duration_filter * NSEC_PER_MSEC);
1042 }
1043
/* Print 'tstamp' in milliseconds relative to trace->base_time. */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}
1050
1051 /*
1052  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1053  * using ttrace->entry_time for a thread that receives a sys_exit without
1054  * first having received a sys_enter ("poll" issued before tracing session
1055  * starts, lost sys_enter exit due to ring buffer overflow).
1056  */
1057 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1058 {
1059         if (tstamp > 0)
1060                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1061
1062         return fprintf(fp, "         ? ");
1063 }
1064
/*
 * Flags set asynchronously by sig_handler() and polled by the main loop.
 *
 * Fix: they must be 'volatile' so the compiler cannot cache them in a
 * register across the signal delivery (NOTE(review): volatile sig_atomic_t
 * would be the strictly portable type for signal-handler flags).
 */
static volatile bool done = false;
static volatile bool interrupted = false;

/* Async signal handler: only sets flags, does no other work. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1073
/*
 * Print the common line prefix: timestamp, duration and, when tracing more
 * than one thread, "comm/tid " (comm only with --comm/show_comm).
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}
1088
/*
 * Handle a non-sample (side-band) event: warn loudly about lost events,
 * forward everything to the machine code to keep thread/map state updated.
 */
static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}
1107
/* perf_tool callback adapter: recover the trace and delegate to trace__process_event(). */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}
1116
/*
 * Kernel address resolver that honors kptr_restrict: when kernel pointers
 * are restricted, warn once per machine and resolve nothing, otherwise
 * fall through to the normal machine resolver.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Only nag the user the first time. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1134
/*
 * Set up symbol resolution for the host machine and synthesize events for
 * the already-running threads being traced.  Returns 0 on success or a
 * negative error; symbol state is torn down again if synthesizing fails.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Use the kptr_restrict-aware resolver defined above. */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1157
/*
 * Choose a pretty-printer for each argument of 'sc': prefer the explicit
 * syscall_fmts[] entry, then fall back to heuristics keyed on the
 * tracepoint field's type and name (filenames, pointers, pids, modes, fds).
 * Returns 0 on success, -1 when the formatter table can't be allocated.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* Explicit per-syscall formatter wins over the heuristics below. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1203
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, find its
 * syscall_fmts[] entry and hook up the syscalls:sys_enter_* tracepoint
 * format.  The table is grown (zero-filled) on demand.  Returns 0 on
 * success, -1 on any failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly appended entries. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero the whole table. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls are known to the tracepoints only under an alias. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * The first field holds the syscall number: '__syscall_nr', or just
	 * 'nr' on older kernels.  It is not a real syscall argument, so skip
	 * it (when present).
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1262
/*
 * Translate the -e/--expr syscall-name qualifier list into syscall ids in
 * trace->ev_qualifier_ids.  All invalid names are reported in one message;
 * returns 0 when every name resolved, -EINVAL otherwise (with the ids
 * array freed).
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* Accumulate all bad names into a single error line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1308
1309 /*
1310  * args is to be interpreted as a series of longs but we need to handle
1311  * 8-byte unaligned accesses. args points to raw_data within the event
1312  * and raw_data is guaranteed to be 8-byte unaligned because it is
1313  * preceded by raw_size which is a u32. So we need to copy args to a temp
1314  * variable to read it. Most notably this avoids extended load instructions
1315  * on unaligned addresses
1316  */
1317
/*
 * Format the syscall arguments pointed to by 'args' into 'bf'.  Uses the
 * per-argument formatters set up in syscall__set_arg_fmts(); falls back to
 * printing six raw longs when the tracepoint format couldn't be read.
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx    = 0,
			.mask   = 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A formatter may mask out args it already consumed. */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1390
/* Signature of the per-tracepoint sample handlers (sys_enter, sys_exit, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1394
/*
 * Return the syscall descriptor for 'id', reading it lazily via
 * trace__read_syscall_info() on first use.  Returns NULL (optionally with a
 * diagnostic, depending on verbosity) for invalid or unreadable syscalls.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Still unknown after the read attempt: give up. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1437
/*
 * Fold the duration of the just-finished syscall 'id' into the thread's
 * per-syscall statistics (used by --summary).  Allocation failures are
 * silently ignored — the sample is simply not accounted.
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	/* First sample for this syscall id: allocate its stats. */
	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	/* No (sane) entry timestamp: account a zero duration. */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
1463
/*
 * If the current thread has a syscall entry pending (its sys_exit hasn't
 * arrived yet) when another thread's event shows up, print that entry now,
 * suffixed with "...", and clear the pending state.  Returns the number of
 * characters printed (0 when nothing was pending).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1486
/*
 * syscalls:sys_enter_* handler: format "name(args" into the thread's
 * entry_str.  Printing is normally deferred until the matching sys_exit;
 * exit/exit_group (sc->is_exit) get no sys_exit, so they are printed right
 * away.  Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread formatting buffer. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come for exit/exit_group: print now. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1545
1546 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1547                                     struct perf_sample *sample,
1548                                     struct callchain_cursor *cursor)
1549 {
1550         struct addr_location al;
1551
1552         if (machine__resolve(trace->host, &al, sample) < 0 ||
1553             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1554                 return -1;
1555
1556         return 0;
1557 }
1558
1559 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1560 {
1561         /* TODO: user-configurable print_opts */
1562         const unsigned int print_opts = EVSEL__PRINT_SYM |
1563                                         EVSEL__PRINT_DSO |
1564                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1565
1566         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1567 }
1568
1569 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1570                            union perf_event *event __maybe_unused,
1571                            struct perf_sample *sample)
1572 {
1573         long ret;
1574         u64 duration = 0;
1575         bool duration_calculated = false;
1576         struct thread *thread;
1577         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1578         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1579         struct thread_trace *ttrace;
1580
1581         if (sc == NULL)
1582                 return -1;
1583
1584         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1585         ttrace = thread__trace(thread, trace->output);
1586         if (ttrace == NULL)
1587                 goto out_put;
1588
1589         if (trace->summary)
1590                 thread__update_stats(ttrace, id, sample);
1591
1592         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1593
1594         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1595                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1596                 ttrace->filename.pending_open = false;
1597                 ++trace->stats.vfs_getname;
1598         }
1599
1600         if (ttrace->entry_time) {
1601                 duration = sample->time - ttrace->entry_time;
1602                 if (trace__filter_duration(trace, duration))
1603                         goto out;
1604                 duration_calculated = true;
1605         } else if (trace->duration_filter)
1606                 goto out;
1607
1608         if (sample->callchain) {
1609                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1610                 if (callchain_ret == 0) {
1611                         if (callchain_cursor.nr < trace->min_stack)
1612                                 goto out;
1613                         callchain_ret = 1;
1614                 }
1615         }
1616
1617         if (trace->summary_only)
1618                 goto out;
1619
1620         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1621
1622         if (ttrace->entry_pending) {
1623                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1624         } else {
1625                 fprintf(trace->output, " ... [");
1626                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1627                 fprintf(trace->output, "]: %s()", sc->name);
1628         }
1629
1630         if (sc->fmt == NULL) {
1631 signed_print:
1632                 fprintf(trace->output, ") = %ld", ret);
1633         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1634                 char bf[STRERR_BUFSIZE];
1635                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1636                            *e = audit_errno_to_name(-ret);
1637
1638                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1639         } else if (ret == 0 && sc->fmt->timeout)
1640                 fprintf(trace->output, ") = 0 Timeout");
1641         else if (sc->fmt->hexret)
1642                 fprintf(trace->output, ") = %#lx", ret);
1643         else if (sc->fmt->errpid) {
1644                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1645
1646                 if (child != NULL) {
1647                         fprintf(trace->output, ") = %ld", ret);
1648                         if (child->comm_set)
1649                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1650                         thread__put(child);
1651                 }
1652         } else
1653                 goto signed_print;
1654
1655         fputc('\n', trace->output);
1656
1657         if (callchain_ret > 0)
1658                 trace__fprintf_callchain(trace, sample);
1659         else if (callchain_ret < 0)
1660                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1661 out:
1662         ttrace->entry_pending = false;
1663         err = 0;
1664 out_put:
1665         thread__put(thread);
1666         return err;
1667 }
1668
/*
 * probe:vfs_getname handler: captures the pathname being resolved by the
 * kernel and, if a syscall entry line is pending with the raw pointer value
 * of a filename argument at entry_str_pos, splices the string into
 * entry_str in its place.  See trace__sys_enter/trace__sys_exit.
 *
 * Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
                              union perf_event *event __maybe_unused,
                              struct perf_sample *sample)
{
        struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        struct thread_trace *ttrace;
        size_t filename_len, entry_str_len, to_move;
        ssize_t remaining_space;
        char *pos;
        const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

        if (!thread)
                goto out;

        ttrace = thread__priv(thread);
        if (!ttrace)
                goto out_put;

        filename_len = strlen(filename);
        if (filename_len == 0)
                goto out_put;

        /* Grow the per-thread copy of the filename when needed. */
        if (ttrace->filename.namelen < filename_len) {
                char *f = realloc(ttrace->filename.name, filename_len + 1);

                if (f == NULL)
                        goto out_put;

                ttrace->filename.namelen = filename_len;
                ttrace->filename.name = f;
        }

        strcpy(ttrace->filename.name, filename);
        ttrace->filename.pending_open = true;

        /* No syscall argument is waiting for this name: nothing to splice. */
        if (!ttrace->filename.ptr)
                goto out_put;

        entry_str_len = strlen(ttrace->entry_str);
        remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
        if (remaining_space <= 0)
                goto out_put;

        /* Keep only the tail of the name if the whole thing doesn't fit. */
        if (filename_len > (size_t)remaining_space) {
                filename += filename_len - remaining_space;
                filename_len = remaining_space;
        }

        /* Open a gap at entry_str_pos and insert the filename there. */
        to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
        pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
        memmove(pos + filename_len, pos, to_move);
        memcpy(pos, filename, filename_len);

        ttrace->filename.ptr = 0;
        ttrace->filename.entry_str_pos = 0;
out_put:
        thread__put(thread);
out:
        return 0;
}
1729
/*
 * sched:sched_stat_runtime handler: accumulates on-CPU time in milliseconds
 * both per-thread and globally, for the summary output.
 *
 * Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
                                     union perf_event *event __maybe_unused,
                                     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
        double runtime_ms = (double)runtime / NSEC_PER_MSEC;
        struct thread *thread = machine__findnew_thread(trace->host,
                                                        sample->pid,
                                                        sample->tid);
        struct thread_trace *ttrace = thread__trace(thread, trace->output);

        if (ttrace == NULL)
                goto out_dump;

        ttrace->runtime_ms += runtime_ms;
        trace->runtime_ms += runtime_ms;
out_put:
        thread__put(thread);
        return 0;

out_dump:
        /* No per-thread state could be attached: dump the raw event fields. */
        fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
               evsel->name,
               perf_evsel__strval(evsel, sample, "comm"),
               (pid_t)perf_evsel__intval(evsel, sample, "pid"),
               runtime,
               perf_evsel__intval(evsel, sample, "vruntime"));
        goto out_put;
}
1759
1760 static void bpf_output__printer(enum binary_printer_ops op,
1761                                 unsigned int val, void *extra)
1762 {
1763         FILE *output = extra;
1764         unsigned char ch = (unsigned char)val;
1765
1766         switch (op) {
1767         case BINARY_PRINT_CHAR_DATA:
1768                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1769                 break;
1770         case BINARY_PRINT_DATA_BEGIN:
1771         case BINARY_PRINT_LINE_BEGIN:
1772         case BINARY_PRINT_ADDR:
1773         case BINARY_PRINT_NUM_DATA:
1774         case BINARY_PRINT_NUM_PAD:
1775         case BINARY_PRINT_SEP:
1776         case BINARY_PRINT_CHAR_PAD:
1777         case BINARY_PRINT_LINE_END:
1778         case BINARY_PRINT_DATA_END:
1779         default:
1780                 break;
1781         }
1782 }
1783
1784 static void bpf_output__fprintf(struct trace *trace,
1785                                 struct perf_sample *sample)
1786 {
1787         print_binary(sample->raw_data, sample->raw_size, 8,
1788                      bpf_output__printer, trace->output);
1789 }
1790
/*
 * Generic handler for events added with --event: prints a strace-like line
 * with the event name and either the BPF output payload or the formatted
 * tracepoint fields, plus an optional callchain.
 *
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
                                union perf_event *event __maybe_unused,
                                struct perf_sample *sample)
{
        int callchain_ret = 0;

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /* Too shallow for --min-stack: skip the whole line. */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out;
                        callchain_ret = 1;
                }
        }

        trace__printf_interrupted_entry(trace, sample);
        trace__fprintf_tstamp(trace, sample->time, trace->output);

        /* Keep columns aligned with the syscall lines' "(duration)" field. */
        if (trace->trace_syscalls)
                fprintf(trace->output, "(         ): ");

        fprintf(trace->output, "%s:", evsel->name);

        if (perf_evsel__is_bpf_output(evsel)) {
                bpf_output__fprintf(trace, sample);
        } else if (evsel->tp_format) {
                event_format__fprintf(evsel->tp_format, sample->cpu,
                                      sample->raw_data, sample->raw_size,
                                      trace->output);
        }

        fprintf(trace->output, ")\n");

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        return 0;
}
1831
1832 static void print_location(FILE *f, struct perf_sample *sample,
1833                            struct addr_location *al,
1834                            bool print_dso, bool print_sym)
1835 {
1836
1837         if ((verbose > 0 || print_dso) && al->map)
1838                 fprintf(f, "%s@", al->map->dso->long_name);
1839
1840         if ((verbose > 0 || print_sym) && al->sym)
1841                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1842                         al->addr - al->sym->start);
1843         else if (al->map)
1844                 fprintf(f, "0x%" PRIx64, al->addr);
1845         else
1846                 fprintf(f, "0x%" PRIx64, sample->addr);
1847 }
1848
/*
 * Software page fault handler (major or minor, per evsel->attr.config):
 * bumps the per-thread fault counters and, unless --summary-only, prints a
 * "maj/minfault [code location] => data location (map-type level)" line
 * plus an optional callchain.
 *
 * Returns 0 on success, -1 on failure.
 */
static int trace__pgfault(struct trace *trace,
                          struct perf_evsel *evsel,
                          union perf_event *event __maybe_unused,
                          struct perf_sample *sample)
{
        struct thread *thread;
        struct addr_location al;
        char map_type = 'd'; /* 'd'ata; may become e'x'ecutable or '?' below */
        struct thread_trace *ttrace;
        int err = -1;
        int callchain_ret = 0;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

        if (sample->callchain) {
                callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
                if (callchain_ret == 0) {
                        /*
                         * NOTE(review): the --min-stack skip jumps to out_put
                         * and so returns -1, unlike the equivalent filter in
                         * trace__sys_exit which returns 0 -- confirm this is
                         * intentional.
                         */
                        if (callchain_cursor.nr < trace->min_stack)
                                goto out_put;
                        callchain_ret = 1;
                }
        }

        ttrace = thread__trace(thread, trace->output);
        if (ttrace == NULL)
                goto out_put;

        if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
                ttrace->pfmaj++;
        else
                ttrace->pfmin++;

        if (trace->summary_only)
                goto out;

        /* Resolve the faulting instruction address. */
        thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
                              sample->ip, &al);

        trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

        fprintf(trace->output, "%sfault [",
                evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
                "maj" : "min");

        print_location(trace->output, sample, &al, false, true);

        fprintf(trace->output, "] => ");

        /* Resolve the faulted-on data address, first in the data maps... */
        thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
                                   sample->addr, &al);

        if (!al.map) {
                /* ...then in the executable maps (e.g. fault on code pages). */
                thread__find_addr_location(thread, sample->cpumode,
                                           MAP__FUNCTION, sample->addr, &al);

                if (al.map)
                        map_type = 'x';
                else
                        map_type = '?';
        }

        print_location(trace->output, sample, &al, true, false);

        fprintf(trace->output, " (%c%c)\n", map_type, al.level);

        if (callchain_ret > 0)
                trace__fprintf_callchain(trace, sample);
        else if (callchain_ret < 0)
                pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
        err = 0;
out_put:
        thread__put(thread);
        return err;
}
1924
1925 static void trace__set_base_time(struct trace *trace,
1926                                  struct perf_evsel *evsel,
1927                                  struct perf_sample *sample)
1928 {
1929         /*
1930          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1931          * and don't use sample->time unconditionally, we may end up having
1932          * some other event in the future without PERF_SAMPLE_TIME for good
1933          * reason, i.e. we may not be interested in its timestamps, just in
1934          * it taking place, picking some piece of information when it
1935          * appears in our event stream (vfs_getname comes to mind).
1936          */
1937         if (trace->base_time == 0 && !trace->full_time &&
1938             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1939                 trace->base_time = sample->time;
1940 }
1941
/*
 * perf_session sample callback used when replaying a perf.data file
 * ('perf trace -i'): skips filtered threads and forwards the sample to the
 * handler registered on its evsel.
 *
 * Always returns 0; handler failures are not propagated to the session.
 */
static int trace__process_sample(struct perf_tool *tool,
                                 union perf_event *event,
                                 struct perf_sample *sample,
                                 struct perf_evsel *evsel,
                                 struct machine *machine __maybe_unused)
{
        struct trace *trace = container_of(tool, struct trace, tool);
        struct thread *thread;
        int err = 0;

        tracepoint_handler handler = evsel->handler;

        thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
        if (thread && thread__is_filtered(thread))
                goto out;

        trace__set_base_time(trace, evsel, sample);

        if (handler) {
                ++trace->nr_events;
                handler(trace, evsel, event, sample);
        }
out:
        thread__put(thread);
        return err;
}
1968
1969 static int trace__record(struct trace *trace, int argc, const char **argv)
1970 {
1971         unsigned int rec_argc, i, j;
1972         const char **rec_argv;
1973         const char * const record_args[] = {
1974                 "record",
1975                 "-R",
1976                 "-m", "1024",
1977                 "-c", "1",
1978         };
1979
1980         const char * const sc_args[] = { "-e", };
1981         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1982         const char * const majpf_args[] = { "-e", "major-faults" };
1983         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1984         const char * const minpf_args[] = { "-e", "minor-faults" };
1985         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1986
1987         /* +1 is for the event string below */
1988         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1989                 majpf_args_nr + minpf_args_nr + argc;
1990         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1991
1992         if (rec_argv == NULL)
1993                 return -ENOMEM;
1994
1995         j = 0;
1996         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1997                 rec_argv[j++] = record_args[i];
1998
1999         if (trace->trace_syscalls) {
2000                 for (i = 0; i < sc_args_nr; i++)
2001                         rec_argv[j++] = sc_args[i];
2002
2003                 /* event string may be different for older kernels - e.g., RHEL6 */
2004                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2005                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2006                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2007                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2008                 else {
2009                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2010                         return -1;
2011                 }
2012         }
2013
2014         if (trace->trace_pgfaults & TRACE_PFMAJ)
2015                 for (i = 0; i < majpf_args_nr; i++)
2016                         rec_argv[j++] = majpf_args[i];
2017
2018         if (trace->trace_pgfaults & TRACE_PFMIN)
2019                 for (i = 0; i < minpf_args_nr; i++)
2020                         rec_argv[j++] = minpf_args[i];
2021
2022         for (i = 0; i < (unsigned int)argc; i++)
2023                 rec_argv[j++] = argv[i];
2024
2025         return cmd_record(j, rec_argv);
2026 }
2027
2028 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2029
2030 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2031 {
2032         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2033
2034         if (IS_ERR(evsel))
2035                 return false;
2036
2037         if (perf_evsel__field(evsel, "pathname") == NULL) {
2038                 perf_evsel__delete(evsel);
2039                 return false;
2040         }
2041
2042         evsel->handler = trace__vfs_getname;
2043         perf_evlist__add(evlist, evsel);
2044         return true;
2045 }
2046
2047 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2048 {
2049         struct perf_evsel *evsel;
2050         struct perf_event_attr attr = {
2051                 .type = PERF_TYPE_SOFTWARE,
2052                 .mmap_data = 1,
2053         };
2054
2055         attr.config = config;
2056         attr.sample_period = 1;
2057
2058         event_attr_init(&attr);
2059
2060         evsel = perf_evsel__new(&attr);
2061         if (evsel)
2062                 evsel->handler = trace__pgfault;
2063
2064         return evsel;
2065 }
2066
/*
 * Dispatch one mmap'ed event in live mode: non-sample records go to the
 * generic machine/thread state machinery, samples are routed to the handler
 * registered on the evsel that produced them.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
        const u32 type = event->header.type;
        struct perf_evsel *evsel;

        if (type != PERF_RECORD_SAMPLE) {
                trace__process_event(trace, trace->host, event, sample);
                return;
        }

        evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
        if (evsel == NULL) {
                fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
                return;
        }

        trace__set_base_time(trace, evsel, sample);

        /* A tracepoint sample without a raw payload cannot be parsed. */
        if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
            sample->raw_data == NULL) {
                fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
                       perf_evsel__name(evsel), sample->tid,
                       sample->cpu, sample->raw_size);
        } else {
                tracepoint_handler handler = evsel->handler;
                handler(trace, evsel, event, sample);
        }
}
2095
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint events, cache the
 * tracepoint fields used on the fast path ("args" pointer and "ret" value),
 * and add both to the evlist.
 *
 * Returns 0 on success, -1 on failure (tracepoint creation or field lookup
 * failed), deleting any partially created evsel.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
        int ret = -1;
        struct perf_evlist *evlist = trace->evlist;
        struct perf_evsel *sys_enter, *sys_exit;

        sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
        if (sys_enter == NULL)
                goto out;

        if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
                goto out_delete_sys_enter;

        sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
        if (sys_exit == NULL)
                goto out_delete_sys_enter;

        if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
                goto out_delete_sys_exit;

        perf_evlist__add(evlist, sys_enter);
        perf_evlist__add(evlist, sys_exit);

        if (callchain_param.enabled && !trace->kernel_syscallchains) {
                /*
                 * We're interested only in the user space callchain
                 * leading to the syscall, allow overriding that for
                 * debugging reasons using --kernel_syscall_callchains
                 */
                sys_exit->attr.exclude_callchain_kernel = 1;
        }

        trace->syscalls.events.sys_enter = sys_enter;
        trace->syscalls.events.sys_exit  = sys_exit;

        ret = 0;
out:
        return ret;

out_delete_sys_exit:
        perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
        perf_evsel__delete_priv(sys_enter);
        goto out;
}
2141
/*
 * Build a tracepoint filter over the syscall ids selected on the command
 * line (negated when "!" was used) and append it to both the sys_enter and
 * sys_exit events, so the kernel drops unselected syscalls at the source.
 *
 * Returns 0 on success, -1 on failure (errno set to ENOMEM if the filter
 * expression could not be allocated).
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
        int err = -1;
        struct perf_evsel *sys_exit;
        char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
                                                trace->ev_qualifier_ids.nr,
                                                trace->ev_qualifier_ids.entries);

        if (filter == NULL)
                goto out_enomem;

        /* perf_evsel__append_tp_filter() returns 0 on success. */
        if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
                                          filter)) {
                sys_exit = trace->syscalls.events.sys_exit;
                err = perf_evsel__append_tp_filter(sys_exit, filter);
        }

        free(filter);
out:
        return err;
out_enomem:
        errno = ENOMEM;
        goto out;
}
2166
2167 static int trace__run(struct trace *trace, int argc, const char **argv)
2168 {
2169         struct perf_evlist *evlist = trace->evlist;
2170         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2171         int err = -1, i;
2172         unsigned long before;
2173         const bool forks = argc > 0;
2174         bool draining = false;
2175
2176         trace->live = true;
2177
2178         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2179                 goto out_error_raw_syscalls;
2180
2181         if (trace->trace_syscalls)
2182                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2183
2184         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2185                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2186                 if (pgfault_maj == NULL)
2187                         goto out_error_mem;
2188                 perf_evlist__add(evlist, pgfault_maj);
2189         }
2190
2191         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2192                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2193                 if (pgfault_min == NULL)
2194                         goto out_error_mem;
2195                 perf_evlist__add(evlist, pgfault_min);
2196         }
2197
2198         if (trace->sched &&
2199             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2200                                    trace__sched_stat_runtime))
2201                 goto out_error_sched_stat_runtime;
2202
2203         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2204         if (err < 0) {
2205                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2206                 goto out_delete_evlist;
2207         }
2208
2209         err = trace__symbols_init(trace, evlist);
2210         if (err < 0) {
2211                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2212                 goto out_delete_evlist;
2213         }
2214
2215         perf_evlist__config(evlist, &trace->opts, NULL);
2216
2217         if (callchain_param.enabled) {
2218                 bool use_identifier = false;
2219
2220                 if (trace->syscalls.events.sys_exit) {
2221                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2222                                                      &trace->opts, &callchain_param);
2223                         use_identifier = true;
2224                 }
2225
2226                 if (pgfault_maj) {
2227                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2228                         use_identifier = true;
2229                 }
2230
2231                 if (pgfault_min) {
2232                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2233                         use_identifier = true;
2234                 }
2235
2236                 if (use_identifier) {
2237                        /*
2238                         * Now we have evsels with different sample_ids, use
2239                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2240                         * from a fixed position in each ring buffer record.
2241                         *
2242                         * As of this the changeset introducing this comment, this
2243                         * isn't strictly needed, as the fields that can come before
2244                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2245                         * some of those for things like copying the payload of
2246                         * pointer syscall arguments, and for vfs_getname we don't
2247                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2248                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2249                         */
2250                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2251                         perf_evlist__reset_sample_bit(evlist, ID);
2252                 }
2253         }
2254
2255         signal(SIGCHLD, sig_handler);
2256         signal(SIGINT, sig_handler);
2257
2258         if (forks) {
2259                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2260                                                     argv, false, NULL);
2261                 if (err < 0) {
2262                         fprintf(trace->output, "Couldn't run the workload!\n");
2263                         goto out_delete_evlist;
2264                 }
2265         }
2266
2267         err = perf_evlist__open(evlist);
2268         if (err < 0)
2269                 goto out_error_open;
2270
2271         err = bpf__apply_obj_config();
2272         if (err) {
2273                 char errbuf[BUFSIZ];
2274
2275                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2276                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2277                          errbuf);
2278                 goto out_error_open;
2279         }
2280
2281         /*
2282          * Better not use !target__has_task() here because we need to cover the
2283          * case where no threads were specified in the command line, but a
2284          * workload was, and in that case we will fill in the thread_map when
2285          * we fork the workload in perf_evlist__prepare_workload.
2286          */
2287         if (trace->filter_pids.nr > 0)
2288                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2289         else if (thread_map__pid(evlist->threads, 0) == -1)
2290                 err = perf_evlist__set_filter_pid(evlist, getpid());
2291
2292         if (err < 0)
2293                 goto out_error_mem;
2294
2295         if (trace->ev_qualifier_ids.nr > 0) {
2296                 err = trace__set_ev_qualifier_filter(trace);
2297                 if (err < 0)
2298                         goto out_errno;
2299
2300                 pr_debug("event qualifier tracepoint filter: %s\n",
2301                          trace->syscalls.events.sys_exit->filter);
2302         }
2303
2304         err = perf_evlist__apply_filters(evlist, &evsel);
2305         if (err < 0)
2306                 goto out_error_apply_filters;
2307
2308         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2309         if (err < 0)
2310                 goto out_error_mmap;
2311
2312         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2313                 perf_evlist__enable(evlist);
2314
2315         if (forks)
2316                 perf_evlist__start_workload(evlist);
2317
2318         if (trace->opts.initial_delay) {
2319                 usleep(trace->opts.initial_delay * 1000);
2320                 perf_evlist__enable(evlist);
2321         }
2322
2323         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2324                                   evlist->threads->nr > 1 ||
2325                                   perf_evlist__first(evlist)->attr.inherit;
2326 again:
2327         before = trace->nr_events;
2328
2329         for (i = 0; i < evlist->nr_mmaps; i++) {
2330                 union perf_event *event;
2331
2332                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2333                         struct perf_sample sample;
2334
2335                         ++trace->nr_events;
2336
2337                         err = perf_evlist__parse_sample(evlist, event, &sample);
2338                         if (err) {
2339                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2340                                 goto next_event;
2341                         }
2342
2343                         trace__handle_event(trace, event, &sample);
2344 next_event:
2345                         perf_evlist__mmap_consume(evlist, i);
2346
2347                         if (interrupted)
2348                                 goto out_disable;
2349
2350                         if (done && !draining) {
2351                                 perf_evlist__disable(evlist);
2352                                 draining = true;
2353                         }
2354                 }
2355         }
2356
2357         if (trace->nr_events == before) {
2358                 int timeout = done ? 100 : -1;
2359
2360                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2361                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2362                                 draining = true;
2363
2364                         goto again;
2365                 }
2366         } else {
2367                 goto again;
2368         }
2369
2370 out_disable:
2371         thread__zput(trace->current);
2372
2373         perf_evlist__disable(evlist);
2374
2375         if (!err) {
2376                 if (trace->summary)
2377                         trace__fprintf_thread_summary(trace, trace->output);
2378
2379                 if (trace->show_tool_stats) {
2380                         fprintf(trace->output, "Stats:\n "
2381                                                " vfs_getname : %" PRIu64 "\n"
2382                                                " proc_getname: %" PRIu64 "\n",
2383                                 trace->stats.vfs_getname,
2384                                 trace->stats.proc_getname);
2385                 }
2386         }
2387
2388 out_delete_evlist:
2389         perf_evlist__delete(evlist);
2390         trace->evlist = NULL;
2391         trace->live = false;
2392         return err;
2393 {
2394         char errbuf[BUFSIZ];
2395
2396 out_error_sched_stat_runtime:
2397         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2398         goto out_error;
2399
2400 out_error_raw_syscalls:
2401         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2402         goto out_error;
2403
2404 out_error_mmap:
2405         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2406         goto out_error;
2407
2408 out_error_open:
2409         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2410
2411 out_error:
2412         fprintf(trace->output, "%s\n", errbuf);
2413         goto out_delete_evlist;
2414
2415 out_error_apply_filters:
2416         fprintf(trace->output,
2417                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2418                 evsel->filter, perf_evsel__name(evsel), errno,
2419                 str_error_r(errno, errbuf, sizeof(errbuf)));
2420         goto out_delete_evlist;
2421 }
2422 out_error_mem:
2423         fprintf(trace->output, "Not enough memory to run!\n");
2424         goto out_delete_evlist;
2425
2426 out_errno:
2427         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2428         goto out_delete_evlist;
2429 }
2430
2431 static int trace__replay(struct trace *trace)
2432 {
2433         const struct perf_evsel_str_handler handlers[] = {
2434                 { "probe:vfs_getname",       trace__vfs_getname, },
2435         };
2436         struct perf_data_file file = {
2437                 .path  = input_name,
2438                 .mode  = PERF_DATA_MODE_READ,
2439                 .force = trace->force,
2440         };
2441         struct perf_session *session;
2442         struct perf_evsel *evsel;
2443         int err = -1;
2444
2445         trace->tool.sample        = trace__process_sample;
2446         trace->tool.mmap          = perf_event__process_mmap;
2447         trace->tool.mmap2         = perf_event__process_mmap2;
2448         trace->tool.comm          = perf_event__process_comm;
2449         trace->tool.exit          = perf_event__process_exit;
2450         trace->tool.fork          = perf_event__process_fork;
2451         trace->tool.attr          = perf_event__process_attr;
2452         trace->tool.tracing_data  = perf_event__process_tracing_data;
2453         trace->tool.build_id      = perf_event__process_build_id;
2454         trace->tool.namespaces    = perf_event__process_namespaces;
2455
2456         trace->tool.ordered_events = true;
2457         trace->tool.ordering_requires_timestamps = true;
2458
2459         /* add tid to output */
2460         trace->multiple_threads = true;
2461
2462         session = perf_session__new(&file, false, &trace->tool);
2463         if (session == NULL)
2464                 return -1;
2465
2466         if (trace->opts.target.pid)
2467                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2468
2469         if (trace->opts.target.tid)
2470                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2471
2472         if (symbol__init(&session->header.env) < 0)
2473                 goto out;
2474
2475         trace->host = &session->machines.host;
2476
2477         err = perf_session__set_tracepoints_handlers(session, handlers);
2478         if (err)
2479                 goto out;
2480
2481         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2482                                                      "raw_syscalls:sys_enter");
2483         /* older kernels have syscalls tp versus raw_syscalls */
2484         if (evsel == NULL)
2485                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2486                                                              "syscalls:sys_enter");
2487
2488         if (evsel &&
2489             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2490             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2491                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2492                 goto out;
2493         }
2494
2495         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2496                                                      "raw_syscalls:sys_exit");
2497         if (evsel == NULL)
2498                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2499                                                              "syscalls:sys_exit");
2500         if (evsel &&
2501             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2502             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2503                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2504                 goto out;
2505         }
2506
2507         evlist__for_each_entry(session->evlist, evsel) {
2508                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2509                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2510                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2511                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2512                         evsel->handler = trace__pgfault;
2513         }
2514
2515         setup_pager();
2516
2517         err = perf_session__process_events(session);
2518         if (err)
2519                 pr_err("Failed to process events, error %d", err);
2520
2521         else if (trace->summary)
2522                 trace__fprintf_thread_summary(trace, trace->output);
2523
2524 out:
2525         perf_session__delete(session);
2526
2527         return err;
2528 }
2529
/* Print the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2538
/*
 * Re-sort the per-thread intlist of syscall stats (keyed by syscall number)
 * into an rb tree ordered by total time spent (msecs).  DEFINE_RESORT_RB
 * (util/rb_resort.h) declares the sort predicate and per-entry fields; the
 * body below is the per-node initializer copying from each source int_node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;	/* may be NULL if never filled in */

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average duration, converted ns -> ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2552
/*
 * Print one thread's per-syscall statistics table (calls, total, min, avg,
 * max, stddev), sorted by total time descending via the syscall_stats
 * resort-rb above.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* Builds the sorted tree; 'syscall_stats' is the macro-declared root. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in ns; table is in ms */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the mean; guard avg == 0 */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2595
2596 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2597 {
2598         size_t printed = 0;
2599         struct thread_trace *ttrace = thread__priv(thread);
2600         double ratio;
2601
2602         if (ttrace == NULL)
2603                 return 0;
2604
2605         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2606
2607         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2608         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2609         printed += fprintf(fp, "%.1f%%", ratio);
2610         if (ttrace->pfmaj)
2611                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2612         if (ttrace->pfmin)
2613                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2614         if (trace->sched)
2615                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2616         else if (fputc('\n', fp) != EOF)
2617                 ++printed;
2618
2619         printed += thread__dump_stats(ttrace, trace, fp);
2620
2621         return printed;
2622 }
2623
2624 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2625 {
2626         return ttrace ? ttrace->nr_events : 0;
2627 }
2628
/*
 * Re-sort the machine's threads into an rb tree keyed by per-thread event
 * count (via thread__nr_events), so the summary is ordered by activity.
 * The body is the per-node initializer (see util/rb_resort.h).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2635
/*
 * Print the end-of-run summary: a header followed by one entry per thread,
 * ordered by event count via the 'threads' resort-rb above.  Returns the
 * number of characters printed (0 if the sort tree could not be built).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	/* Builds the sorted tree; 'threads' is the macro-declared root. */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2654
2655 static int trace__set_duration(const struct option *opt, const char *str,
2656                                int unset __maybe_unused)
2657 {
2658         struct trace *trace = opt->value;
2659
2660         trace->duration_filter = atof(str);
2661         return 0;
2662 }
2663
2664 static int trace__set_filter_pids(const struct option *opt, const char *str,
2665                                   int unset __maybe_unused)
2666 {
2667         int ret = -1;
2668         size_t i;
2669         struct trace *trace = opt->value;
2670         /*
2671          * FIXME: introduce a intarray class, plain parse csv and create a
2672          * { int nr, int entries[] } struct...
2673          */
2674         struct intlist *list = intlist__new(str);
2675
2676         if (list == NULL)
2677                 return -1;
2678
2679         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2680         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2681
2682         if (trace->filter_pids.entries == NULL)
2683                 goto out;
2684
2685         trace->filter_pids.entries[0] = getpid();
2686
2687         for (i = 1; i < trace->filter_pids.nr; ++i)
2688                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2689
2690         intlist__delete(list);
2691         ret = 0;
2692 out:
2693         return ret;
2694 }
2695
2696 static int trace__open_output(struct trace *trace, const char *filename)
2697 {
2698         struct stat st;
2699
2700         if (!stat(filename, &st) && st.st_size) {
2701                 char oldname[PATH_MAX];
2702
2703                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2704                 unlink(oldname);
2705                 rename(filename, oldname);
2706         }
2707
2708         trace->output = fopen(filename, "w");
2709
2710         return trace->output == NULL ? -errno : 0;
2711 }
2712
2713 static int parse_pagefaults(const struct option *opt, const char *str,
2714                             int unset __maybe_unused)
2715 {
2716         int *trace_pgfaults = opt->value;
2717
2718         if (strcmp(str, "all") == 0)
2719                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2720         else if (strcmp(str, "maj") == 0)
2721                 *trace_pgfaults |= TRACE_PFMAJ;
2722         else if (strcmp(str, "min") == 0)
2723                 *trace_pgfaults |= TRACE_PFMIN;
2724         else
2725                 return -1;
2726
2727         return 0;
2728 }
2729
/* Set the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2737
2738 /*
2739  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2740  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2741  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2742  *
2743  * It'd be better to introduce a parse_options() variant that would return a
2744  * list with the terms it didn't match to an event...
2745  */
2746 static int trace__parse_events_option(const struct option *opt, const char *str,
2747                                       int unset __maybe_unused)
2748 {
2749         struct trace *trace = (struct trace *)opt->value;
2750         const char *s = str;
2751         char *sep = NULL, *lists[2] = { NULL, NULL, };
2752         int len = strlen(str), err = -1, list;
2753         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2754         char group_name[PATH_MAX];
2755
2756         if (strace_groups_dir == NULL)
2757                 return -1;
2758
2759         if (*s == '!') {
2760                 ++s;
2761                 trace->not_ev_qualifier = true;
2762         }
2763
2764         while (1) {
2765                 if ((sep = strchr(s, ',')) != NULL)
2766                         *sep = '\0';
2767
2768                 list = 0;
2769                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2770                         list = 1;
2771                 } else {
2772                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2773                         if (access(group_name, R_OK) == 0)
2774                                 list = 1;
2775                 }
2776
2777                 if (lists[list]) {
2778                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2779                 } else {
2780                         lists[list] = malloc(len);
2781                         if (lists[list] == NULL)
2782                                 goto out;
2783                         strcpy(lists[list], s);
2784                 }
2785
2786                 if (!sep)
2787                         break;
2788
2789                 *sep = ',';
2790                 s = sep + 1;
2791         }
2792
2793         if (lists[1] != NULL) {
2794                 struct strlist_config slist_config = {
2795                         .dirname = strace_groups_dir,
2796                 };
2797
2798                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2799                 if (trace->ev_qualifier == NULL) {
2800                         fputs("Not enough memory to parse event qualifier", trace->output);
2801                         goto out;
2802                 }
2803
2804                 if (trace__validate_ev_qualifier(trace))
2805                         goto out;
2806         }
2807
2808         err = 0;
2809
2810         if (lists[0]) {
2811                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2812                                                "event selector. use 'perf list' to list available events",
2813                                                parse_events_option);
2814                 err = parse_events_option(&o, lists[0], 0);
2815         }
2816 out:
2817         if (sep)
2818                 *sep = ',';
2819
2820         return err;
2821 }
2822
/*
 * Entry point for 'perf trace': parse options, set up the evlist/syscall
 * table, then dispatch to record, replay (-i) or live (trace__run) mode.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault tracing needs address and timestamp in each sample. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels mean the user did not set these on the cmdline. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Bigger mmap buffers help avoid losing the larger callchain samples. */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target given: default to system-wide tracing. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}