/*
 * Source: tools/perf/builtin-trace.c
 * (karo-tx-linux.git, blob aba5fac4152923faf50e6be2e1723ad98964b3e9)
 */
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
41
42 #include <inttypes.h>
43 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
44 #include <stdlib.h>
45 #include <string.h>
46 #include <linux/err.h>
47 #include <linux/filter.h>
48 #include <linux/audit.h>
49 #include <linux/kernel.h>
50 #include <linux/random.h>
51 #include <linux/stringify.h>
52 #include <linux/time64.h>
53
54 #ifndef O_CLOEXEC
55 # define O_CLOEXEC              02000000
56 #endif
57
58 struct trace {
59         struct perf_tool        tool;
60         struct syscalltbl       *sctbl;
61         struct {
62                 int             max;
63                 struct syscall  *table;
64                 struct {
65                         struct perf_evsel *sys_enter,
66                                           *sys_exit;
67                 }               events;
68         } syscalls;
69         struct record_opts      opts;
70         struct perf_evlist      *evlist;
71         struct machine          *host;
72         struct thread           *current;
73         u64                     base_time;
74         FILE                    *output;
75         unsigned long           nr_events;
76         struct strlist          *ev_qualifier;
77         struct {
78                 size_t          nr;
79                 int             *entries;
80         }                       ev_qualifier_ids;
81         struct {
82                 size_t          nr;
83                 pid_t           *entries;
84         }                       filter_pids;
85         double                  duration_filter;
86         double                  runtime_ms;
87         struct {
88                 u64             vfs_getname,
89                                 proc_getname;
90         } stats;
91         unsigned int            max_stack;
92         unsigned int            min_stack;
93         bool                    not_ev_qualifier;
94         bool                    live;
95         bool                    full_time;
96         bool                    sched;
97         bool                    multiple_threads;
98         bool                    summary;
99         bool                    summary_only;
100         bool                    show_comm;
101         bool                    show_tool_stats;
102         bool                    trace_syscalls;
103         bool                    kernel_syscallchains;
104         bool                    force;
105         bool                    vfs_getname;
106         int                     trace_pgfaults;
107         int                     open_id;
108 };
109
110 struct tp_field {
111         int offset;
112         union {
113                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
114                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
115         };
116 };
117
118 #define TP_UINT_FIELD(bits) \
119 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 { \
121         u##bits value; \
122         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
123         return value;  \
124 }
125
126 TP_UINT_FIELD(8);
127 TP_UINT_FIELD(16);
128 TP_UINT_FIELD(32);
129 TP_UINT_FIELD(64);
130
131 #define TP_UINT_FIELD__SWAPPED(bits) \
132 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 { \
134         u##bits value; \
135         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
136         return bswap_##bits(value);\
137 }
138
139 TP_UINT_FIELD__SWAPPED(16);
140 TP_UINT_FIELD__SWAPPED(32);
141 TP_UINT_FIELD__SWAPPED(64);
142
143 static int tp_field__init_uint(struct tp_field *field,
144                                struct format_field *format_field,
145                                bool needs_swap)
146 {
147         field->offset = format_field->offset;
148
149         switch (format_field->size) {
150         case 1:
151                 field->integer = tp_field__u8;
152                 break;
153         case 2:
154                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155                 break;
156         case 4:
157                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158                 break;
159         case 8:
160                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
161                 break;
162         default:
163                 return -1;
164         }
165
166         return 0;
167 }
168
169 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
170 {
171         return sample->raw_data + field->offset;
172 }
173
174 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
175 {
176         field->offset = format_field->offset;
177         field->pointer = tp_field__ptr;
178         return 0;
179 }
180
181 struct syscall_tp {
182         struct tp_field id;
183         union {
184                 struct tp_field args, ret;
185         };
186 };
187
188 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
189                                           struct tp_field *field,
190                                           const char *name)
191 {
192         struct format_field *format_field = perf_evsel__field(evsel, name);
193
194         if (format_field == NULL)
195                 return -1;
196
197         return tp_field__init_uint(field, format_field, evsel->needs_swap);
198 }
199
200 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
201         ({ struct syscall_tp *sc = evsel->priv;\
202            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
203
204 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
205                                          struct tp_field *field,
206                                          const char *name)
207 {
208         struct format_field *format_field = perf_evsel__field(evsel, name);
209
210         if (format_field == NULL)
211                 return -1;
212
213         return tp_field__init_ptr(field, format_field);
214 }
215
216 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
217         ({ struct syscall_tp *sc = evsel->priv;\
218            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
219
220 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
221 {
222         zfree(&evsel->priv);
223         perf_evsel__delete(evsel);
224 }
225
226 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
227 {
228         evsel->priv = malloc(sizeof(struct syscall_tp));
229         if (evsel->priv != NULL) {
230                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231                         goto out_delete;
232
233                 evsel->handler = handler;
234                 return 0;
235         }
236
237         return -ENOMEM;
238
239 out_delete:
240         zfree(&evsel->priv);
241         return -ENOENT;
242 }
243
244 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
245 {
246         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
247
248         /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
249         if (IS_ERR(evsel))
250                 evsel = perf_evsel__newtp("syscalls", direction);
251
252         if (IS_ERR(evsel))
253                 return NULL;
254
255         if (perf_evsel__init_syscall_tp(evsel, handler))
256                 goto out_delete;
257
258         return evsel;
259
260 out_delete:
261         perf_evsel__delete_priv(evsel);
262         return NULL;
263 }
264
265 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
266         ({ struct syscall_tp *fields = evsel->priv; \
267            fields->name.integer(&fields->name, sample); })
268
269 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
270         ({ struct syscall_tp *fields = evsel->priv; \
271            fields->name.pointer(&fields->name, sample); })
272
273 struct strarray {
274         int         offset;
275         int         nr_entries;
276         const char **entries;
277 };
278
279 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
280         .nr_entries = ARRAY_SIZE(array), \
281         .entries = array, \
282 }
283
284 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
285         .offset     = off, \
286         .nr_entries = ARRAY_SIZE(array), \
287         .entries = array, \
288 }
289
290 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
291                                                 const char *intfmt,
292                                                 struct syscall_arg *arg)
293 {
294         struct strarray *sa = arg->parm;
295         int idx = arg->val - sa->offset;
296
297         if (idx < 0 || idx >= sa->nr_entries)
298                 return scnprintf(bf, size, intfmt, arg->val);
299
300         return scnprintf(bf, size, "%s", sa->entries[idx]);
301 }
302
303 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
304                                               struct syscall_arg *arg)
305 {
306         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
307 }
308
309 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
310
311 #if defined(__i386__) || defined(__x86_64__)
312 /*
313  * FIXME: Make this available to all arches as soon as the ioctl beautifier
314  *        gets rewritten to support all arches.
315  */
316 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
317                                                  struct syscall_arg *arg)
318 {
319         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
320 }
321
322 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
323 #endif /* defined(__i386__) || defined(__x86_64__) */
324
325 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
326                                         struct syscall_arg *arg);
327
328 #define SCA_FD syscall_arg__scnprintf_fd
329
330 #ifndef AT_FDCWD
331 #define AT_FDCWD        -100
332 #endif
333
334 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
335                                            struct syscall_arg *arg)
336 {
337         int fd = arg->val;
338
339         if (fd == AT_FDCWD)
340                 return scnprintf(bf, size, "CWD");
341
342         return syscall_arg__scnprintf_fd(bf, size, arg);
343 }
344
345 #define SCA_FDAT syscall_arg__scnprintf_fd_at
346
347 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
348                                               struct syscall_arg *arg);
349
350 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
351
352 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
353                                          struct syscall_arg *arg)
354 {
355         return scnprintf(bf, size, "%#lx", arg->val);
356 }
357
358 #define SCA_HEX syscall_arg__scnprintf_hex
359
360 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
361                                          struct syscall_arg *arg)
362 {
363         return scnprintf(bf, size, "%d", arg->val);
364 }
365
366 #define SCA_INT syscall_arg__scnprintf_int
367
368 static const char *bpf_cmd[] = {
369         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
370         "MAP_GET_NEXT_KEY", "PROG_LOAD",
371 };
372 static DEFINE_STRARRAY(bpf_cmd);
373
374 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
375 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
376
377 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
378 static DEFINE_STRARRAY(itimers);
379
380 static const char *keyctl_options[] = {
381         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
382         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
383         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
384         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
385         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
386 };
387 static DEFINE_STRARRAY(keyctl_options);
388
389 static const char *whences[] = { "SET", "CUR", "END",
390 #ifdef SEEK_DATA
391 "DATA",
392 #endif
393 #ifdef SEEK_HOLE
394 "HOLE",
395 #endif
396 };
397 static DEFINE_STRARRAY(whences);
398
399 static const char *fcntl_cmds[] = {
400         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
401         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
402         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
403         "F_GETOWNER_UIDS",
404 };
405 static DEFINE_STRARRAY(fcntl_cmds);
406
407 static const char *rlimit_resources[] = {
408         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
409         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
410         "RTTIME",
411 };
412 static DEFINE_STRARRAY(rlimit_resources);
413
414 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
415 static DEFINE_STRARRAY(sighow);
416
417 static const char *clockid[] = {
418         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
419         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
420         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
421 };
422 static DEFINE_STRARRAY(clockid);
423
424 static const char *socket_families[] = {
425         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
426         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
427         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
428         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
429         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
430         "ALG", "NFC", "VSOCK",
431 };
432 static DEFINE_STRARRAY(socket_families);
433
434 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
435                                                  struct syscall_arg *arg)
436 {
437         size_t printed = 0;
438         int mode = arg->val;
439
440         if (mode == F_OK) /* 0 */
441                 return scnprintf(bf, size, "F");
442 #define P_MODE(n) \
443         if (mode & n##_OK) { \
444                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
445                 mode &= ~n##_OK; \
446         }
447
448         P_MODE(R);
449         P_MODE(W);
450         P_MODE(X);
451 #undef P_MODE
452
453         if (mode)
454                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
455
456         return printed;
457 }
458
459 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
460
461 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
462                                               struct syscall_arg *arg);
463
464 #define SCA_FILENAME syscall_arg__scnprintf_filename
465
466 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
467                                                 struct syscall_arg *arg)
468 {
469         int printed = 0, flags = arg->val;
470
471 #define P_FLAG(n) \
472         if (flags & O_##n) { \
473                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
474                 flags &= ~O_##n; \
475         }
476
477         P_FLAG(CLOEXEC);
478         P_FLAG(NONBLOCK);
479 #undef P_FLAG
480
481         if (flags)
482                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
483
484         return printed;
485 }
486
487 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
488
489 #if defined(__i386__) || defined(__x86_64__)
490 /*
491  * FIXME: Make this available to all arches.
492  */
493 #define TCGETS          0x5401
494
495 static const char *tioctls[] = {
496         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
497         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
498         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
499         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
500         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
501         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
502         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
503         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
504         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
505         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
506         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
507         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
508         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
509         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
510         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
511 };
512
513 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
514 #endif /* defined(__i386__) || defined(__x86_64__) */
515
516 #ifndef GRND_NONBLOCK
517 #define GRND_NONBLOCK   0x0001
518 #endif
519 #ifndef GRND_RANDOM
520 #define GRND_RANDOM     0x0002
521 #endif
522
523 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
524                                                    struct syscall_arg *arg)
525 {
526         int printed = 0, flags = arg->val;
527
528 #define P_FLAG(n) \
529         if (flags & GRND_##n) { \
530                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
531                 flags &= ~GRND_##n; \
532         }
533
534         P_FLAG(RANDOM);
535         P_FLAG(NONBLOCK);
536 #undef P_FLAG
537
538         if (flags)
539                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
540
541         return printed;
542 }
543
544 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
545
546 #define STRARRAY(arg, name, array) \
547           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
548           .arg_parm      = { [arg] = &strarray__##array, }
549
550 #include "trace/beauty/eventfd.c"
551 #include "trace/beauty/flock.c"
552 #include "trace/beauty/futex_op.c"
553 #include "trace/beauty/mmap.c"
554 #include "trace/beauty/mode_t.c"
555 #include "trace/beauty/msg_flags.c"
556 #include "trace/beauty/open_flags.c"
557 #include "trace/beauty/perf_event_open.c"
558 #include "trace/beauty/pid.c"
559 #include "trace/beauty/sched_policy.c"
560 #include "trace/beauty/seccomp.c"
561 #include "trace/beauty/signum.c"
562 #include "trace/beauty/socket_type.c"
563 #include "trace/beauty/waitid_options.c"
564
565 static struct syscall_fmt {
566         const char *name;
567         const char *alias;
568         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
569         void       *arg_parm[6];
570         bool       errmsg;
571         bool       errpid;
572         bool       timeout;
573         bool       hexret;
574 } syscall_fmts[] = {
575         { .name     = "access",     .errmsg = true,
576           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
577         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
578         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
579         { .name     = "brk",        .hexret = true,
580           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
581         { .name     = "chdir",      .errmsg = true, },
582         { .name     = "chmod",      .errmsg = true, },
583         { .name     = "chroot",     .errmsg = true, },
584         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
585         { .name     = "clone",      .errpid = true, },
586         { .name     = "close",      .errmsg = true,
587           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
588         { .name     = "connect",    .errmsg = true, },
589         { .name     = "creat",      .errmsg = true, },
590         { .name     = "dup",        .errmsg = true, },
591         { .name     = "dup2",       .errmsg = true, },
592         { .name     = "dup3",       .errmsg = true, },
593         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
594         { .name     = "eventfd2",   .errmsg = true,
595           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
596         { .name     = "faccessat",  .errmsg = true, },
597         { .name     = "fadvise64",  .errmsg = true, },
598         { .name     = "fallocate",  .errmsg = true, },
599         { .name     = "fchdir",     .errmsg = true, },
600         { .name     = "fchmod",     .errmsg = true, },
601         { .name     = "fchmodat",   .errmsg = true,
602           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
603         { .name     = "fchown",     .errmsg = true, },
604         { .name     = "fchownat",   .errmsg = true,
605           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
606         { .name     = "fcntl",      .errmsg = true,
607           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
608           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
609         { .name     = "fdatasync",  .errmsg = true, },
610         { .name     = "flock",      .errmsg = true,
611           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
612         { .name     = "fsetxattr",  .errmsg = true, },
613         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
614         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
615         { .name     = "fstatfs",    .errmsg = true, },
616         { .name     = "fsync",    .errmsg = true, },
617         { .name     = "ftruncate", .errmsg = true, },
618         { .name     = "futex",      .errmsg = true,
619           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
620         { .name     = "futimesat", .errmsg = true,
621           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
622         { .name     = "getdents",   .errmsg = true, },
623         { .name     = "getdents64", .errmsg = true, },
624         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
625         { .name     = "getpid",     .errpid = true, },
626         { .name     = "getpgid",    .errpid = true, },
627         { .name     = "getppid",    .errpid = true, },
628         { .name     = "getrandom",  .errmsg = true,
629           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
630         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
631         { .name     = "getxattr",   .errmsg = true, },
632         { .name     = "inotify_add_watch",          .errmsg = true, },
633         { .name     = "ioctl",      .errmsg = true,
634           .arg_scnprintf = {
635 #if defined(__i386__) || defined(__x86_64__)
636 /*
637  * FIXME: Make this available to all arches.
638  */
639                              [1] = SCA_STRHEXARRAY, /* cmd */
640                              [2] = SCA_HEX, /* arg */ },
641           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
642 #else
643                              [2] = SCA_HEX, /* arg */ }, },
644 #endif
645         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
646         { .name     = "kill",       .errmsg = true,
647           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
648         { .name     = "lchown",    .errmsg = true, },
649         { .name     = "lgetxattr",  .errmsg = true, },
650         { .name     = "linkat",     .errmsg = true,
651           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
652         { .name     = "listxattr",  .errmsg = true, },
653         { .name     = "llistxattr", .errmsg = true, },
654         { .name     = "lremovexattr",  .errmsg = true, },
655         { .name     = "lseek",      .errmsg = true,
656           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
657           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
658         { .name     = "lsetxattr",  .errmsg = true, },
659         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
660         { .name     = "lsxattr",    .errmsg = true, },
661         { .name     = "madvise",    .errmsg = true,
662           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
663                              [2] = SCA_MADV_BHV, /* behavior */ }, },
664         { .name     = "mkdir",    .errmsg = true, },
665         { .name     = "mkdirat",    .errmsg = true,
666           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
667         { .name     = "mknod",      .errmsg = true, },
668         { .name     = "mknodat",    .errmsg = true,
669           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
670         { .name     = "mlock",      .errmsg = true,
671           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
672         { .name     = "mlockall",   .errmsg = true,
673           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
674         { .name     = "mmap",       .hexret = true,
675           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
676                              [2] = SCA_MMAP_PROT, /* prot */
677                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
678         { .name     = "mprotect",   .errmsg = true,
679           .arg_scnprintf = { [0] = SCA_HEX, /* start */
680                              [2] = SCA_MMAP_PROT, /* prot */ }, },
681         { .name     = "mq_unlink", .errmsg = true,
682           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
683         { .name     = "mremap",     .hexret = true,
684           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
685                              [3] = SCA_MREMAP_FLAGS, /* flags */
686                              [4] = SCA_HEX, /* new_addr */ }, },
687         { .name     = "munlock",    .errmsg = true,
688           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
689         { .name     = "munmap",     .errmsg = true,
690           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
691         { .name     = "name_to_handle_at", .errmsg = true,
692           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
693         { .name     = "newfstatat", .errmsg = true,
694           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
695         { .name     = "open",       .errmsg = true,
696           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
697         { .name     = "open_by_handle_at", .errmsg = true,
698           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
699                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
700         { .name     = "openat",     .errmsg = true,
701           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
702                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
703         { .name     = "perf_event_open", .errmsg = true,
704           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
705                              [3] = SCA_FD,  /* group_fd */
706                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
707         { .name     = "pipe2",      .errmsg = true,
708           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
709         { .name     = "poll",       .errmsg = true, .timeout = true, },
710         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
711         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
712         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
713         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
714         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
715         { .name     = "pwritev",    .errmsg = true, },
716         { .name     = "read",       .errmsg = true, },
717         { .name     = "readlink",   .errmsg = true, },
718         { .name     = "readlinkat", .errmsg = true,
719           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
720         { .name     = "readv",      .errmsg = true, },
721         { .name     = "recvfrom",   .errmsg = true,
722           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
723         { .name     = "recvmmsg",   .errmsg = true,
724           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
725         { .name     = "recvmsg",    .errmsg = true,
726           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
727         { .name     = "removexattr", .errmsg = true, },
728         { .name     = "renameat",   .errmsg = true,
729           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
730         { .name     = "rmdir",    .errmsg = true, },
731         { .name     = "rt_sigaction", .errmsg = true,
732           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
733         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
734         { .name     = "rt_sigqueueinfo", .errmsg = true,
735           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
736         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
737           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
738         { .name     = "sched_getattr",        .errmsg = true, },
739         { .name     = "sched_setattr",        .errmsg = true, },
740         { .name     = "sched_setscheduler",   .errmsg = true,
741           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
742         { .name     = "seccomp", .errmsg = true,
743           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
744                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
745         { .name     = "select",     .errmsg = true, .timeout = true, },
746         { .name     = "sendmmsg",    .errmsg = true,
747           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
748         { .name     = "sendmsg",    .errmsg = true,
749           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
750         { .name     = "sendto",     .errmsg = true,
751           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
752         { .name     = "set_tid_address", .errpid = true, },
753         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
754         { .name     = "setpgid",    .errmsg = true, },
755         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
756         { .name     = "setxattr",   .errmsg = true, },
757         { .name     = "shutdown",   .errmsg = true, },
758         { .name     = "socket",     .errmsg = true,
759           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
760                              [1] = SCA_SK_TYPE, /* type */ },
761           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
762         { .name     = "socketpair", .errmsg = true,
763           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
764                              [1] = SCA_SK_TYPE, /* type */ },
765           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
766         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
767         { .name     = "statfs",     .errmsg = true, },
768         { .name     = "statx",      .errmsg = true,
769           .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
770                              [2] = SCA_STATX_FLAGS, /* flags */
771                              [3] = SCA_STATX_MASK, /* mask */ }, },
772         { .name     = "swapoff",    .errmsg = true,
773           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
774         { .name     = "swapon",     .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
776         { .name     = "symlinkat",  .errmsg = true,
777           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
778         { .name     = "tgkill",     .errmsg = true,
779           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
780         { .name     = "tkill",      .errmsg = true,
781           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
782         { .name     = "truncate",   .errmsg = true, },
783         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
784         { .name     = "unlinkat",   .errmsg = true,
785           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
786         { .name     = "utime",  .errmsg = true, },
787         { .name     = "utimensat",  .errmsg = true,
788           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
789         { .name     = "utimes",  .errmsg = true, },
790         { .name     = "vmsplice",  .errmsg = true, },
791         { .name     = "wait4",      .errpid = true,
792           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
793         { .name     = "waitid",     .errpid = true,
794           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
795         { .name     = "write",      .errmsg = true, },
796         { .name     = "writev",     .errmsg = true, },
797 };
798
799 static int syscall_fmt__cmp(const void *name, const void *fmtp)
800 {
801         const struct syscall_fmt *fmt = fmtp;
802         return strcmp(name, fmt->name);
803 }
804
805 static struct syscall_fmt *syscall_fmt__find(const char *name)
806 {
807         const int nmemb = ARRAY_SIZE(syscall_fmts);
808         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
809 }
810
/*
 * Per-syscall descriptor, filled lazily by trace__read_syscall_info() the
 * first time a given syscall id is seen.
 */
struct syscall {
        struct event_format *tp_format;  /* sys_enter_<name> tracepoint format (or its alias) */
        int                 nr_args;     /* nr of tracepoint fields, minus the syscall-nr field */
        struct format_field *args;       /* first real argument field (syscall-nr field skipped) */
        const char          *name;       /* from syscalltbl__name() */
        bool                is_exit;     /* true for exit/exit_group: no sys_exit will follow */
        struct syscall_fmt  *fmt;        /* matching syscall_fmts[] entry, may be NULL */
        /* per-argument pretty printers, resolved in syscall__set_arg_fmts() */
        size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        void                **arg_parm;  /* per-argument parm (e.g. strarrays), from fmt->arg_parm */
};
821
822 /*
823  * We need to have this 'calculated' boolean because in some cases we really
824  * don't know what is the duration of a syscall, for instance, when we start
825  * a session and some threads are waiting for a syscall to finish, say 'poll',
826  * in which case all we can do is to print "( ? ) for duration and for the
827  * start timestamp.
828  */
829 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
830 {
831         double duration = (double)t / NSEC_PER_MSEC;
832         size_t printed = fprintf(fp, "(");
833
834         if (!calculated)
835                 printed += fprintf(fp, "     ?   ");
836         else if (duration >= 1.0)
837                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
838         else if (duration >= 0.01)
839                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
840         else
841                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
842         return printed + fprintf(fp, "): ");
843 }
844
845 /**
846  * filename.ptr: The filename char pointer that will be vfs_getname'd
847  * filename.entry_str_pos: Where to insert the string translated from
848  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
849  */
850 struct thread_trace {
851         u64               entry_time;
852         bool              entry_pending;
853         unsigned long     nr_events;
854         unsigned long     pfmaj, pfmin;
855         char              *entry_str;
856         double            runtime_ms;
857         struct {
858                 unsigned long ptr;
859                 short int     entry_str_pos;
860                 bool          pending_open;
861                 unsigned int  namelen;
862                 char          *name;
863         } filename;
864         struct {
865                 int       max;
866                 char      **table;
867         } paths;
868
869         struct intlist *syscall_stats;
870 };
871
872 static struct thread_trace *thread_trace__new(void)
873 {
874         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
875
876         if (ttrace)
877                 ttrace->paths.max = -1;
878
879         ttrace->syscall_stats = intlist__new(NULL);
880
881         return ttrace;
882 }
883
884 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
885 {
886         struct thread_trace *ttrace;
887
888         if (thread == NULL)
889                 goto fail;
890
891         if (thread__priv(thread) == NULL)
892                 thread__set_priv(thread, thread_trace__new());
893
894         if (thread__priv(thread) == NULL)
895                 goto fail;
896
897         ttrace = thread__priv(thread);
898         ++ttrace->nr_events;
899
900         return ttrace;
901 fail:
902         color_fprintf(fp, PERF_COLOR_RED,
903                       "WARNING: not enough memory, dropping samples!\n");
904         return NULL;
905 }
906
/* Page-fault selection bits — presumably or'ed into a trace option mask; usage is outside this chunk. */
#define TRACE_PFMAJ             (1 << 0)
#define TRACE_PFMIN             (1 << 1)

/* Size of the per-thread buffer (thread_trace::entry_str) used to assemble syscall entry lines. */
static const size_t trace__entry_str_size = 2048;
911
/*
 * Cache 'pathname' for file descriptor 'fd' in this thread's fd -> path
 * table, growing the table to cover 'fd' if needed.
 *
 * Returns 0 on success, -1 on allocation failure (on realloc failure the
 * old table is left intact; on strdup failure the slot stays NULL).
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
        struct thread_trace *ttrace = thread__priv(thread);

        if (fd > ttrace->paths.max) {
                char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

                if (npath == NULL)
                        return -1;

                if (ttrace->paths.max != -1) {
                        /* grew an existing table: zero only the new slots */
                        memset(npath + ttrace->paths.max + 1, 0,
                               (fd - ttrace->paths.max) * sizeof(char *));
                } else {
                        /* first allocation: zero everything */
                        memset(npath, 0, (fd + 1) * sizeof(char *));
                }

                ttrace->paths.table = npath;
                ttrace->paths.max   = fd;
        }

        ttrace->paths.table[fd] = strdup(pathname);

        return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
937
/*
 * Resolve 'fd' to a pathname by reading the /proc/<pid>/fd/<fd> symlink
 * (or the per-task variant for non-thread-group-leaders) and cache the
 * result via trace__set_fd_pathname().
 *
 * Returns 0 on success, -1 on any failure.  The lstat() size check plus
 * the 'ret > st.st_size' check keep the NUL write below in bounds and
 * also reject links that grew between lstat() and readlink() (inherently
 * racy against the traced process).
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
        char linkname[PATH_MAX], pathname[PATH_MAX];
        struct stat st;
        int ret;

        if (thread->pid_ == thread->tid) {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/fd/%d", thread->pid_, fd);
        } else {
                scnprintf(linkname, sizeof(linkname),
                          "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
        }

        if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
                return -1;

        ret = readlink(linkname, pathname, sizeof(pathname));

        if (ret < 0 || ret > st.st_size)
                return -1;

        /* readlink() does not NUL-terminate */
        pathname[ret] = '\0';
        return trace__set_fd_pathname(thread, fd, pathname);
}
963
964 static const char *thread__fd_path(struct thread *thread, int fd,
965                                    struct trace *trace)
966 {
967         struct thread_trace *ttrace = thread__priv(thread);
968
969         if (ttrace == NULL)
970                 return NULL;
971
972         if (fd < 0)
973                 return NULL;
974
975         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
976                 if (!trace->live)
977                         return NULL;
978                 ++trace->stats.proc_getname;
979                 if (thread__read_fd_path(thread, fd))
980                         return NULL;
981         }
982
983         return ttrace->paths.table[fd];
984 }
985
986 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
987                                         struct syscall_arg *arg)
988 {
989         int fd = arg->val;
990         size_t printed = scnprintf(bf, size, "%d", fd);
991         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
992
993         if (path)
994                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
995
996         return printed;
997 }
998
999 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1000                                               struct syscall_arg *arg)
1001 {
1002         int fd = arg->val;
1003         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1004         struct thread_trace *ttrace = thread__priv(arg->thread);
1005
1006         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1007                 zfree(&ttrace->paths.table[fd]);
1008
1009         return printed;
1010 }
1011
/*
 * Remember where in the thread's entry_str the filename for 'ptr' should
 * later be spliced in once the vfs_getname tracepoint/kprobe resolves it.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
                                     unsigned long ptr)
{
        struct thread_trace *ttrace = thread__priv(thread);

        ttrace->filename.ptr = ptr;
        /* bf points inside entry_str, so this is the insertion offset */
        ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1020
1021 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1022                                               struct syscall_arg *arg)
1023 {
1024         unsigned long ptr = arg->val;
1025
1026         if (!arg->trace->vfs_getname)
1027                 return scnprintf(bf, size, "%#x", ptr);
1028
1029         thread__set_filename_pos(arg->thread, bf, ptr);
1030         return 0;
1031 }
1032
/*
 * True when the syscall duration 't' (nanoseconds) falls below the
 * user-supplied duration filter (milliseconds), i.e. should be skipped.
 */
static bool trace__filter_duration(struct trace *trace, double t)
{
        return t < (trace->duration_filter * NSEC_PER_MSEC);
}
1037
/* Print 'tstamp' in milliseconds, relative to the session's base time. */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
        double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

        return fprintf(fp, "%10.3f ", ts);
}
1044
1045 /*
1046  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1047  * using ttrace->entry_time for a thread that receives a sys_exit without
1048  * first having received a sys_enter ("poll" issued before tracing session
1049  * starts, lost sys_enter exit due to ring buffer overflow).
1050  */
1051 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1052 {
1053         if (tstamp > 0)
1054                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1055
1056         return fprintf(fp, "         ? ");
1057 }
1058
/*
 * Session-termination flags set from the signal handler.
 *
 * 'volatile sig_atomic_t' rather than plain bool: it is the only object
 * type the C standard guarantees may be written from an asynchronous
 * signal handler (CERT SIG31-C), and volatile keeps the main loop from
 * caching the flag in a register across iterations.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
        done = 1;
        /* SIGINT (^C) is remembered so the summary can say "interrupted" */
        interrupted = sig == SIGINT;
}
1067
1068 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1069                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1070 {
1071         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1072         printed += fprintf_duration(duration, duration_calculated, fp);
1073
1074         if (trace->multiple_threads) {
1075                 if (trace->show_comm)
1076                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1077                 printed += fprintf(fp, "%d ", thread->tid);
1078         }
1079
1080         return printed;
1081 }
1082
1083 static int trace__process_event(struct trace *trace, struct machine *machine,
1084                                 union perf_event *event, struct perf_sample *sample)
1085 {
1086         int ret = 0;
1087
1088         switch (event->header.type) {
1089         case PERF_RECORD_LOST:
1090                 color_fprintf(trace->output, PERF_COLOR_RED,
1091                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1092                 ret = machine__process_lost_event(machine, event, sample);
1093                 break;
1094         default:
1095                 ret = machine__process_event(machine, event, sample);
1096                 break;
1097         }
1098
1099         return ret;
1100 }
1101
/*
 * perf_tool callback: recover the enclosing struct trace from the embedded
 * tool member and forward to trace__process_event().
 */
static int trace__tool_process(struct perf_tool *tool,
                               union perf_event *event,
                               struct perf_sample *sample,
                               struct machine *machine)
{
        struct trace *trace = container_of(tool, struct trace, tool);
        return trace__process_event(trace, machine, event, sample);
}
1110
/*
 * Kernel address resolver wrapper that warns (once per machine) and bails
 * out when kptr_restrict hides kernel addresses, instead of resolving to
 * garbage.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
        struct machine *machine = vmachine;

        if (machine->kptr_restrict_warned)
                return NULL;

        if (symbol_conf.kptr_restrict) {
                pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
                           "Check /proc/sys/kernel/kptr_restrict.\n\n"
                           "Kernel samples will not be resolved.\n");
                /* warn only on the first sample */
                machine->kptr_restrict_warned = true;
                return NULL;
        }

        return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1128
/*
 * Set up symbol resolution for the live host machine and synthesize
 * PERF_RECORD_{COMM,MMAP,...} events for the already-running threads we
 * are about to trace.  Returns 0 or a negative error.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
        int err = symbol__init(NULL);

        if (err)
                return err;

        trace->host = machine__new_host();
        if (trace->host == NULL)
                return -ENOMEM;

        /* NOTE(review): assumes the resolver register failure sets errno — confirm */
        if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
                return -errno;

        err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
                                            evlist->threads, trace__tool_process, false,
                                            trace->opts.proc_map_timeout);
        if (err)
                symbol__exit();

        return err;
}
1151
/*
 * Pick a pretty printer for each of the syscall's arguments.  Order of the
 * heuristics matters: an explicit syscall_fmts[] entry wins, then filename
 * strings, then generic pointers, then pid_t/umode_t, then the "name ends
 * in fd" convention for integer types.  Returns 0, or -1 on allocation
 * failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
        struct format_field *field;
        int idx = 0, len;

        sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
        if (sc->arg_scnprintf == NULL)
                return -1;

        if (sc->fmt)
                sc->arg_parm = sc->fmt->arg_parm;

        for (field = sc->args; field; field = field->next) {
                /* hand-written formatter from syscall_fmts[] takes precedence */
                if (sc->fmt && sc->fmt->arg_scnprintf[idx])
                        sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
                else if (strcmp(field->type, "const char *") == 0 &&
                         (strcmp(field->name, "filename") == 0 ||
                          strcmp(field->name, "path") == 0 ||
                          strcmp(field->name, "pathname") == 0))
                        sc->arg_scnprintf[idx] = SCA_FILENAME;
                else if (field->flags & FIELD_IS_POINTER)
                        sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
                else if (strcmp(field->type, "pid_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_PID;
                else if (strcmp(field->type, "umode_t") == 0)
                        sc->arg_scnprintf[idx] = SCA_MODE_T;
                else if ((strcmp(field->type, "int") == 0 ||
                          strcmp(field->type, "unsigned int") == 0 ||
                          strcmp(field->type, "long") == 0) &&
                         (len = strlen(field->name)) >= 2 &&
                         strcmp(field->name + len - 2, "fd") == 0) {
                        /*
                         * /sys/kernel/tracing/events/syscalls/sys_enter*
                         * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
                         * 65 int
                         * 23 unsigned int
                         * 7 unsigned long
                         */
                        sc->arg_scnprintf[idx] = SCA_FD;
                }
                ++idx;
        }

        return 0;
}
1197
/*
 * Lazily fill trace->syscalls.table[id]: grow the table if needed, look up
 * the name and formatting info, and load the sys_enter_<name> tracepoint
 * format (falling back to the fmt alias, e.g. stat -> newstat).
 *
 * Returns 0 on success, -1 on unknown id, allocation failure or missing
 * tracepoint format.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
        char tp_name[128];
        struct syscall *sc;
        const char *name = syscalltbl__name(trace->sctbl, id);

        if (name == NULL)
                return -1;

        if (id > trace->syscalls.max) {
                struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

                if (nsyscalls == NULL)
                        return -1;

                if (trace->syscalls.max != -1) {
                        /* grew an existing table: zero only the new entries */
                        memset(nsyscalls + trace->syscalls.max + 1, 0,
                               (id - trace->syscalls.max) * sizeof(*sc));
                } else {
                        /* first allocation: zero everything */
                        memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
                }

                trace->syscalls.table = nsyscalls;
                trace->syscalls.max   = id;
        }

        sc = trace->syscalls.table + id;
        sc->name = name;

        sc->fmt  = syscall_fmt__find(sc->name);

        snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
        sc->tp_format = trace_event__tp_format("syscalls", tp_name);

        /* the tracepoint may exist under the syscall's alias instead */
        if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
                snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
                sc->tp_format = trace_event__tp_format("syscalls", tp_name);
        }

        if (IS_ERR(sc->tp_format))
                return -1;

        sc->args = sc->tp_format->format.fields;
        sc->nr_args = sc->tp_format->format.nr_fields;
        /*
         * Discard the leading '__syscall_nr' (or 'nr') field, which holds
         * the syscall number and is redundant here.  Older kernels don't
         * have it, hence the check.
         */
        if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
                sc->args = sc->args->next;
                --sc->nr_args;
        }

        sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

        return syscall__set_arg_fmts(sc);
}
1256
/*
 * Translate the -e/--expr syscall-name list (trace->ev_qualifier) into
 * syscall ids in trace->ev_qualifier_ids.  All invalid names are collected
 * into a single error line before failing with -EINVAL.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
        int err = 0, i;
        struct str_node *pos;

        trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
        trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
                                                 sizeof(trace->ev_qualifier_ids.entries[0]));

        if (trace->ev_qualifier_ids.entries == NULL) {
                fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
                       trace->output);
                err = -EINVAL;
                goto out;
        }

        i = 0;

        strlist__for_each_entry(pos, trace->ev_qualifier) {
                const char *sc = pos->s;
                int id = syscalltbl__id(trace->sctbl, sc);

                if (id < 0) {
                        /* first bad name opens the error message, later ones append */
                        if (err == 0) {
                                fputs("Error:\tInvalid syscall ", trace->output);
                                err = -EINVAL;
                        } else {
                                fputs(", ", trace->output);
                        }

                        fputs(sc, trace->output);
                }

                trace->ev_qualifier_ids.entries[i++] = id;
        }

        if (err < 0) {
                fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
                      "\nHint:\tand: 'man syscalls'\n", trace->output);
                zfree(&trace->ev_qualifier_ids.entries);
                trace->ev_qualifier_ids.nr = 0;
        }
out:
        return err;
}
1302
1303 /*
1304  * args is to be interpreted as a series of longs but we need to handle
1305  * 8-byte unaligned accesses. args points to raw_data within the event
1306  * and raw_data is guaranteed to be 8-byte unaligned because it is
1307  * preceded by raw_size which is a u32. So we need to copy args to a temp
1308  * variable to read it. Most notably this avoids extended load instructions
1309  * on unaligned addresses
1310  */
1311
/*
 * Format the syscall's arguments into 'bf' as "name: value, ...", using
 * the per-argument pretty printers resolved by syscall__set_arg_fmts().
 * When the tracepoint format couldn't be read, fall back to printing six
 * raw longs.  Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
                                      unsigned char *args, struct trace *trace,
                                      struct thread *thread)
{
        size_t printed = 0;
        unsigned char *p;
        unsigned long val;

        if (sc->args != NULL) {
                struct format_field *field;
                u8 bit = 1; /* formatters can set arg.mask bits to consume later args */
                struct syscall_arg arg = {
                        .idx    = 0,
                        .mask   = 0,
                        .trace  = trace,
                        .thread = thread,
                };

                for (field = sc->args; field;
                     field = field->next, ++arg.idx, bit <<= 1) {
                        if (arg.mask & bit)
                                continue;

                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * arg.idx;
                        memcpy(&val, p, sizeof(val));

                        /*
                         * Suppress this argument if its value is zero and
                         * we don't have a string associated in an
                         * strarray for it.
                         */
                        if (val == 0 &&
                            !(sc->arg_scnprintf &&
                              sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
                              sc->arg_parm[arg.idx]))
                                continue;

                        printed += scnprintf(bf + printed, size - printed,
                                             "%s%s: ", printed ? ", " : "", field->name);
                        if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
                                arg.val = val;
                                if (sc->arg_parm)
                                        arg.parm = sc->arg_parm[arg.idx];
                                printed += sc->arg_scnprintf[arg.idx](bf + printed,
                                                                      size - printed, &arg);
                        } else {
                                /* no formatter: print the raw value */
                                printed += scnprintf(bf + printed, size - printed,
                                                     "%ld", val);
                        }
                }
        } else if (IS_ERR(sc->tp_format)) {
                /*
                 * If we managed to read the tracepoint /format file, then we
                 * may end up not having any args, like with gettid(), so only
                 * print the raw args when we didn't manage to read it.
                 */
                int i = 0;

                while (i < 6) {
                        /* special care for unaligned accesses */
                        p = args + sizeof(unsigned long) * i;
                        memcpy(&val, p, sizeof(val));
                        printed += scnprintf(bf + printed, size - printed,
                                             "%sarg%d: %ld",
                                             printed ? ", " : "", i, val);
                        ++i;
                }
        }

        return printed;
}
1384
/* Handler type for tracepoint samples, stored as evsel handlers elsewhere in this file. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1388
/*
 * Return the (lazily initialized) syscall descriptor for 'id', or NULL if
 * the id is invalid or its info can't be read (with diagnostics under -v).
 */
static struct syscall *trace__syscall_info(struct trace *trace,
                                           struct perf_evsel *evsel, int id)
{

        if (id < 0) {

                /*
                 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
                 * before that, leaving at a higher verbosity level till that is
                 * explained. Reproduced with plain ftrace with:
                 *
                 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
                 * grep "NR -1 " /t/trace_pipe
                 *
                 * After generating some load on the machine.
                 */
                if (verbose > 1) {
                        static u64 n;
                        fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
                                id, perf_evsel__name(evsel), ++n);
                }
                return NULL;
        }

        /* first check triggers the lazy read, second validates its result */
        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
            trace__read_syscall_info(trace, id))
                goto out_cant_read;

        if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
                goto out_cant_read;

        return &trace->syscalls.table[id];

out_cant_read:
        if (verbose > 0) {
                fprintf(trace->output, "Problems reading syscall %d", id);
                if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
                        fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
                fputs(" information\n", trace->output);
        }
        return NULL;
}
1431
/*
 * Fold this syscall's duration (sys_exit time minus the recorded sys_enter
 * time) into the thread's per-syscall-id stats, used by --summary.
 * Allocation failures are silently ignored: stats are best-effort.
 */
static void thread__update_stats(struct thread_trace *ttrace,
                                 int id, struct perf_sample *sample)
{
        struct int_node *inode;
        struct stats *stats;
        u64 duration = 0;

        inode = intlist__findnew(ttrace->syscall_stats, id);
        if (inode == NULL)
                return;

        /* lazily allocate the stats for this syscall id */
        stats = inode->priv;
        if (stats == NULL) {
                stats = malloc(sizeof(struct stats));
                if (stats == NULL)
                        return;
                init_stats(stats);
                inode->priv = stats;
        }

        /* entry_time == 0 means we never saw the sys_enter; count duration 0 */
        if (ttrace->entry_time && sample->time > ttrace->entry_time)
                duration = sample->time - ttrace->entry_time;

        update_stats(stats, duration);
}
1457
/*
 * If the previously-current thread has a pending (unprinted) syscall entry
 * when another thread's event arrives, flush it now with a "...)" marker so
 * the output stays readable.  Returns the number of characters printed.
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
        struct thread_trace *ttrace;
        u64 duration;
        size_t printed;

        if (trace->current == NULL)
                return 0;

        ttrace = thread__priv(trace->current);

        if (!ttrace->entry_pending)
                return 0;

        duration = sample->time - ttrace->entry_time;

        printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
        printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
        /* entry flushed; sys_exit will not print it again */
        ttrace->entry_pending = false;

        return printed;
}
1480
1481 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1482                             union perf_event *event __maybe_unused,
1483                             struct perf_sample *sample)
1484 {
1485         char *msg;
1486         void *args;
1487         size_t printed = 0;
1488         struct thread *thread;
1489         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1490         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1491         struct thread_trace *ttrace;
1492
1493         if (sc == NULL)
1494                 return -1;
1495
1496         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1497         ttrace = thread__trace(thread, trace->output);
1498         if (ttrace == NULL)
1499                 goto out_put;
1500
1501         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1502
1503         if (ttrace->entry_str == NULL) {
1504                 ttrace->entry_str = malloc(trace__entry_str_size);
1505                 if (!ttrace->entry_str)
1506                         goto out_put;
1507         }
1508
1509         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1510                 trace__printf_interrupted_entry(trace, sample);
1511
1512         ttrace->entry_time = sample->time;
1513         msg = ttrace->entry_str;
1514         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1515
1516         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1517                                            args, trace, thread);
1518
1519         if (sc->is_exit) {
1520                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1521                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1522                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1523                 }
1524         } else {
1525                 ttrace->entry_pending = true;
1526                 /* See trace__vfs_getname & trace__sys_exit */
1527                 ttrace->filename.pending_open = false;
1528         }
1529
1530         if (trace->current != thread) {
1531                 thread__put(trace->current);
1532                 trace->current = thread__get(thread);
1533         }
1534         err = 0;
1535 out_put:
1536         thread__put(thread);
1537         return err;
1538 }
1539
1540 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1541                                     struct perf_sample *sample,
1542                                     struct callchain_cursor *cursor)
1543 {
1544         struct addr_location al;
1545
1546         if (machine__resolve(trace->host, &al, sample) < 0 ||
1547             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1548                 return -1;
1549
1550         return 0;
1551 }
1552
1553 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1554 {
1555         /* TODO: user-configurable print_opts */
1556         const unsigned int print_opts = EVSEL__PRINT_SYM |
1557                                         EVSEL__PRINT_DSO |
1558                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1559
1560         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1561 }
1562
/*
 * raw_syscalls:sys_exit handler: closes the line opened by trace__sys_enter,
 * printing the pending entry string (or "... [continued]"), the formatted
 * return value, and optionally the callchain.
 * Returns 0 on success, -1 when the syscall id or thread state is unusable.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * Successful open with a pathname captured by the vfs_getname probe:
	 * remember fd -> pathname so later syscalls on this fd can show it.
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* Compute syscall duration; honor --duration filtering either way. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was printed earlier (interrupted); mark continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Format the return value according to the per-syscall fmt hints. */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. fork/clone/wait): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
		/* NOTE(review): when the child thread isn't found, no return value is printed at all — confirm this is intended. */
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1662
/*
 * probe:vfs_getname handler: captures the pathname being resolved by the
 * kernel so the in-flight syscall entry string — and later the fd — can
 * show the filename instead of a raw pointer.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread filename buffer on demand; it never shrinks. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	/* Consumed by trace__sys_exit to associate the name with the new fd. */
	ttrace->filename.pending_open = true;

	/* No pending syscall entry is waiting for this name to be spliced in. */
	if (!ttrace->filename.ptr)
		goto out_put;

	/*
	 * Splice the filename into entry_str at the position recorded when the
	 * pointer argument was formatted, shifting the tail right to make room.
	 */
	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* If it doesn't fit, keep the tail (basename end) of the path. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	/* Reset the rendezvous state for the next pointer-arg syscall. */
	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1723
1724 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1725                                      union perf_event *event __maybe_unused,
1726                                      struct perf_sample *sample)
1727 {
1728         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1729         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1730         struct thread *thread = machine__findnew_thread(trace->host,
1731                                                         sample->pid,
1732                                                         sample->tid);
1733         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1734
1735         if (ttrace == NULL)
1736                 goto out_dump;
1737
1738         ttrace->runtime_ms += runtime_ms;
1739         trace->runtime_ms += runtime_ms;
1740 out_put:
1741         thread__put(thread);
1742         return 0;
1743
1744 out_dump:
1745         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1746                evsel->name,
1747                perf_evsel__strval(evsel, sample, "comm"),
1748                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1749                runtime,
1750                perf_evsel__intval(evsel, sample, "vruntime"));
1751         goto out_put;
1752 }
1753
1754 static void bpf_output__printer(enum binary_printer_ops op,
1755                                 unsigned int val, void *extra)
1756 {
1757         FILE *output = extra;
1758         unsigned char ch = (unsigned char)val;
1759
1760         switch (op) {
1761         case BINARY_PRINT_CHAR_DATA:
1762                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1763                 break;
1764         case BINARY_PRINT_DATA_BEGIN:
1765         case BINARY_PRINT_LINE_BEGIN:
1766         case BINARY_PRINT_ADDR:
1767         case BINARY_PRINT_NUM_DATA:
1768         case BINARY_PRINT_NUM_PAD:
1769         case BINARY_PRINT_SEP:
1770         case BINARY_PRINT_CHAR_PAD:
1771         case BINARY_PRINT_LINE_END:
1772         case BINARY_PRINT_DATA_END:
1773         default:
1774                 break;
1775         }
1776 }
1777
/* Dump a BPF output event's raw payload via bpf_output__printer. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	/* 8: presumably bytes per line for print_binary — TODO confirm */
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1784
/*
 * Generic handler for events requested via --event: prints timestamp and
 * event name, then either the BPF output payload or the tracepoint fields
 * formatted by libtraceevent, plus an optional callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	/* Close out any half-printed syscall entry line before ours. */
	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Empty duration column, to stay aligned with syscall lines. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1825
1826 static void print_location(FILE *f, struct perf_sample *sample,
1827                            struct addr_location *al,
1828                            bool print_dso, bool print_sym)
1829 {
1830
1831         if ((verbose > 0 || print_dso) && al->map)
1832                 fprintf(f, "%s@", al->map->dso->long_name);
1833
1834         if ((verbose > 0 || print_sym) && al->sym)
1835                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1836                         al->addr - al->sym->start);
1837         else if (al->map)
1838                 fprintf(f, "0x%" PRIx64, al->addr);
1839         else
1840                 fprintf(f, "0x%" PRIx64, sample->addr);
1841 }
1842
/*
 * Software page-fault event handler: bumps the per-thread min/maj fault
 * counters and, unless --summary-only, prints
 * "<maj|min>fault [faulting-ip] => faulted-address (<map-type><level>)".
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd': data map; 'x': executable; '?': unmapped */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction address in the code maps. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulted-on address: try data maps first ... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ... then code maps (e.g. faulting in text pages). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1918
1919 static void trace__set_base_time(struct trace *trace,
1920                                  struct perf_evsel *evsel,
1921                                  struct perf_sample *sample)
1922 {
1923         /*
1924          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1925          * and don't use sample->time unconditionally, we may end up having
1926          * some other event in the future without PERF_SAMPLE_TIME for good
1927          * reason, i.e. we may not be interested in its timestamps, just in
1928          * it taking place, picking some piece of information when it
1929          * appears in our event stream (vfs_getname comes to mind).
1930          */
1931         if (trace->base_time == 0 && !trace->full_time &&
1932             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1933                 trace->base_time = sample->time;
1934 }
1935
/*
 * perf_tool sample callback: routes each sample to the handler stashed on
 * its evsel, skipping samples from filtered threads. Always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	/* thread may be NULL here — presumably thread__put() tolerates that; confirm. */
	thread__put(thread);
	return err;
}
1962
1963 static int trace__record(struct trace *trace, int argc, const char **argv)
1964 {
1965         unsigned int rec_argc, i, j;
1966         const char **rec_argv;
1967         const char * const record_args[] = {
1968                 "record",
1969                 "-R",
1970                 "-m", "1024",
1971                 "-c", "1",
1972         };
1973
1974         const char * const sc_args[] = { "-e", };
1975         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1976         const char * const majpf_args[] = { "-e", "major-faults" };
1977         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1978         const char * const minpf_args[] = { "-e", "minor-faults" };
1979         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1980
1981         /* +1 is for the event string below */
1982         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1983                 majpf_args_nr + minpf_args_nr + argc;
1984         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1985
1986         if (rec_argv == NULL)
1987                 return -ENOMEM;
1988
1989         j = 0;
1990         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1991                 rec_argv[j++] = record_args[i];
1992
1993         if (trace->trace_syscalls) {
1994                 for (i = 0; i < sc_args_nr; i++)
1995                         rec_argv[j++] = sc_args[i];
1996
1997                 /* event string may be different for older kernels - e.g., RHEL6 */
1998                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1999                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2000                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2001                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2002                 else {
2003                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2004                         return -1;
2005                 }
2006         }
2007
2008         if (trace->trace_pgfaults & TRACE_PFMAJ)
2009                 for (i = 0; i < majpf_args_nr; i++)
2010                         rec_argv[j++] = majpf_args[i];
2011
2012         if (trace->trace_pgfaults & TRACE_PFMIN)
2013                 for (i = 0; i < minpf_args_nr; i++)
2014                         rec_argv[j++] = minpf_args[i];
2015
2016         for (i = 0; i < (unsigned int)argc; i++)
2017                 rec_argv[j++] = argv[i];
2018
2019         return cmd_record(j, rec_argv);
2020 }
2021
2022 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2023
2024 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2025 {
2026         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2027
2028         if (IS_ERR(evsel))
2029                 return false;
2030
2031         if (perf_evsel__field(evsel, "pathname") == NULL) {
2032                 perf_evsel__delete(evsel);
2033                 return false;
2034         }
2035
2036         evsel->handler = trace__vfs_getname;
2037         perf_evlist__add(evlist, evsel);
2038         return true;
2039 }
2040
2041 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2042 {
2043         struct perf_evsel *evsel;
2044         struct perf_event_attr attr = {
2045                 .type = PERF_TYPE_SOFTWARE,
2046                 .mmap_data = 1,
2047         };
2048
2049         attr.config = config;
2050         attr.sample_period = 1;
2051
2052         event_attr_init(&attr);
2053
2054         evsel = perf_evsel__new(&attr);
2055         if (evsel)
2056                 evsel->handler = trace__pgfault;
2057
2058         return evsel;
2059 }
2060
/*
 * Dispatch one event read from the ring buffer: non-sample records go to
 * trace__process_event(); samples are matched back to their evsel via the
 * sample id and passed to that evsel's handler.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample without a payload cannot be parsed: report and drop. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2089
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, initialize
 * the cached tracepoint field accessors ("args" ptr, "ret" uint), and add
 * both to the evlist. Returns 0 on success, -1 on failure after deleting
 * whatever was created (goto-cleanup ladder at the bottom).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2135
/*
 * Build an "id in (...)" / "id not in (...)" tracepoint filter from the
 * syscall-id qualifier list (trace->ev_qualifier_ids) and append it to both
 * the sys_enter and sys_exit evsels.
 * Returns 0 on success, -1 on failure; errno is set to ENOMEM when the
 * filter string could not be allocated.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Zero return means success; only then apply the same filter to sys_exit. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2160
2161 static int trace__run(struct trace *trace, int argc, const char **argv)
2162 {
2163         struct perf_evlist *evlist = trace->evlist;
2164         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2165         int err = -1, i;
2166         unsigned long before;
2167         const bool forks = argc > 0;
2168         bool draining = false;
2169
2170         trace->live = true;
2171
2172         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2173                 goto out_error_raw_syscalls;
2174
2175         if (trace->trace_syscalls)
2176                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2177
2178         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2179                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2180                 if (pgfault_maj == NULL)
2181                         goto out_error_mem;
2182                 perf_evlist__add(evlist, pgfault_maj);
2183         }
2184
2185         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2186                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2187                 if (pgfault_min == NULL)
2188                         goto out_error_mem;
2189                 perf_evlist__add(evlist, pgfault_min);
2190         }
2191
2192         if (trace->sched &&
2193             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2194                                    trace__sched_stat_runtime))
2195                 goto out_error_sched_stat_runtime;
2196
2197         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2198         if (err < 0) {
2199                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2200                 goto out_delete_evlist;
2201         }
2202
2203         err = trace__symbols_init(trace, evlist);
2204         if (err < 0) {
2205                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2206                 goto out_delete_evlist;
2207         }
2208
2209         perf_evlist__config(evlist, &trace->opts, NULL);
2210
2211         if (callchain_param.enabled) {
2212                 bool use_identifier = false;
2213
2214                 if (trace->syscalls.events.sys_exit) {
2215                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2216                                                      &trace->opts, &callchain_param);
2217                         use_identifier = true;
2218                 }
2219
2220                 if (pgfault_maj) {
2221                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2222                         use_identifier = true;
2223                 }
2224
2225                 if (pgfault_min) {
2226                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2227                         use_identifier = true;
2228                 }
2229
2230                 if (use_identifier) {
2231                        /*
2232                         * Now we have evsels with different sample_ids, use
2233                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2234                         * from a fixed position in each ring buffer record.
2235                         *
2236                         * As of this the changeset introducing this comment, this
2237                         * isn't strictly needed, as the fields that can come before
2238                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2239                         * some of those for things like copying the payload of
2240                         * pointer syscall arguments, and for vfs_getname we don't
2241                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2242                         * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
2243                         */
2244                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2245                         perf_evlist__reset_sample_bit(evlist, ID);
2246                 }
2247         }
2248
2249         signal(SIGCHLD, sig_handler);
2250         signal(SIGINT, sig_handler);
2251
2252         if (forks) {
2253                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2254                                                     argv, false, NULL);
2255                 if (err < 0) {
2256                         fprintf(trace->output, "Couldn't run the workload!\n");
2257                         goto out_delete_evlist;
2258                 }
2259         }
2260
2261         err = perf_evlist__open(evlist);
2262         if (err < 0)
2263                 goto out_error_open;
2264
2265         err = bpf__apply_obj_config();
2266         if (err) {
2267                 char errbuf[BUFSIZ];
2268
2269                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2270                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2271                          errbuf);
2272                 goto out_error_open;
2273         }
2274
2275         /*
2276          * Better not use !target__has_task() here because we need to cover the
2277          * case where no threads were specified in the command line, but a
2278          * workload was, and in that case we will fill in the thread_map when
2279          * we fork the workload in perf_evlist__prepare_workload.
2280          */
2281         if (trace->filter_pids.nr > 0)
2282                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2283         else if (thread_map__pid(evlist->threads, 0) == -1)
2284                 err = perf_evlist__set_filter_pid(evlist, getpid());
2285
2286         if (err < 0)
2287                 goto out_error_mem;
2288
2289         if (trace->ev_qualifier_ids.nr > 0) {
2290                 err = trace__set_ev_qualifier_filter(trace);
2291                 if (err < 0)
2292                         goto out_errno;
2293
2294                 pr_debug("event qualifier tracepoint filter: %s\n",
2295                          trace->syscalls.events.sys_exit->filter);
2296         }
2297
2298         err = perf_evlist__apply_filters(evlist, &evsel);
2299         if (err < 0)
2300                 goto out_error_apply_filters;
2301
2302         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2303         if (err < 0)
2304                 goto out_error_mmap;
2305
2306         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2307                 perf_evlist__enable(evlist);
2308
2309         if (forks)
2310                 perf_evlist__start_workload(evlist);
2311
2312         if (trace->opts.initial_delay) {
2313                 usleep(trace->opts.initial_delay * 1000);
2314                 perf_evlist__enable(evlist);
2315         }
2316
2317         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2318                                   evlist->threads->nr > 1 ||
2319                                   perf_evlist__first(evlist)->attr.inherit;
2320 again:
2321         before = trace->nr_events;
2322
2323         for (i = 0; i < evlist->nr_mmaps; i++) {
2324                 union perf_event *event;
2325
2326                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2327                         struct perf_sample sample;
2328
2329                         ++trace->nr_events;
2330
2331                         err = perf_evlist__parse_sample(evlist, event, &sample);
2332                         if (err) {
2333                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2334                                 goto next_event;
2335                         }
2336
2337                         trace__handle_event(trace, event, &sample);
2338 next_event:
2339                         perf_evlist__mmap_consume(evlist, i);
2340
2341                         if (interrupted)
2342                                 goto out_disable;
2343
2344                         if (done && !draining) {
2345                                 perf_evlist__disable(evlist);
2346                                 draining = true;
2347                         }
2348                 }
2349         }
2350
2351         if (trace->nr_events == before) {
2352                 int timeout = done ? 100 : -1;
2353
2354                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2355                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2356                                 draining = true;
2357
2358                         goto again;
2359                 }
2360         } else {
2361                 goto again;
2362         }
2363
2364 out_disable:
2365         thread__zput(trace->current);
2366
2367         perf_evlist__disable(evlist);
2368
2369         if (!err) {
2370                 if (trace->summary)
2371                         trace__fprintf_thread_summary(trace, trace->output);
2372
2373                 if (trace->show_tool_stats) {
2374                         fprintf(trace->output, "Stats:\n "
2375                                                " vfs_getname : %" PRIu64 "\n"
2376                                                " proc_getname: %" PRIu64 "\n",
2377                                 trace->stats.vfs_getname,
2378                                 trace->stats.proc_getname);
2379                 }
2380         }
2381
2382 out_delete_evlist:
2383         perf_evlist__delete(evlist);
2384         trace->evlist = NULL;
2385         trace->live = false;
2386         return err;
2387 {
2388         char errbuf[BUFSIZ];
2389
2390 out_error_sched_stat_runtime:
2391         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2392         goto out_error;
2393
2394 out_error_raw_syscalls:
2395         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2396         goto out_error;
2397
2398 out_error_mmap:
2399         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2400         goto out_error;
2401
2402 out_error_open:
2403         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2404
2405 out_error:
2406         fprintf(trace->output, "%s\n", errbuf);
2407         goto out_delete_evlist;
2408
2409 out_error_apply_filters:
2410         fprintf(trace->output,
2411                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2412                 evsel->filter, perf_evsel__name(evsel), errno,
2413                 str_error_r(errno, errbuf, sizeof(errbuf)));
2414         goto out_delete_evlist;
2415 }
2416 out_error_mem:
2417         fprintf(trace->output, "Not enough memory to run!\n");
2418         goto out_delete_evlist;
2419
2420 out_errno:
2421         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2422         goto out_delete_evlist;
2423 }
2424
2425 static int trace__replay(struct trace *trace)
2426 {
2427         const struct perf_evsel_str_handler handlers[] = {
2428                 { "probe:vfs_getname",       trace__vfs_getname, },
2429         };
2430         struct perf_data_file file = {
2431                 .path  = input_name,
2432                 .mode  = PERF_DATA_MODE_READ,
2433                 .force = trace->force,
2434         };
2435         struct perf_session *session;
2436         struct perf_evsel *evsel;
2437         int err = -1;
2438
2439         trace->tool.sample        = trace__process_sample;
2440         trace->tool.mmap          = perf_event__process_mmap;
2441         trace->tool.mmap2         = perf_event__process_mmap2;
2442         trace->tool.comm          = perf_event__process_comm;
2443         trace->tool.exit          = perf_event__process_exit;
2444         trace->tool.fork          = perf_event__process_fork;
2445         trace->tool.attr          = perf_event__process_attr;
2446         trace->tool.tracing_data  = perf_event__process_tracing_data;
2447         trace->tool.build_id      = perf_event__process_build_id;
2448         trace->tool.namespaces    = perf_event__process_namespaces;
2449
2450         trace->tool.ordered_events = true;
2451         trace->tool.ordering_requires_timestamps = true;
2452
2453         /* add tid to output */
2454         trace->multiple_threads = true;
2455
2456         session = perf_session__new(&file, false, &trace->tool);
2457         if (session == NULL)
2458                 return -1;
2459
2460         if (trace->opts.target.pid)
2461                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2462
2463         if (trace->opts.target.tid)
2464                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2465
2466         if (symbol__init(&session->header.env) < 0)
2467                 goto out;
2468
2469         trace->host = &session->machines.host;
2470
2471         err = perf_session__set_tracepoints_handlers(session, handlers);
2472         if (err)
2473                 goto out;
2474
2475         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2476                                                      "raw_syscalls:sys_enter");
2477         /* older kernels have syscalls tp versus raw_syscalls */
2478         if (evsel == NULL)
2479                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2480                                                              "syscalls:sys_enter");
2481
2482         if (evsel &&
2483             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2484             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2485                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2486                 goto out;
2487         }
2488
2489         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2490                                                      "raw_syscalls:sys_exit");
2491         if (evsel == NULL)
2492                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2493                                                              "syscalls:sys_exit");
2494         if (evsel &&
2495             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2496             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2497                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2498                 goto out;
2499         }
2500
2501         evlist__for_each_entry(session->evlist, evsel) {
2502                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2503                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2504                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2505                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2506                         evsel->handler = trace__pgfault;
2507         }
2508
2509         setup_pager();
2510
2511         err = perf_session__process_events(session);
2512         if (err)
2513                 pr_err("Failed to process events, error %d", err);
2514
2515         else if (trace->summary)
2516                 trace__fprintf_thread_summary(trace, trace->output);
2517
2518 out:
2519         perf_session__delete(session);
2520
2521         return err;
2522 }
2523
/* Print the banner that precedes the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2532
/*
 * Re-sort a thread's per-syscall stats (an intlist keyed by syscall id,
 * each node's ->priv holding a struct stats) so the syscall with the
 * largest total time comes first.  The comparison expression and the
 * entry-filling body below are stitched into generated code by the
 * DEFINE_RESORT_RB() machinery in rb_resort.h.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = call count * average duration, scaled from ns to ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2546
2547 static size_t thread__dump_stats(struct thread_trace *ttrace,
2548                                  struct trace *trace, FILE *fp)
2549 {
2550         size_t printed = 0;
2551         struct syscall *sc;
2552         struct rb_node *nd;
2553         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2554
2555         if (syscall_stats == NULL)
2556                 return 0;
2557
2558         printed += fprintf(fp, "\n");
2559
2560         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2561         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2562         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2563
2564         resort_rb__for_each_entry(nd, syscall_stats) {
2565                 struct stats *stats = syscall_stats_entry->stats;
2566                 if (stats) {
2567                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2568                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2569                         double avg = avg_stats(stats);
2570                         double pct;
2571                         u64 n = (u64) stats->n;
2572
2573                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2574                         avg /= NSEC_PER_MSEC;
2575
2576                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2577                         printed += fprintf(fp, "   %-15s", sc->name);
2578                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2579                                            n, syscall_stats_entry->msecs, min, avg);
2580                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2581                 }
2582         }
2583
2584         resort_rb__delete(syscall_stats);
2585         printed += fprintf(fp, "\n\n");
2586
2587         return printed;
2588 }
2589
2590 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2591 {
2592         size_t printed = 0;
2593         struct thread_trace *ttrace = thread__priv(thread);
2594         double ratio;
2595
2596         if (ttrace == NULL)
2597                 return 0;
2598
2599         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2600
2601         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2602         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2603         printed += fprintf(fp, "%.1f%%", ratio);
2604         if (ttrace->pfmaj)
2605                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2606         if (ttrace->pfmin)
2607                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2608         if (trace->sched)
2609                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2610         else if (fputc('\n', fp) != EOF)
2611                 ++printed;
2612
2613         printed += thread__dump_stats(ttrace, trace, fp);
2614
2615         return printed;
2616 }
2617
2618 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2619 {
2620         return ttrace ? ttrace->nr_events : 0;
2621 }
2622
/*
 * Re-sort the machine's threads so the one with the most events comes
 * first out of the resort rb_tree; the comparison and the entry-filling
 * body are glued together by DEFINE_RESORT_RB() from rb_resort.h.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2629
2630 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2631 {
2632         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2633         size_t printed = trace__fprintf_threads_header(fp);
2634         struct rb_node *nd;
2635
2636         if (threads == NULL) {
2637                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2638                 return 0;
2639         }
2640
2641         resort_rb__for_each_entry(nd, threads)
2642                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2643
2644         resort_rb__delete(threads);
2645
2646         return printed;
2647 }
2648
2649 static int trace__set_duration(const struct option *opt, const char *str,
2650                                int unset __maybe_unused)
2651 {
2652         struct trace *trace = opt->value;
2653
2654         trace->duration_filter = atof(str);
2655         return 0;
2656 }
2657
2658 static int trace__set_filter_pids(const struct option *opt, const char *str,
2659                                   int unset __maybe_unused)
2660 {
2661         int ret = -1;
2662         size_t i;
2663         struct trace *trace = opt->value;
2664         /*
2665          * FIXME: introduce a intarray class, plain parse csv and create a
2666          * { int nr, int entries[] } struct...
2667          */
2668         struct intlist *list = intlist__new(str);
2669
2670         if (list == NULL)
2671                 return -1;
2672
2673         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2674         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2675
2676         if (trace->filter_pids.entries == NULL)
2677                 goto out;
2678
2679         trace->filter_pids.entries[0] = getpid();
2680
2681         for (i = 1; i < trace->filter_pids.nr; ++i)
2682                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2683
2684         intlist__delete(list);
2685         ret = 0;
2686 out:
2687         return ret;
2688 }
2689
2690 static int trace__open_output(struct trace *trace, const char *filename)
2691 {
2692         struct stat st;
2693
2694         if (!stat(filename, &st) && st.st_size) {
2695                 char oldname[PATH_MAX];
2696
2697                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2698                 unlink(oldname);
2699                 rename(filename, oldname);
2700         }
2701
2702         trace->output = fopen(filename, "w");
2703
2704         return trace->output == NULL ? -errno : 0;
2705 }
2706
2707 static int parse_pagefaults(const struct option *opt, const char *str,
2708                             int unset __maybe_unused)
2709 {
2710         int *trace_pgfaults = opt->value;
2711
2712         if (strcmp(str, "all") == 0)
2713                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2714         else if (strcmp(str, "maj") == 0)
2715                 *trace_pgfaults |= TRACE_PFMAJ;
2716         else if (strcmp(str, "min") == 0)
2717                 *trace_pgfaults |= TRACE_PFMIN;
2718         else
2719                 return -1;
2720
2721         return 0;
2722 }
2723
2724 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2725 {
2726         struct perf_evsel *evsel;
2727
2728         evlist__for_each_entry(evlist, evsel)
2729                 evsel->handler = handler;
2730 }
2731
2732 /*
2733  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2734  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2735  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2736  *
2737  * It'd be better to introduce a parse_options() variant that would return a
2738  * list with the terms it didn't match to an event...
2739  */
2740 static int trace__parse_events_option(const struct option *opt, const char *str,
2741                                       int unset __maybe_unused)
2742 {
2743         struct trace *trace = (struct trace *)opt->value;
2744         const char *s = str;
2745         char *sep = NULL, *lists[2] = { NULL, NULL, };
2746         int len = strlen(str), err = -1, list;
2747         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2748         char group_name[PATH_MAX];
2749
2750         if (strace_groups_dir == NULL)
2751                 return -1;
2752
2753         if (*s == '!') {
2754                 ++s;
2755                 trace->not_ev_qualifier = true;
2756         }
2757
2758         while (1) {
2759                 if ((sep = strchr(s, ',')) != NULL)
2760                         *sep = '\0';
2761
2762                 list = 0;
2763                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2764                         list = 1;
2765                 } else {
2766                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2767                         if (access(group_name, R_OK) == 0)
2768                                 list = 1;
2769                 }
2770
2771                 if (lists[list]) {
2772                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2773                 } else {
2774                         lists[list] = malloc(len);
2775                         if (lists[list] == NULL)
2776                                 goto out;
2777                         strcpy(lists[list], s);
2778                 }
2779
2780                 if (!sep)
2781                         break;
2782
2783                 *sep = ',';
2784                 s = sep + 1;
2785         }
2786
2787         if (lists[1] != NULL) {
2788                 struct strlist_config slist_config = {
2789                         .dirname = strace_groups_dir,
2790                 };
2791
2792                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2793                 if (trace->ev_qualifier == NULL) {
2794                         fputs("Not enough memory to parse event qualifier", trace->output);
2795                         goto out;
2796                 }
2797
2798                 if (trace__validate_ev_qualifier(trace))
2799                         goto out;
2800         }
2801
2802         err = 0;
2803
2804         if (lists[0]) {
2805                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2806                                                "event selector. use 'perf list' to list available events",
2807                                                parse_events_option);
2808                 err = parse_events_option(&o, lists[0], 0);
2809         }
2810 out:
2811         if (sep)
2812                 *sep = ',';
2813
2814         return err;
2815 }
2816
/*
 * Entry point for 'perf trace': build the option table, parse the command
 * line, then dispatch to 'perf trace record' (trace__record()), replay of
 * a perf.data file (trace__replay() when -i was given) or live tracing
 * (trace__run()).
 *
 * Returns 0 on success, negative on error.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Defaults: UINT_MAX/ULLONG_MAX act as "not set by the user"
	 * sentinels that are resolved after option parsing below.
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	/* --expr shares the -e parser: syscalls vs other events are split there */
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace if perf itself crashes. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault events carry the faulting address and a timestamp. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinel means the user did not pass -m/--mmap-pages. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	/* Likewise for --max-stack: resolve the sentinel to a real default. */
	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack depth options imply callchains; default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* root gets a larger ring buffer unless -m was given explicitly */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/*
	 * NOTE(review): the early returns here and below bypass the 'out:'
	 * label; trace.evlist/trace.sctbl are not released on these paths.
	 */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cached because the open syscall gets special filename handling. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}