1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  The default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
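/*
 * Illustrative invocations (not an exhaustive list):
 *
 *   perf trace ls                    # strace-like view of a workload's syscalls
 *   perf trace -p 1234               # attach to an existing PID
 *   perf trace -a --event sched:sched_switch sleep 1   # add other events, system wide
 */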
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace/beauty/beauty.h"
35 #include "trace-event.h"
36 #include "util/parse-events.h"
37 #include "util/bpf-loader.h"
38 #include "callchain.h"
39 #include "syscalltbl.h"
40 #include "rb_resort.h"
41
42 #include <inttypes.h>
43 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
44 #include <stdlib.h>
45 #include <string.h>
46 #include <linux/err.h>
47 #include <linux/filter.h>
48 #include <linux/audit.h>
49 #include <linux/kernel.h>
50 #include <linux/random.h>
51 #include <linux/stringify.h>
52 #include <linux/time64.h>
53
54 #include "sane_ctype.h"
55
56 #ifndef O_CLOEXEC
57 # define O_CLOEXEC              02000000
58 #endif
59
60 struct trace {
61         struct perf_tool        tool;
62         struct syscalltbl       *sctbl;
63         struct {
64                 int             max;
65                 struct syscall  *table;
66                 struct {
67                         struct perf_evsel *sys_enter,
68                                           *sys_exit;
69                 }               events;
70         } syscalls;
71         struct record_opts      opts;
72         struct perf_evlist      *evlist;
73         struct machine          *host;
74         struct thread           *current;
75         u64                     base_time;
76         FILE                    *output;
77         unsigned long           nr_events;
78         struct strlist          *ev_qualifier;
79         struct {
80                 size_t          nr;
81                 int             *entries;
82         }                       ev_qualifier_ids;
83         struct {
84                 size_t          nr;
85                 pid_t           *entries;
86         }                       filter_pids;
87         double                  duration_filter;
88         double                  runtime_ms;
89         struct {
90                 u64             vfs_getname,
91                                 proc_getname;
92         } stats;
93         unsigned int            max_stack;
94         unsigned int            min_stack;
95         bool                    not_ev_qualifier;
96         bool                    live;
97         bool                    full_time;
98         bool                    sched;
99         bool                    multiple_threads;
100         bool                    summary;
101         bool                    summary_only;
102         bool                    show_comm;
103         bool                    show_tool_stats;
104         bool                    trace_syscalls;
105         bool                    kernel_syscallchains;
106         bool                    force;
107         bool                    vfs_getname;
108         int                     trace_pgfaults;
109         int                     open_id;
110 };
111
112 struct tp_field {
113         int offset;
114         union {
115                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
116                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
117         };
118 };
119
120 #define TP_UINT_FIELD(bits) \
121 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
122 { \
123         u##bits value; \
124         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
125         return value;  \
126 }
127
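/*
 * Each TP_UINT_FIELD(bits) instantiation below defines a tp_field__u##bits()
 * accessor that memcpy()s an unsigned integer of that width out of
 * sample->raw_data at field->offset, e.g. TP_UINT_FIELD(32) defines
 * tp_field__u32().
 */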
128 TP_UINT_FIELD(8);
129 TP_UINT_FIELD(16);
130 TP_UINT_FIELD(32);
131 TP_UINT_FIELD(64);
132
133 #define TP_UINT_FIELD__SWAPPED(bits) \
134 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
135 { \
136         u##bits value; \
137         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
138         return bswap_##bits(value);\
139 }
140
141 TP_UINT_FIELD__SWAPPED(16);
142 TP_UINT_FIELD__SWAPPED(32);
143 TP_UINT_FIELD__SWAPPED(64);
144
145 static int tp_field__init_uint(struct tp_field *field,
146                                struct format_field *format_field,
147                                bool needs_swap)
148 {
149         field->offset = format_field->offset;
150
151         switch (format_field->size) {
152         case 1:
153                 field->integer = tp_field__u8;
154                 break;
155         case 2:
156                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
157                 break;
158         case 4:
159                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
160                 break;
161         case 8:
162                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
163                 break;
164         default:
165                 return -1;
166         }
167
168         return 0;
169 }
170
171 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
172 {
173         return sample->raw_data + field->offset;
174 }
175
176 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
177 {
178         field->offset = format_field->offset;
179         field->pointer = tp_field__ptr;
180         return 0;
181 }
182
183 struct syscall_tp {
184         struct tp_field id;
185         union {
186                 struct tp_field args, ret;
187         };
188 };
189
190 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
191                                           struct tp_field *field,
192                                           const char *name)
193 {
194         struct format_field *format_field = perf_evsel__field(evsel, name);
195
196         if (format_field == NULL)
197                 return -1;
198
199         return tp_field__init_uint(field, format_field, evsel->needs_swap);
200 }
201
202 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
203         ({ struct syscall_tp *sc = evsel->priv;\
204            perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
205
206 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
207                                          struct tp_field *field,
208                                          const char *name)
209 {
210         struct format_field *format_field = perf_evsel__field(evsel, name);
211
212         if (format_field == NULL)
213                 return -1;
214
215         return tp_field__init_ptr(field, format_field);
216 }
217
218 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
219         ({ struct syscall_tp *sc = evsel->priv;\
220            perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
221
222 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
223 {
224         zfree(&evsel->priv);
225         perf_evsel__delete(evsel);
226 }
227
228 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
229 {
230         evsel->priv = malloc(sizeof(struct syscall_tp));
231         if (evsel->priv != NULL) {
232                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
233                         goto out_delete;
234
235                 evsel->handler = handler;
236                 return 0;
237         }
238
239         return -ENOMEM;
240
241 out_delete:
242         zfree(&evsel->priv);
243         return -ENOENT;
244 }
245
246 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
247 {
248         struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
249
250         /* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
251         if (IS_ERR(evsel))
252                 evsel = perf_evsel__newtp("syscalls", direction);
253
254         if (IS_ERR(evsel))
255                 return NULL;
256
257         if (perf_evsel__init_syscall_tp(evsel, handler))
258                 goto out_delete;
259
260         return evsel;
261
262 out_delete:
263         perf_evsel__delete_priv(evsel);
264         return NULL;
265 }
266
267 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
268         ({ struct syscall_tp *fields = evsel->priv; \
269            fields->name.integer(&fields->name, sample); })
270
271 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
272         ({ struct syscall_tp *fields = evsel->priv; \
273            fields->name.pointer(&fields->name, sample); })
274
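/*
 * With the accessors set up by perf_evsel__init_syscall_tp(), e.g.
 * perf_evsel__sc_tp_uint(evsel, id, sample) reads the tracepoint's 'id'
 * field straight out of the sample payload.
 */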
275 struct strarray {
276         int         offset;
277         int         nr_entries;
278         const char **entries;
279 };
280
281 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
282         .nr_entries = ARRAY_SIZE(array), \
283         .entries = array, \
284 }
285
286 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
287         .offset     = off, \
288         .nr_entries = ARRAY_SIZE(array), \
289         .entries = array, \
290 }
291
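/*
 * The _OFFSET variant is for arrays whose first entry maps to a non-zero
 * value, e.g. DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1) below makes an
 * arg->val of 1 (EPOLL_CTL_ADD) print as "ADD".
 */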
292 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
293                                                 const char *intfmt,
294                                                 struct syscall_arg *arg)
295 {
296         struct strarray *sa = arg->parm;
297         int idx = arg->val - sa->offset;
298
299         if (idx < 0 || idx >= sa->nr_entries)
300                 return scnprintf(bf, size, intfmt, arg->val);
301
302         return scnprintf(bf, size, "%s", sa->entries[idx]);
303 }
304
305 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
306                                               struct syscall_arg *arg)
307 {
308         return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
309 }
310
311 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
312
313 #if defined(__i386__) || defined(__x86_64__)
314 /*
315  * FIXME: Make this available to all arches as soon as the ioctl beautifier
316  *        gets rewritten to support all arches.
317  */
318 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
319                                                  struct syscall_arg *arg)
320 {
321         return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
322 }
323
324 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
325 #endif /* defined(__i386__) || defined(__x86_64__) */
326
327 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
328                                         struct syscall_arg *arg);
329
330 #define SCA_FD syscall_arg__scnprintf_fd
331
332 #ifndef AT_FDCWD
333 #define AT_FDCWD        -100
334 #endif
335
336 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
337                                            struct syscall_arg *arg)
338 {
339         int fd = arg->val;
340
341         if (fd == AT_FDCWD)
342                 return scnprintf(bf, size, "CWD");
343
344         return syscall_arg__scnprintf_fd(bf, size, arg);
345 }
346
347 #define SCA_FDAT syscall_arg__scnprintf_fd_at
348
349 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
350                                               struct syscall_arg *arg);
351
352 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
353
354 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
355                                          struct syscall_arg *arg)
356 {
357         return scnprintf(bf, size, "%#lx", arg->val);
358 }
359
360 #define SCA_HEX syscall_arg__scnprintf_hex
361
362 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
363                                          struct syscall_arg *arg)
364 {
365         return scnprintf(bf, size, "%d", arg->val);
366 }
367
368 #define SCA_INT syscall_arg__scnprintf_int
369
370 static const char *bpf_cmd[] = {
371         "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
372         "MAP_GET_NEXT_KEY", "PROG_LOAD",
373 };
374 static DEFINE_STRARRAY(bpf_cmd);
375
376 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
377 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
378
379 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
380 static DEFINE_STRARRAY(itimers);
381
382 static const char *keyctl_options[] = {
383         "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
384         "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
385         "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
386         "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
387         "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
388 };
389 static DEFINE_STRARRAY(keyctl_options);
390
391 static const char *whences[] = { "SET", "CUR", "END",
392 #ifdef SEEK_DATA
393 "DATA",
394 #endif
395 #ifdef SEEK_HOLE
396 "HOLE",
397 #endif
398 };
399 static DEFINE_STRARRAY(whences);
400
401 static const char *fcntl_cmds[] = {
402         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
403         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
404         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
405         "F_GETOWNER_UIDS",
406 };
407 static DEFINE_STRARRAY(fcntl_cmds);
408
409 static const char *rlimit_resources[] = {
410         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
411         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
412         "RTTIME",
413 };
414 static DEFINE_STRARRAY(rlimit_resources);
415
416 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
417 static DEFINE_STRARRAY(sighow);
418
419 static const char *clockid[] = {
420         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
421         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
422         "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
423 };
424 static DEFINE_STRARRAY(clockid);
425
426 static const char *socket_families[] = {
427         "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
428         "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
429         "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
430         "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
431         "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
432         "ALG", "NFC", "VSOCK",
433 };
434 static DEFINE_STRARRAY(socket_families);
435
436 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
437                                                  struct syscall_arg *arg)
438 {
439         size_t printed = 0;
440         int mode = arg->val;
441
442         if (mode == F_OK) /* 0 */
443                 return scnprintf(bf, size, "F");
444 #define P_MODE(n) \
445         if (mode & n##_OK) { \
446                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
447                 mode &= ~n##_OK; \
448         }
449
450         P_MODE(R);
451         P_MODE(W);
452         P_MODE(X);
453 #undef P_MODE
454
455         if (mode)
456                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
457
458         return printed;
459 }
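/* e.g. access(path, R_OK|W_OK) gets its mode argument printed as "RW". */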
460
461 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
462
463 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
464                                               struct syscall_arg *arg);
465
466 #define SCA_FILENAME syscall_arg__scnprintf_filename
467
468 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
469                                                 struct syscall_arg *arg)
470 {
471         int printed = 0, flags = arg->val;
472
473 #define P_FLAG(n) \
474         if (flags & O_##n) { \
475                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
476                 flags &= ~O_##n; \
477         }
478
479         P_FLAG(CLOEXEC);
480         P_FLAG(NONBLOCK);
481 #undef P_FLAG
482
483         if (flags)
484                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
485
486         return printed;
487 }
488
489 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
490
491 #if defined(__i386__) || defined(__x86_64__)
492 /*
493  * FIXME: Make this available to all arches.
494  */
495 #define TCGETS          0x5401
496
497 static const char *tioctls[] = {
498         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
499         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
500         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
501         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
502         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
503         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
504         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
505         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
506         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
507         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
508         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
509         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
510         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
511         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
512         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
513 };
514
515 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
516 #endif /* defined(__i386__) || defined(__x86_64__) */
517
518 #ifndef GRND_NONBLOCK
519 #define GRND_NONBLOCK   0x0001
520 #endif
521 #ifndef GRND_RANDOM
522 #define GRND_RANDOM     0x0002
523 #endif
524
525 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
526                                                    struct syscall_arg *arg)
527 {
528         int printed = 0, flags = arg->val;
529
530 #define P_FLAG(n) \
531         if (flags & GRND_##n) { \
532                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
533                 flags &= ~GRND_##n; \
534         }
535
536         P_FLAG(RANDOM);
537         P_FLAG(NONBLOCK);
538 #undef P_FLAG
539
540         if (flags)
541                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542
543         return printed;
544 }
545
546 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547
548 #define STRARRAY(arg, name, array) \
549           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
550           .arg_parm      = { [arg] = &strarray__##array, }
551
552 #include "trace/beauty/eventfd.c"
553 #include "trace/beauty/flock.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/mmap.c"
556 #include "trace/beauty/mode_t.c"
557 #include "trace/beauty/msg_flags.c"
558 #include "trace/beauty/open_flags.c"
559 #include "trace/beauty/perf_event_open.c"
560 #include "trace/beauty/pid.c"
561 #include "trace/beauty/sched_policy.c"
562 #include "trace/beauty/seccomp.c"
563 #include "trace/beauty/signum.c"
564 #include "trace/beauty/socket_type.c"
565 #include "trace/beauty/waitid_options.c"
566
567 static struct syscall_fmt {
568         const char *name;
569         const char *alias;
570         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
571         void       *arg_parm[6];
572         bool       errmsg;
573         bool       errpid;
574         bool       timeout;
575         bool       hexret;
576 } syscall_fmts[] = {
577         { .name     = "access",     .errmsg = true,
578           .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
579         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
580         { .name     = "bpf",        .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
581         { .name     = "brk",        .hexret = true,
582           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
583         { .name     = "chdir",      .errmsg = true, },
584         { .name     = "chmod",      .errmsg = true, },
585         { .name     = "chroot",     .errmsg = true, },
586         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
587         { .name     = "clone",      .errpid = true, },
588         { .name     = "close",      .errmsg = true,
589           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
590         { .name     = "connect",    .errmsg = true, },
591         { .name     = "creat",      .errmsg = true, },
592         { .name     = "dup",        .errmsg = true, },
593         { .name     = "dup2",       .errmsg = true, },
594         { .name     = "dup3",       .errmsg = true, },
595         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
596         { .name     = "eventfd2",   .errmsg = true,
597           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
598         { .name     = "faccessat",  .errmsg = true, },
599         { .name     = "fadvise64",  .errmsg = true, },
600         { .name     = "fallocate",  .errmsg = true, },
601         { .name     = "fchdir",     .errmsg = true, },
602         { .name     = "fchmod",     .errmsg = true, },
603         { .name     = "fchmodat",   .errmsg = true,
604           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
605         { .name     = "fchown",     .errmsg = true, },
606         { .name     = "fchownat",   .errmsg = true,
607           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
608         { .name     = "fcntl",      .errmsg = true,
609           .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
610           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
611         { .name     = "fdatasync",  .errmsg = true, },
612         { .name     = "flock",      .errmsg = true,
613           .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
614         { .name     = "fsetxattr",  .errmsg = true, },
615         { .name     = "fstat",      .errmsg = true, .alias = "newfstat", },
616         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat", },
617         { .name     = "fstatfs",    .errmsg = true, },
618         { .name     = "fsync",    .errmsg = true, },
619         { .name     = "ftruncate", .errmsg = true, },
620         { .name     = "futex",      .errmsg = true,
621           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
622         { .name     = "futimesat", .errmsg = true,
623           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
624         { .name     = "getdents",   .errmsg = true, },
625         { .name     = "getdents64", .errmsg = true, },
626         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
627         { .name     = "getpgid",    .errpid = true, },
628         { .name     = "getpid",     .errpid = true, },
629         { .name     = "getppid",    .errpid = true, },
630         { .name     = "getrandom",  .errmsg = true,
631           .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
632         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
633         { .name     = "getxattr",   .errmsg = true, },
634         { .name     = "inotify_add_watch",          .errmsg = true, },
635         { .name     = "ioctl",      .errmsg = true,
636           .arg_scnprintf = {
637 #if defined(__i386__) || defined(__x86_64__)
638 /*
639  * FIXME: Make this available to all arches.
640  */
641                              [1] = SCA_STRHEXARRAY, /* cmd */
642                              [2] = SCA_HEX, /* arg */ },
643           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
644 #else
645                              [2] = SCA_HEX, /* arg */ }, },
646 #endif
647         { .name     = "keyctl",     .errmsg = true, STRARRAY(0, option, keyctl_options), },
648         { .name     = "kill",       .errmsg = true,
649           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
650         { .name     = "lchown",    .errmsg = true, },
651         { .name     = "lgetxattr",  .errmsg = true, },
652         { .name     = "linkat",     .errmsg = true,
653           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
654         { .name     = "listxattr",  .errmsg = true, },
655         { .name     = "llistxattr", .errmsg = true, },
656         { .name     = "lremovexattr",  .errmsg = true, },
657         { .name     = "lseek",      .errmsg = true,
658           .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
659           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
660         { .name     = "lsetxattr",  .errmsg = true, },
661         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
662         { .name     = "lsxattr",    .errmsg = true, },
663         { .name     = "madvise",    .errmsg = true,
664           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
665                              [2] = SCA_MADV_BHV, /* behavior */ }, },
666         { .name     = "mkdir",    .errmsg = true, },
667         { .name     = "mkdirat",    .errmsg = true,
668           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
669         { .name     = "mknod",      .errmsg = true, },
670         { .name     = "mknodat",    .errmsg = true,
671           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
672         { .name     = "mlock",      .errmsg = true,
673           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
674         { .name     = "mlockall",   .errmsg = true,
675           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
676         { .name     = "mmap",       .hexret = true,
677           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
678                              [2] = SCA_MMAP_PROT, /* prot */
679                              [3] = SCA_MMAP_FLAGS, /* flags */ }, },
680         { .name     = "mprotect",   .errmsg = true,
681           .arg_scnprintf = { [0] = SCA_HEX, /* start */
682                              [2] = SCA_MMAP_PROT, /* prot */ }, },
683         { .name     = "mq_unlink", .errmsg = true,
684           .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
685         { .name     = "mremap",     .hexret = true,
686           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
687                              [3] = SCA_MREMAP_FLAGS, /* flags */
688                              [4] = SCA_HEX, /* new_addr */ }, },
689         { .name     = "munlock",    .errmsg = true,
690           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
691         { .name     = "munmap",     .errmsg = true,
692           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
693         { .name     = "name_to_handle_at", .errmsg = true,
694           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
695         { .name     = "newfstatat", .errmsg = true,
696           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
697         { .name     = "open",       .errmsg = true,
698           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
699         { .name     = "open_by_handle_at", .errmsg = true,
700           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
701                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
702         { .name     = "openat",     .errmsg = true,
703           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
704                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
705         { .name     = "perf_event_open", .errmsg = true,
706           .arg_scnprintf = { [2] = SCA_INT, /* cpu */
707                              [3] = SCA_FD,  /* group_fd */
708                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
709         { .name     = "pipe2",      .errmsg = true,
710           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
711         { .name     = "poll",       .errmsg = true, .timeout = true, },
712         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
713         { .name     = "pread",      .errmsg = true, .alias = "pread64", },
714         { .name     = "preadv",     .errmsg = true, .alias = "pread", },
715         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
716         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64", },
717         { .name     = "pwritev",    .errmsg = true, },
718         { .name     = "read",       .errmsg = true, },
719         { .name     = "readlink",   .errmsg = true, },
720         { .name     = "readlinkat", .errmsg = true,
721           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
722         { .name     = "readv",      .errmsg = true, },
723         { .name     = "recvfrom",   .errmsg = true,
724           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
725         { .name     = "recvmmsg",   .errmsg = true,
726           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
727         { .name     = "recvmsg",    .errmsg = true,
728           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
729         { .name     = "removexattr", .errmsg = true, },
730         { .name     = "renameat",   .errmsg = true,
731           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
732         { .name     = "rmdir",    .errmsg = true, },
733         { .name     = "rt_sigaction", .errmsg = true,
734           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
735         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
736         { .name     = "rt_sigqueueinfo", .errmsg = true,
737           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
738         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
739           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
740         { .name     = "sched_getattr",        .errmsg = true, },
741         { .name     = "sched_setattr",        .errmsg = true, },
742         { .name     = "sched_setscheduler",   .errmsg = true,
743           .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
744         { .name     = "seccomp", .errmsg = true,
745           .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
746                              [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
747         { .name     = "select",     .errmsg = true, .timeout = true, },
748         { .name     = "sendmmsg",    .errmsg = true,
749           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
750         { .name     = "sendmsg",    .errmsg = true,
751           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
752         { .name     = "sendto",     .errmsg = true,
753           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
754         { .name     = "set_tid_address", .errpid = true, },
755         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
756         { .name     = "setpgid",    .errmsg = true, },
757         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
758         { .name     = "setxattr",   .errmsg = true, },
759         { .name     = "shutdown",   .errmsg = true, },
760         { .name     = "socket",     .errmsg = true,
761           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
762                              [1] = SCA_SK_TYPE, /* type */ },
763           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
764         { .name     = "socketpair", .errmsg = true,
765           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
766                              [1] = SCA_SK_TYPE, /* type */ },
767           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
768         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
769         { .name     = "statfs",     .errmsg = true, },
770         { .name     = "statx",      .errmsg = true,
771           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
772                              [2] = SCA_STATX_FLAGS, /* flags */
773                              [3] = SCA_STATX_MASK, /* mask */ }, },
774         { .name     = "swapoff",    .errmsg = true,
775           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
776         { .name     = "swapon",     .errmsg = true,
777           .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
778         { .name     = "symlinkat",  .errmsg = true,
779           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
780         { .name     = "tgkill",     .errmsg = true,
781           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
782         { .name     = "tkill",      .errmsg = true,
783           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
784         { .name     = "truncate",   .errmsg = true, },
785         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
786         { .name     = "unlinkat",   .errmsg = true,
787           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
788         { .name     = "utime",  .errmsg = true, },
789         { .name     = "utimensat",  .errmsg = true,
790           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
791         { .name     = "utimes",  .errmsg = true, },
792         { .name     = "vmsplice",  .errmsg = true, },
793         { .name     = "wait4",      .errpid = true,
794           .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
795         { .name     = "waitid",     .errpid = true,
796           .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
797         { .name     = "write",      .errmsg = true, },
798         { .name     = "writev",     .errmsg = true, },
799 };
800
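/*
 * syscall_fmt__find() bsearch()es syscall_fmts[], so the table above must be
 * kept sorted alphabetically by ->name.
 */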
801 static int syscall_fmt__cmp(const void *name, const void *fmtp)
802 {
803         const struct syscall_fmt *fmt = fmtp;
804         return strcmp(name, fmt->name);
805 }
806
807 static struct syscall_fmt *syscall_fmt__find(const char *name)
808 {
809         const int nmemb = ARRAY_SIZE(syscall_fmts);
810         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
811 }
812
813 struct syscall {
814         struct event_format *tp_format;
815         int                 nr_args;
816         struct format_field *args;
817         const char          *name;
818         bool                is_exit;
819         struct syscall_fmt  *fmt;
820         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
821         void                **arg_parm;
822 };
823
824 /*
825  * We need this 'calculated' boolean because in some cases we really don't
826  * know the duration of a syscall, for instance, when we start a session and
827  * some threads are already waiting for a syscall to finish, say 'poll', in
828  * which case all we can do is to print "( ? )" for the duration and for the
829  * start timestamp.
830  */
831 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
832 {
833         double duration = (double)t / NSEC_PER_MSEC;
834         size_t printed = fprintf(fp, "(");
835
836         if (!calculated)
837                 printed += fprintf(fp, "     ?   ");
838         else if (duration >= 1.0)
839                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
840         else if (duration >= 0.01)
841                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
842         else
843                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
844         return printed + fprintf(fp, "): ");
845 }
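/*
 * e.g. a 2.5 ms syscall prints as "( 2.500 ms): " in red, sub-0.01 ms ones in
 * the normal color, and "(     ?   ): " when the duration isn't calculated.
 */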
846
847 /**
848  * filename.ptr: The filename char pointer that will be vfs_getname'd
849  * filename.entry_str_pos: Where to insert the string translated from
850  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
851  */
852 struct thread_trace {
853         u64               entry_time;
854         bool              entry_pending;
855         unsigned long     nr_events;
856         unsigned long     pfmaj, pfmin;
857         char              *entry_str;
858         double            runtime_ms;
859         struct {
860                 unsigned long ptr;
861                 short int     entry_str_pos;
862                 bool          pending_open;
863                 unsigned int  namelen;
864                 char          *name;
865         } filename;
866         struct {
867                 int       max;
868                 char      **table;
869         } paths;
870
871         struct intlist *syscall_stats;
872 };
873
874 static struct thread_trace *thread_trace__new(void)
875 {
876         struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
877
878         if (ttrace) {
879                 ttrace->paths.max = -1;
880                 ttrace->syscall_stats = intlist__new(NULL);
881         }
882
883         return ttrace;
884 }
885
886 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
887 {
888         struct thread_trace *ttrace;
889
890         if (thread == NULL)
891                 goto fail;
892
893         if (thread__priv(thread) == NULL)
894                 thread__set_priv(thread, thread_trace__new());
895
896         if (thread__priv(thread) == NULL)
897                 goto fail;
898
899         ttrace = thread__priv(thread);
900         ++ttrace->nr_events;
901
902         return ttrace;
903 fail:
904         color_fprintf(fp, PERF_COLOR_RED,
905                       "WARNING: not enough memory, dropping samples!\n");
906         return NULL;
907 }
908
909 #define TRACE_PFMAJ             (1 << 0)
910 #define TRACE_PFMIN             (1 << 1)
911
912 static const size_t trace__entry_str_size = 2048;
913
914 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
915 {
916         struct thread_trace *ttrace = thread__priv(thread);
917
918         if (fd > ttrace->paths.max) {
919                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
920
921                 if (npath == NULL)
922                         return -1;
923
924                 if (ttrace->paths.max != -1) {
925                         memset(npath + ttrace->paths.max + 1, 0,
926                                (fd - ttrace->paths.max) * sizeof(char *));
927                 } else {
928                         memset(npath, 0, (fd + 1) * sizeof(char *));
929                 }
930
931                 ttrace->paths.table = npath;
932                 ttrace->paths.max   = fd;
933         }
934
935         ttrace->paths.table[fd] = strdup(pathname);
936
937         return ttrace->paths.table[fd] != NULL ? 0 : -1;
938 }
939
940 static int thread__read_fd_path(struct thread *thread, int fd)
941 {
942         char linkname[PATH_MAX], pathname[PATH_MAX];
943         struct stat st;
944         int ret;
945
946         if (thread->pid_ == thread->tid) {
947                 scnprintf(linkname, sizeof(linkname),
948                           "/proc/%d/fd/%d", thread->pid_, fd);
949         } else {
950                 scnprintf(linkname, sizeof(linkname),
951                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
952         }
953
954         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
955                 return -1;
956
957         ret = readlink(linkname, pathname, sizeof(pathname));
958
959         if (ret < 0 || ret > st.st_size)
960                 return -1;
961
962         pathname[ret] = '\0';
963         return trace__set_fd_pathname(thread, fd, pathname);
964 }
965
966 static const char *thread__fd_path(struct thread *thread, int fd,
967                                    struct trace *trace)
968 {
969         struct thread_trace *ttrace = thread__priv(thread);
970
971         if (ttrace == NULL)
972                 return NULL;
973
974         if (fd < 0)
975                 return NULL;
976
977         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
978                 if (!trace->live)
979                         return NULL;
980                 ++trace->stats.proc_getname;
981                 if (thread__read_fd_path(thread, fd))
982                         return NULL;
983         }
984
985         return ttrace->paths.table[fd];
986 }
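/*
 * In live mode an fd is lazily resolved to a path by readlink()ing
 * /proc/<pid>/fd/<fd> (or /proc/<pid>/task/<tid>/fd/<fd> for non-leader
 * threads) and cached in ttrace->paths.table, so that SCA_FD arguments
 * print like "3</etc/passwd>".
 */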
987
988 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
989                                         struct syscall_arg *arg)
990 {
991         int fd = arg->val;
992         size_t printed = scnprintf(bf, size, "%d", fd);
993         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
994
995         if (path)
996                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
997
998         return printed;
999 }
1000
1001 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1002                                               struct syscall_arg *arg)
1003 {
1004         int fd = arg->val;
1005         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1006         struct thread_trace *ttrace = thread__priv(arg->thread);
1007
1008         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1009                 zfree(&ttrace->paths.table[fd]);
1010
1011         return printed;
1012 }
1013
1014 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1015                                      unsigned long ptr)
1016 {
1017         struct thread_trace *ttrace = thread__priv(thread);
1018
1019         ttrace->filename.ptr = ptr;
1020         ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1021 }
1022
1023 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1024                                               struct syscall_arg *arg)
1025 {
1026         unsigned long ptr = arg->val;
1027
1028         if (!arg->trace->vfs_getname)
1029                 return scnprintf(bf, size, "%#x", ptr);
1030
1031         thread__set_filename_pos(arg->thread, bf, ptr);
1032         return 0;
1033 }
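/*
 * When the vfs_getname tracepoint/kprobe is in use we don't print the raw
 * user pointer: we just note where in ttrace->entry_str the name should go
 * and splice in the string captured by vfs_getname when that event arrives.
 */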
1034
1035 static bool trace__filter_duration(struct trace *trace, double t)
1036 {
1037         return t < (trace->duration_filter * NSEC_PER_MSEC);
1038 }
1039
1040 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1043
1044         return fprintf(fp, "%10.3f ", ts);
1045 }
1046
1047 /*
1048  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1049  * using ttrace->entry_time for a thread that receives a sys_exit without
1050  * first having received a sys_enter ("poll" issued before tracing session
1051  * starts, lost sys_enter exit due to ring buffer overflow).
1052  */
1053 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1054 {
1055         if (tstamp > 0)
1056                 return __trace__fprintf_tstamp(trace, tstamp, fp);
1057
1058         return fprintf(fp, "         ? ");
1059 }
1060
1061 static bool done = false;
1062 static bool interrupted = false;
1063
1064 static void sig_handler(int sig)
1065 {
1066         done = true;
1067         interrupted = sig == SIGINT;
1068 }
1069
1070 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1071                                         u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1072 {
1073         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1074         printed += fprintf_duration(duration, duration_calculated, fp);
1075
1076         if (trace->multiple_threads) {
1077                 if (trace->show_comm)
1078                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1079                 printed += fprintf(fp, "%d ", thread->tid);
1080         }
1081
1082         return printed;
1083 }
1084
1085 static int trace__process_event(struct trace *trace, struct machine *machine,
1086                                 union perf_event *event, struct perf_sample *sample)
1087 {
1088         int ret = 0;
1089
1090         switch (event->header.type) {
1091         case PERF_RECORD_LOST:
1092                 color_fprintf(trace->output, PERF_COLOR_RED,
1093                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1094                 ret = machine__process_lost_event(machine, event, sample);
1095                 break;
1096         default:
1097                 ret = machine__process_event(machine, event, sample);
1098                 break;
1099         }
1100
1101         return ret;
1102 }
1103
1104 static int trace__tool_process(struct perf_tool *tool,
1105                                union perf_event *event,
1106                                struct perf_sample *sample,
1107                                struct machine *machine)
1108 {
1109         struct trace *trace = container_of(tool, struct trace, tool);
1110         return trace__process_event(trace, machine, event, sample);
1111 }
1112
1113 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1114 {
1115         struct machine *machine = vmachine;
1116
1117         if (machine->kptr_restrict_warned)
1118                 return NULL;
1119
1120         if (symbol_conf.kptr_restrict) {
1121                 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1122                            "Check /proc/sys/kernel/kptr_restrict.\n\n"
1123                            "Kernel samples will not be resolved.\n");
1124                 machine->kptr_restrict_warned = true;
1125                 return NULL;
1126         }
1127
1128         return machine__resolve_kernel_addr(vmachine, addrp, modp);
1129 }
1130
1131 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1132 {
1133         int err = symbol__init(NULL);
1134
1135         if (err)
1136                 return err;
1137
1138         trace->host = machine__new_host();
1139         if (trace->host == NULL)
1140                 return -ENOMEM;
1141
1142         if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1143                 return -errno;
1144
1145         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1146                                             evlist->threads, trace__tool_process, false,
1147                                             trace->opts.proc_map_timeout);
1148         if (err)
1149                 symbol__exit();
1150
1151         return err;
1152 }
1153
1154 static int syscall__set_arg_fmts(struct syscall *sc)
1155 {
1156         struct format_field *field;
1157         int idx = 0, len;
1158
1159         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1160         if (sc->arg_scnprintf == NULL)
1161                 return -1;
1162
1163         if (sc->fmt)
1164                 sc->arg_parm = sc->fmt->arg_parm;
1165
1166         for (field = sc->args; field; field = field->next) {
1167                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1168                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1169                 else if (strcmp(field->type, "const char *") == 0 &&
1170                          (strcmp(field->name, "filename") == 0 ||
1171                           strcmp(field->name, "path") == 0 ||
1172                           strcmp(field->name, "pathname") == 0))
1173                         sc->arg_scnprintf[idx] = SCA_FILENAME;
1174                 else if (field->flags & FIELD_IS_POINTER)
1175                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1176                 else if (strcmp(field->type, "pid_t") == 0)
1177                         sc->arg_scnprintf[idx] = SCA_PID;
1178                 else if (strcmp(field->type, "umode_t") == 0)
1179                         sc->arg_scnprintf[idx] = SCA_MODE_T;
1180                 else if ((strcmp(field->type, "int") == 0 ||
1181                           strcmp(field->type, "unsigned int") == 0 ||
1182                           strcmp(field->type, "long") == 0) &&
1183                          (len = strlen(field->name)) >= 2 &&
1184                          strcmp(field->name + len - 2, "fd") == 0) {
1185                         /*
1186                          * /sys/kernel/tracing/events/syscalls/sys_enter*
1187                          * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1188                          * 65 int
1189                          * 23 unsigned int
1190                          * 7 unsigned long
1191                          */
1192                         sc->arg_scnprintf[idx] = SCA_FD;
1193                 }
1194                 ++idx;
1195         }
1196
1197         return 0;
1198 }
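/*
 * e.g. a 'const char *' field named 'filename' gets SCA_FILENAME and an int
 * field whose name ends in "fd" gets SCA_FD; fields left without a formatter
 * end up printed as plain decimal longs by syscall__scnprintf_args().
 */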
1199
1200 static int trace__read_syscall_info(struct trace *trace, int id)
1201 {
1202         char tp_name[128];
1203         struct syscall *sc;
1204         const char *name = syscalltbl__name(trace->sctbl, id);
1205
1206         if (name == NULL)
1207                 return -1;
1208
1209         if (id > trace->syscalls.max) {
1210                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1211
1212                 if (nsyscalls == NULL)
1213                         return -1;
1214
1215                 if (trace->syscalls.max != -1) {
1216                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1217                                (id - trace->syscalls.max) * sizeof(*sc));
1218                 } else {
1219                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1220                 }
1221
1222                 trace->syscalls.table = nsyscalls;
1223                 trace->syscalls.max   = id;
1224         }
1225
1226         sc = trace->syscalls.table + id;
1227         sc->name = name;
1228
1229         sc->fmt  = syscall_fmt__find(sc->name);
1230
1231         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1232         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1233
1234         if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1235                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1236                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1237         }
1238
1239         if (IS_ERR(sc->tp_format))
1240                 return -1;
1241
1242         sc->args = sc->tp_format->format.fields;
1243         sc->nr_args = sc->tp_format->format.nr_fields;
1244         /*
1245          * The first field, '__syscall_nr' ('nr' on older kernels), holds the
1246          * syscall number, which we already have, so check for it and drop it
1247          * from the argument list.
1248          */
1249         if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1250                 sc->args = sc->args->next;
1251                 --sc->nr_args;
1252         }
1253
1254         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1255
1256         return syscall__set_arg_fmts(sc);
1257 }
1258
1259 static int trace__validate_ev_qualifier(struct trace *trace)
1260 {
1261         int err = 0, i;
1262         struct str_node *pos;
1263
1264         trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1265         trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1266                                                  sizeof(trace->ev_qualifier_ids.entries[0]));
1267
1268         if (trace->ev_qualifier_ids.entries == NULL) {
1269                 fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1270                        trace->output);
1271                 err = -EINVAL;
1272                 goto out;
1273         }
1274
1275         i = 0;
1276
1277         strlist__for_each_entry(pos, trace->ev_qualifier) {
1278                 const char *sc = pos->s;
1279                 int id = syscalltbl__id(trace->sctbl, sc);
1280
1281                 if (id < 0) {
1282                         if (err == 0) {
1283                                 fputs("Error:\tInvalid syscall ", trace->output);
1284                                 err = -EINVAL;
1285                         } else {
1286                                 fputs(", ", trace->output);
1287                         }
1288
1289                         fputs(sc, trace->output);
1290                 }
1291
1292                 trace->ev_qualifier_ids.entries[i++] = id;
1293         }
1294
1295         if (err < 0) {
1296                 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1297                       "\nHint:\tand: 'man syscalls'\n", trace->output);
1298                 zfree(&trace->ev_qualifier_ids.entries);
1299                 trace->ev_qualifier_ids.nr = 0;
1300         }
1301 out:
1302         return err;
1303 }
1304
1305 /*
1306  * args is to be interpreted as a series of longs but we need to handle
1307  * 8-byte unaligned accesses. args points to raw_data within the event
1308  * and raw_data is guaranteed to be 8-byte unaligned because it is
1309  * preceded by raw_size which is a u32. So we need to copy args to a temp
1310  * variable to read it. Most notably this avoids extended load instructions
1311  * on unaligned addresses.
1312  */
1313
1314 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1315                                       unsigned char *args, struct trace *trace,
1316                                       struct thread *thread)
1317 {
1318         size_t printed = 0;
1319         unsigned char *p;
1320         unsigned long val;
1321
1322         if (sc->args != NULL) {
1323                 struct format_field *field;
1324                 u8 bit = 1;
1325                 struct syscall_arg arg = {
1326                         .idx    = 0,
1327                         .mask   = 0,
1328                         .trace  = trace,
1329                         .thread = thread,
1330                 };
1331
1332                 for (field = sc->args; field;
1333                      field = field->next, ++arg.idx, bit <<= 1) {
1334                         if (arg.mask & bit)
1335                                 continue;
1336
1337                         /* special care for unaligned accesses */
1338                         p = args + sizeof(unsigned long) * arg.idx;
1339                         memcpy(&val, p, sizeof(val));
1340
1341                         /*
1342                          * Suppress this argument if its value is zero and
1343                          * we don't have a string associated with it in a
1344                          * strarray.
1345                          */
1346                         if (val == 0 &&
1347                             !(sc->arg_scnprintf &&
1348                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1349                               sc->arg_parm[arg.idx]))
1350                                 continue;
1351
1352                         printed += scnprintf(bf + printed, size - printed,
1353                                              "%s%s: ", printed ? ", " : "", field->name);
1354                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1355                                 arg.val = val;
1356                                 if (sc->arg_parm)
1357                                         arg.parm = sc->arg_parm[arg.idx];
1358                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1359                                                                       size - printed, &arg);
1360                         } else {
1361                                 printed += scnprintf(bf + printed, size - printed,
1362                                                      "%ld", val);
1363                         }
1364                 }
1365         } else if (IS_ERR(sc->tp_format)) {
1366                 /*
1367                  * If we managed to read the tracepoint /format file, then we
1368                  * may end up not having any args, like with gettid(), so only
1369                  * print the raw args when we didn't manage to read it.
1370                  */
1371                 int i = 0;
1372
1373                 while (i < 6) {
1374                         /* special care for unaligned accesses */
1375                         p = args + sizeof(unsigned long) * i;
1376                         memcpy(&val, p, sizeof(val));
1377                         printed += scnprintf(bf + printed, size - printed,
1378                                              "%sarg%d: %ld",
1379                                              printed ? ", " : "", i, val);
1380                         ++i;
1381                 }
1382         }
1383
1384         return printed;
1385 }
1386
1387 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1388                                   union perf_event *event,
1389                                   struct perf_sample *sample);
1390
1391 static struct syscall *trace__syscall_info(struct trace *trace,
1392                                            struct perf_evsel *evsel, int id)
1393 {
1394
1395         if (id < 0) {
1396
1397                 /*
1398                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1399                  * before that, leaving at a higher verbosity level till that is
1400                  * explained. Reproduced with plain ftrace with:
1401                  *
1402                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1403                  * grep "NR -1 " /t/trace_pipe
1404                  *
1405                  * After generating some load on the machine.
1406                  */
1407                 if (verbose > 1) {
1408                         static u64 n;
1409                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1410                                 id, perf_evsel__name(evsel), ++n);
1411                 }
1412                 return NULL;
1413         }
1414
1415         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1416             trace__read_syscall_info(trace, id))
1417                 goto out_cant_read;
1418
1419         if (id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)
1420                 goto out_cant_read;
1421
1422         return &trace->syscalls.table[id];
1423
1424 out_cant_read:
1425         if (verbose > 0) {
1426                 fprintf(trace->output, "Problems reading syscall %d", id);
1427                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1428                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1429                 fputs(" information\n", trace->output);
1430         }
1431         return NULL;
1432 }
1433
1434 static void thread__update_stats(struct thread_trace *ttrace,
1435                                  int id, struct perf_sample *sample)
1436 {
1437         struct int_node *inode;
1438         struct stats *stats;
1439         u64 duration = 0;
1440
1441         inode = intlist__findnew(ttrace->syscall_stats, id);
1442         if (inode == NULL)
1443                 return;
1444
1445         stats = inode->priv;
1446         if (stats == NULL) {
1447                 stats = malloc(sizeof(struct stats));
1448                 if (stats == NULL)
1449                         return;
1450                 init_stats(stats);
1451                 inode->priv = stats;
1452         }
1453
1454         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1455                 duration = sample->time - ttrace->entry_time;
1456
1457         update_stats(stats, duration);
1458 }
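
/*
 * The per syscall 'struct stats' filled above is read back by
 * thread__dump_stats() further down via avg_stats()/stddev_stats(). A rough
 * usage sketch of those util/stat.h helpers, with made up values:
 *
 *	struct stats st;
 *
 *	init_stats(&st);
 *	update_stats(&st, duration);    /- once per sys_exit, duration in ns -/
 *	printf("%f +- %f\n", avg_stats(&st), stddev_stats(&st));
 */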
1459
1460 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1461 {
1462         struct thread_trace *ttrace;
1463         u64 duration;
1464         size_t printed;
1465
1466         if (trace->current == NULL)
1467                 return 0;
1468
1469         ttrace = thread__priv(trace->current);
1470
1471         if (!ttrace->entry_pending)
1472                 return 0;
1473
1474         duration = sample->time - ttrace->entry_time;
1475
1476         printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1477         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1478         ttrace->entry_pending = false;
1479
1480         return printed;
1481 }
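
/*
 * I.e. when some other event shows up while a syscall is still pending, its
 * entry is flushed early with a trailing "...", roughly (spacing and values
 * are illustrative, the head comes from trace__fprintf_entry_head()):
 *
 *	0.042 ( 0.004 ms): cat/1234 read(fd: 3, buf: 0x7f2a..., count: 4096) ...
 *
 * and trace__sys_exit() later emits the " ... [continued]" form with the
 * return value for the same thread.
 */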
1482
1483 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1484                             union perf_event *event __maybe_unused,
1485                             struct perf_sample *sample)
1486 {
1487         char *msg;
1488         void *args;
1489         size_t printed = 0;
1490         struct thread *thread;
1491         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1492         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1493         struct thread_trace *ttrace;
1494
1495         if (sc == NULL)
1496                 return -1;
1497
1498         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1499         ttrace = thread__trace(thread, trace->output);
1500         if (ttrace == NULL)
1501                 goto out_put;
1502
1503         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1504
1505         if (ttrace->entry_str == NULL) {
1506                 ttrace->entry_str = malloc(trace__entry_str_size);
1507                 if (!ttrace->entry_str)
1508                         goto out_put;
1509         }
1510
1511         if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1512                 trace__printf_interrupted_entry(trace, sample);
1513
1514         ttrace->entry_time = sample->time;
1515         msg = ttrace->entry_str;
1516         printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1517
1518         printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1519                                            args, trace, thread);
1520
1521         if (sc->is_exit) {
1522                 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1523                         trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1524                         fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1525                 }
1526         } else {
1527                 ttrace->entry_pending = true;
1528                 /* See trace__vfs_getname & trace__sys_exit */
1529                 ttrace->filename.pending_open = false;
1530         }
1531
1532         if (trace->current != thread) {
1533                 thread__put(trace->current);
1534                 trace->current = thread__get(thread);
1535         }
1536         err = 0;
1537 out_put:
1538         thread__put(thread);
1539         return err;
1540 }
1541
1542 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1543                                     struct perf_sample *sample,
1544                                     struct callchain_cursor *cursor)
1545 {
1546         struct addr_location al;
1547
1548         if (machine__resolve(trace->host, &al, sample) < 0 ||
1549             thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1550                 return -1;
1551
1552         return 0;
1553 }
1554
1555 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1556 {
1557         /* TODO: user-configurable print_opts */
1558         const unsigned int print_opts = EVSEL__PRINT_SYM |
1559                                         EVSEL__PRINT_DSO |
1560                                         EVSEL__PRINT_UNKNOWN_AS_ADDR;
1561
1562         return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1563 }
1564
1565 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1566                            union perf_event *event __maybe_unused,
1567                            struct perf_sample *sample)
1568 {
1569         long ret;
1570         u64 duration = 0;
1571         bool duration_calculated = false;
1572         struct thread *thread;
1573         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1574         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1575         struct thread_trace *ttrace;
1576
1577         if (sc == NULL)
1578                 return -1;
1579
1580         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1581         ttrace = thread__trace(thread, trace->output);
1582         if (ttrace == NULL)
1583                 goto out_put;
1584
1585         if (trace->summary)
1586                 thread__update_stats(ttrace, id, sample);
1587
1588         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1589
1590         if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1591                 trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1592                 ttrace->filename.pending_open = false;
1593                 ++trace->stats.vfs_getname;
1594         }
1595
1596         if (ttrace->entry_time) {
1597                 duration = sample->time - ttrace->entry_time;
1598                 if (trace__filter_duration(trace, duration))
1599                         goto out;
1600                 duration_calculated = true;
1601         } else if (trace->duration_filter)
1602                 goto out;
1603
1604         if (sample->callchain) {
1605                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1606                 if (callchain_ret == 0) {
1607                         if (callchain_cursor.nr < trace->min_stack)
1608                                 goto out;
1609                         callchain_ret = 1;
1610                 }
1611         }
1612
1613         if (trace->summary_only)
1614                 goto out;
1615
1616         trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1617
1618         if (ttrace->entry_pending) {
1619                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1620         } else {
1621                 fprintf(trace->output, " ... [");
1622                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1623                 fprintf(trace->output, "]: %s()", sc->name);
1624         }
1625
1626         if (sc->fmt == NULL) {
1627 signed_print:
1628                 fprintf(trace->output, ") = %ld", ret);
1629         } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1630                 char bf[STRERR_BUFSIZE];
1631                 const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1632                            *e = audit_errno_to_name(-ret);
1633
1634                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1635         } else if (ret == 0 && sc->fmt->timeout)
1636                 fprintf(trace->output, ") = 0 Timeout");
1637         else if (sc->fmt->hexret)
1638                 fprintf(trace->output, ") = %#lx", ret);
1639         else if (sc->fmt->errpid) {
1640                 struct thread *child = machine__find_thread(trace->host, ret, ret);
1641
1642                 if (child != NULL) {
1643                         fprintf(trace->output, ") = %ld", ret);
1644                         if (child->comm_set)
1645                                 fprintf(trace->output, " (%s)", thread__comm_str(child));
1646                         thread__put(child);
1647                 }
1648         } else
1649                 goto signed_print;
1650
1651         fputc('\n', trace->output);
1652
1653         if (callchain_ret > 0)
1654                 trace__fprintf_callchain(trace, sample);
1655         else if (callchain_ret < 0)
1656                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1657 out:
1658         ttrace->entry_pending = false;
1659         err = 0;
1660 out_put:
1661         thread__put(thread);
1662         return err;
1663 }
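
/*
 * Putting the branches above together, the tail of a line comes out roughly
 * as one of (values illustrative):
 *
 *	) = 3                                       default signed decimal
 *	) = -1 ENOENT No such file or directory     errmsg: audit name + strerror
 *	) = 0 Timeout                               fmt->timeout, e.g. poll
 *	) = 0x7f1b2c3d4000                          fmt->hexret, e.g. mmap
 *	) = 1234 (cat)                              fmt->errpid, child pid/comm
 */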
1664
1665 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1666                               union perf_event *event __maybe_unused,
1667                               struct perf_sample *sample)
1668 {
1669         struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1670         struct thread_trace *ttrace;
1671         size_t filename_len, entry_str_len, to_move;
1672         ssize_t remaining_space;
1673         char *pos;
1674         const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1675
1676         if (!thread)
1677                 goto out;
1678
1679         ttrace = thread__priv(thread);
1680         if (!ttrace)
1681                 goto out_put;
1682
1683         filename_len = strlen(filename);
1684         if (filename_len == 0)
1685                 goto out_put;
1686
1687         if (ttrace->filename.namelen < filename_len) {
1688                 char *f = realloc(ttrace->filename.name, filename_len + 1);
1689
1690                 if (f == NULL)
1691                         goto out_put;
1692
1693                 ttrace->filename.namelen = filename_len;
1694                 ttrace->filename.name = f;
1695         }
1696
1697         strcpy(ttrace->filename.name, filename);
1698         ttrace->filename.pending_open = true;
1699
1700         if (!ttrace->filename.ptr)
1701                 goto out_put;
1702
1703         entry_str_len = strlen(ttrace->entry_str);
1704         remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1705         if (remaining_space <= 0)
1706                 goto out_put;
1707
1708         if (filename_len > (size_t)remaining_space) {
1709                 filename += filename_len - remaining_space;
1710                 filename_len = remaining_space;
1711         }
1712
1713         to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1714         pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1715         memmove(pos + filename_len, pos, to_move);
1716         memcpy(pos, filename, filename_len);
1717
1718         ttrace->filename.ptr = 0;
1719         ttrace->filename.entry_str_pos = 0;
1720 out_put:
1721         thread__put(thread);
1722 out:
1723         return 0;
1724 }
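
/*
 * Net effect, assuming the probe:vfs_getname probe is in place: the resolved
 * pathname lands in ttrace->filename.name and, if a spot was reserved in
 * entry_str (filename.ptr/entry_str_pos, set while the filename argument was
 * being formatted), it is spliced into the pending line, so the output shows
 * e.g. "filename: /etc/ld.so.cache" instead of a raw user space pointer.
 */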
1725
1726 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1727                                      union perf_event *event __maybe_unused,
1728                                      struct perf_sample *sample)
1729 {
1730         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1731         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1732         struct thread *thread = machine__findnew_thread(trace->host,
1733                                                         sample->pid,
1734                                                         sample->tid);
1735         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1736
1737         if (ttrace == NULL)
1738                 goto out_dump;
1739
1740         ttrace->runtime_ms += runtime_ms;
1741         trace->runtime_ms += runtime_ms;
1742 out_put:
1743         thread__put(thread);
1744         return 0;
1745
1746 out_dump:
1747         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1748                evsel->name,
1749                perf_evsel__strval(evsel, sample, "comm"),
1750                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1751                runtime,
1752                perf_evsel__intval(evsel, sample, "vruntime"));
1753         goto out_put;
1754 }
1755
1756 static void bpf_output__printer(enum binary_printer_ops op,
1757                                 unsigned int val, void *extra)
1758 {
1759         FILE *output = extra;
1760         unsigned char ch = (unsigned char)val;
1761
1762         switch (op) {
1763         case BINARY_PRINT_CHAR_DATA:
1764                 fprintf(output, "%c", isprint(ch) ? ch : '.');
1765                 break;
1766         case BINARY_PRINT_DATA_BEGIN:
1767         case BINARY_PRINT_LINE_BEGIN:
1768         case BINARY_PRINT_ADDR:
1769         case BINARY_PRINT_NUM_DATA:
1770         case BINARY_PRINT_NUM_PAD:
1771         case BINARY_PRINT_SEP:
1772         case BINARY_PRINT_CHAR_PAD:
1773         case BINARY_PRINT_LINE_END:
1774         case BINARY_PRINT_DATA_END:
1775         default:
1776                 break;
1777         }
1778 }
1779
1780 static void bpf_output__fprintf(struct trace *trace,
1781                                 struct perf_sample *sample)
1782 {
1783         print_binary(sample->raw_data, sample->raw_size, 8,
1784                      bpf_output__printer, trace->output);
1785 }
1786
1787 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1788                                 union perf_event *event __maybe_unused,
1789                                 struct perf_sample *sample)
1790 {
1791         int callchain_ret = 0;
1792
1793         if (sample->callchain) {
1794                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1795                 if (callchain_ret == 0) {
1796                         if (callchain_cursor.nr < trace->min_stack)
1797                                 goto out;
1798                         callchain_ret = 1;
1799                 }
1800         }
1801
1802         trace__printf_interrupted_entry(trace, sample);
1803         trace__fprintf_tstamp(trace, sample->time, trace->output);
1804
1805         if (trace->trace_syscalls)
1806                 fprintf(trace->output, "(         ): ");
1807
1808         fprintf(trace->output, "%s:", evsel->name);
1809
1810         if (perf_evsel__is_bpf_output(evsel)) {
1811                 bpf_output__fprintf(trace, sample);
1812         } else if (evsel->tp_format) {
1813                 event_format__fprintf(evsel->tp_format, sample->cpu,
1814                                       sample->raw_data, sample->raw_size,
1815                                       trace->output);
1816         }
1817
1818         fprintf(trace->output, ")\n");
1819
1820         if (callchain_ret > 0)
1821                 trace__fprintf_callchain(trace, sample);
1822         else if (callchain_ret < 0)
1823                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1824 out:
1825         return 0;
1826 }
1827
1828 static void print_location(FILE *f, struct perf_sample *sample,
1829                            struct addr_location *al,
1830                            bool print_dso, bool print_sym)
1831 {
1832
1833         if ((verbose > 0 || print_dso) && al->map)
1834                 fprintf(f, "%s@", al->map->dso->long_name);
1835
1836         if ((verbose > 0 || print_sym) && al->sym)
1837                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1838                         al->addr - al->sym->start);
1839         else if (al->map)
1840                 fprintf(f, "0x%" PRIx64, al->addr);
1841         else
1842                 fprintf(f, "0x%" PRIx64, sample->addr);
1843 }
1844
1845 static int trace__pgfault(struct trace *trace,
1846                           struct perf_evsel *evsel,
1847                           union perf_event *event __maybe_unused,
1848                           struct perf_sample *sample)
1849 {
1850         struct thread *thread;
1851         struct addr_location al;
1852         char map_type = 'd';
1853         struct thread_trace *ttrace;
1854         int err = -1;
1855         int callchain_ret = 0;
1856
1857         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1858
1859         if (sample->callchain) {
1860                 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1861                 if (callchain_ret == 0) {
1862                         if (callchain_cursor.nr < trace->min_stack)
1863                                 goto out_put;
1864                         callchain_ret = 1;
1865                 }
1866         }
1867
1868         ttrace = thread__trace(thread, trace->output);
1869         if (ttrace == NULL)
1870                 goto out_put;
1871
1872         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1873                 ttrace->pfmaj++;
1874         else
1875                 ttrace->pfmin++;
1876
1877         if (trace->summary_only)
1878                 goto out;
1879
1880         thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1881                               sample->ip, &al);
1882
1883         trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
1884
1885         fprintf(trace->output, "%sfault [",
1886                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1887                 "maj" : "min");
1888
1889         print_location(trace->output, sample, &al, false, true);
1890
1891         fprintf(trace->output, "] => ");
1892
1893         thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1894                                    sample->addr, &al);
1895
1896         if (!al.map) {
1897                 thread__find_addr_location(thread, sample->cpumode,
1898                                            MAP__FUNCTION, sample->addr, &al);
1899
1900                 if (al.map)
1901                         map_type = 'x';
1902                 else
1903                         map_type = '?';
1904         }
1905
1906         print_location(trace->output, sample, &al, true, false);
1907
1908         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1909
1910         if (callchain_ret > 0)
1911                 trace__fprintf_callchain(trace, sample);
1912         else if (callchain_ret < 0)
1913                 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1914 out:
1915         err = 0;
1916 out_put:
1917         thread__put(thread);
1918         return err;
1919 }
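
/*
 * A rough example of the resulting line, with made up symbol/dso names and
 * addresses:
 *
 *	0.000 ( 0.000 ms): cat/1234 majfault [__GI___libc_read+0x10] => /usr/lib64/libc-2.17.so@0x7f1b2c3d4000 (d.)
 *
 * where the trailing (d.), (x.) or (?.) encodes the map type chosen above and
 * the resolution level of the faulting data address.
 */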
1920
1921 static void trace__set_base_time(struct trace *trace,
1922                                  struct perf_evsel *evsel,
1923                                  struct perf_sample *sample)
1924 {
1925         /*
1926          * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1927          * and don't use sample->time unconditionally: we may end up having
1928          * some other event in the future without PERF_SAMPLE_TIME for a good
1929          * reason, i.e. we may not be interested in its timestamps, just in
1930          * the fact that it took place, picking some piece of information when it
1931          * appears in our event stream (vfs_getname comes to mind).
1932          */
1933         if (trace->base_time == 0 && !trace->full_time &&
1934             (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1935                 trace->base_time = sample->time;
1936 }
1937
1938 static int trace__process_sample(struct perf_tool *tool,
1939                                  union perf_event *event,
1940                                  struct perf_sample *sample,
1941                                  struct perf_evsel *evsel,
1942                                  struct machine *machine __maybe_unused)
1943 {
1944         struct trace *trace = container_of(tool, struct trace, tool);
1945         struct thread *thread;
1946         int err = 0;
1947
1948         tracepoint_handler handler = evsel->handler;
1949
1950         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1951         if (thread && thread__is_filtered(thread))
1952                 goto out;
1953
1954         trace__set_base_time(trace, evsel, sample);
1955
1956         if (handler) {
1957                 ++trace->nr_events;
1958                 handler(trace, evsel, event, sample);
1959         }
1960 out:
1961         thread__put(thread);
1962         return err;
1963 }
1964
1965 static int trace__record(struct trace *trace, int argc, const char **argv)
1966 {
1967         unsigned int rec_argc, i, j;
1968         const char **rec_argv;
1969         const char * const record_args[] = {
1970                 "record",
1971                 "-R",
1972                 "-m", "1024",
1973                 "-c", "1",
1974         };
1975
1976         const char * const sc_args[] = { "-e", };
1977         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1978         const char * const majpf_args[] = { "-e", "major-faults" };
1979         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1980         const char * const minpf_args[] = { "-e", "minor-faults" };
1981         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1982
1983         /* +1 is for the event string below */
1984         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1985                 majpf_args_nr + minpf_args_nr + argc;
1986         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1987
1988         if (rec_argv == NULL)
1989                 return -ENOMEM;
1990
1991         j = 0;
1992         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1993                 rec_argv[j++] = record_args[i];
1994
1995         if (trace->trace_syscalls) {
1996                 for (i = 0; i < sc_args_nr; i++)
1997                         rec_argv[j++] = sc_args[i];
1998
1999                 /* event string may be different for older kernels - e.g., RHEL6 */
2000                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2001                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2002                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2003                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2004                 else {
2005                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2006                         return -1;
2007                 }
2008         }
2009
2010         if (trace->trace_pgfaults & TRACE_PFMAJ)
2011                 for (i = 0; i < majpf_args_nr; i++)
2012                         rec_argv[j++] = majpf_args[i];
2013
2014         if (trace->trace_pgfaults & TRACE_PFMIN)
2015                 for (i = 0; i < minpf_args_nr; i++)
2016                         rec_argv[j++] = minpf_args[i];
2017
2018         for (i = 0; i < (unsigned int)argc; i++)
2019                 rec_argv[j++] = argv[i];
2020
2021         return cmd_record(j, rec_argv);
2022 }
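
/*
 * With syscalls and both page fault classes enabled this builds, in essence:
 *
 *	perf record -R -m 1024 -c 1 \
 *		-e raw_syscalls:sys_enter,raw_syscalls:sys_exit \
 *		-e major-faults -e minor-faults <workload and its args>
 *
 * falling back to the syscalls:sys_{enter,exit} spelling on older kernels
 * (e.g. RHEL6) where the raw_syscalls tracepoints don't exist.
 */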
2023
2024 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2025
2026 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2027 {
2028         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2029
2030         if (IS_ERR(evsel))
2031                 return false;
2032
2033         if (perf_evsel__field(evsel, "pathname") == NULL) {
2034                 perf_evsel__delete(evsel);
2035                 return false;
2036         }
2037
2038         evsel->handler = trace__vfs_getname;
2039         perf_evlist__add(evlist, evsel);
2040         return true;
2041 }
2042
2043 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2044 {
2045         struct perf_evsel *evsel;
2046         struct perf_event_attr attr = {
2047                 .type = PERF_TYPE_SOFTWARE,
2048                 .mmap_data = 1,
2049         };
2050
2051         attr.config = config;
2052         attr.sample_period = 1;
2053
2054         event_attr_init(&attr);
2055
2056         evsel = perf_evsel__new(&attr);
2057         if (evsel)
2058                 evsel->handler = trace__pgfault;
2059
2060         return evsel;
2061 }
2062
2063 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2064 {
2065         const u32 type = event->header.type;
2066         struct perf_evsel *evsel;
2067
2068         if (type != PERF_RECORD_SAMPLE) {
2069                 trace__process_event(trace, trace->host, event, sample);
2070                 return;
2071         }
2072
2073         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2074         if (evsel == NULL) {
2075                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2076                 return;
2077         }
2078
2079         trace__set_base_time(trace, evsel, sample);
2080
2081         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2082             sample->raw_data == NULL) {
2083                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2084                        perf_evsel__name(evsel), sample->tid,
2085                        sample->cpu, sample->raw_size);
2086         } else {
2087                 tracepoint_handler handler = evsel->handler;
2088                 handler(trace, evsel, event, sample);
2089         }
2090 }
2091
2092 static int trace__add_syscall_newtp(struct trace *trace)
2093 {
2094         int ret = -1;
2095         struct perf_evlist *evlist = trace->evlist;
2096         struct perf_evsel *sys_enter, *sys_exit;
2097
2098         sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2099         if (sys_enter == NULL)
2100                 goto out;
2101
2102         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2103                 goto out_delete_sys_enter;
2104
2105         sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2106         if (sys_exit == NULL)
2107                 goto out_delete_sys_enter;
2108
2109         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2110                 goto out_delete_sys_exit;
2111
2112         perf_evlist__add(evlist, sys_enter);
2113         perf_evlist__add(evlist, sys_exit);
2114
2115         if (callchain_param.enabled && !trace->kernel_syscallchains) {
2116                 /*
2117                  * We're interested only in the user space callchain
2118                  * leading to the syscall; allow overriding that for
2119                  * debugging reasons using --kernel_syscall_callchains
2120                  */
2121                 sys_exit->attr.exclude_callchain_kernel = 1;
2122         }
2123
2124         trace->syscalls.events.sys_enter = sys_enter;
2125         trace->syscalls.events.sys_exit  = sys_exit;
2126
2127         ret = 0;
2128 out:
2129         return ret;
2130
2131 out_delete_sys_exit:
2132         perf_evsel__delete_priv(sys_exit);
2133 out_delete_sys_enter:
2134         perf_evsel__delete_priv(sys_enter);
2135         goto out;
2136 }
2137
2138 static int trace__set_ev_qualifier_filter(struct trace *trace)
2139 {
2140         int err = -1;
2141         struct perf_evsel *sys_exit;
2142         char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2143                                                 trace->ev_qualifier_ids.nr,
2144                                                 trace->ev_qualifier_ids.entries);
2145
2146         if (filter == NULL)
2147                 goto out_enomem;
2148
2149         if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2150                                           filter)) {
2151                 sys_exit = trace->syscalls.events.sys_exit;
2152                 err = perf_evsel__append_tp_filter(sys_exit, filter);
2153         }
2154
2155         free(filter);
2156 out:
2157         return err;
2158 out_enomem:
2159         errno = ENOMEM;
2160         goto out;
2161 }
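
/*
 * For a qualifier like "-e open,close" on x86_64 the appended tracepoint
 * filter ends up looking something like:
 *
 *	id == 2 || id == 3
 *
 * (or the negated "id != ... && ..." form when the qualifier starts with
 * '!'), with the actual numbers coming from the per-arch syscall table.
 */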
2162
2163 static int trace__run(struct trace *trace, int argc, const char **argv)
2164 {
2165         struct perf_evlist *evlist = trace->evlist;
2166         struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2167         int err = -1, i;
2168         unsigned long before;
2169         const bool forks = argc > 0;
2170         bool draining = false;
2171
2172         trace->live = true;
2173
2174         if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2175                 goto out_error_raw_syscalls;
2176
2177         if (trace->trace_syscalls)
2178                 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2179
2180         if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2181                 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2182                 if (pgfault_maj == NULL)
2183                         goto out_error_mem;
2184                 perf_evlist__add(evlist, pgfault_maj);
2185         }
2186
2187         if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2188                 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2189                 if (pgfault_min == NULL)
2190                         goto out_error_mem;
2191                 perf_evlist__add(evlist, pgfault_min);
2192         }
2193
2194         if (trace->sched &&
2195             perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2196                                    trace__sched_stat_runtime))
2197                 goto out_error_sched_stat_runtime;
2198
2199         err = perf_evlist__create_maps(evlist, &trace->opts.target);
2200         if (err < 0) {
2201                 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2202                 goto out_delete_evlist;
2203         }
2204
2205         err = trace__symbols_init(trace, evlist);
2206         if (err < 0) {
2207                 fprintf(trace->output, "Problems initializing symbol libraries!\n");
2208                 goto out_delete_evlist;
2209         }
2210
2211         perf_evlist__config(evlist, &trace->opts, NULL);
2212
2213         if (callchain_param.enabled) {
2214                 bool use_identifier = false;
2215
2216                 if (trace->syscalls.events.sys_exit) {
2217                         perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
2218                                                      &trace->opts, &callchain_param);
2219                         use_identifier = true;
2220                 }
2221
2222                 if (pgfault_maj) {
2223                         perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2224                         use_identifier = true;
2225                 }
2226
2227                 if (pgfault_min) {
2228                         perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2229                         use_identifier = true;
2230                 }
2231
2232                 if (use_identifier) {
2233                        /*
2234                         * Now we have evsels with different sample_ids, use
2235                         * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
2236                         * from a fixed position in each ring buffer record.
2237                         *
2238                         * As of the changeset introducing this comment, this
2239                         * isn't strictly needed, as the fields that can come before
2240                         * PERF_SAMPLE_ID are all used, but we'll probably disable
2241                         * some of those for things like copying the payload of
2242                         * pointer syscall arguments, and for vfs_getname we don't
2243                         * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
2244                         * here as a warning that we need to use PERF_SAMPLE_IDENTIFIER.
2245                         */
2246                         perf_evlist__set_sample_bit(evlist, IDENTIFIER);
2247                         perf_evlist__reset_sample_bit(evlist, ID);
2248                 }
2249         }
2250
2251         signal(SIGCHLD, sig_handler);
2252         signal(SIGINT, sig_handler);
2253
2254         if (forks) {
2255                 err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2256                                                     argv, false, NULL);
2257                 if (err < 0) {
2258                         fprintf(trace->output, "Couldn't run the workload!\n");
2259                         goto out_delete_evlist;
2260                 }
2261         }
2262
2263         err = perf_evlist__open(evlist);
2264         if (err < 0)
2265                 goto out_error_open;
2266
2267         err = bpf__apply_obj_config();
2268         if (err) {
2269                 char errbuf[BUFSIZ];
2270
2271                 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2272                 pr_err("ERROR: Apply config to BPF failed: %s\n",
2273                          errbuf);
2274                 goto out_error_open;
2275         }
2276
2277         /*
2278          * Better not use !target__has_task() here because we need to cover the
2279          * case where no threads were specified in the command line, but a
2280          * workload was, and in that case we will fill in the thread_map when
2281          * we fork the workload in perf_evlist__prepare_workload.
2282          */
2283         if (trace->filter_pids.nr > 0)
2284                 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2285         else if (thread_map__pid(evlist->threads, 0) == -1)
2286                 err = perf_evlist__set_filter_pid(evlist, getpid());
2287
2288         if (err < 0)
2289                 goto out_error_mem;
2290
2291         if (trace->ev_qualifier_ids.nr > 0) {
2292                 err = trace__set_ev_qualifier_filter(trace);
2293                 if (err < 0)
2294                         goto out_errno;
2295
2296                 pr_debug("event qualifier tracepoint filter: %s\n",
2297                          trace->syscalls.events.sys_exit->filter);
2298         }
2299
2300         err = perf_evlist__apply_filters(evlist, &evsel);
2301         if (err < 0)
2302                 goto out_error_apply_filters;
2303
2304         err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2305         if (err < 0)
2306                 goto out_error_mmap;
2307
2308         if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2309                 perf_evlist__enable(evlist);
2310
2311         if (forks)
2312                 perf_evlist__start_workload(evlist);
2313
2314         if (trace->opts.initial_delay) {
2315                 usleep(trace->opts.initial_delay * 1000);
2316                 perf_evlist__enable(evlist);
2317         }
2318
2319         trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2320                                   evlist->threads->nr > 1 ||
2321                                   perf_evlist__first(evlist)->attr.inherit;
2322 again:
2323         before = trace->nr_events;
2324
2325         for (i = 0; i < evlist->nr_mmaps; i++) {
2326                 union perf_event *event;
2327
2328                 while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2329                         struct perf_sample sample;
2330
2331                         ++trace->nr_events;
2332
2333                         err = perf_evlist__parse_sample(evlist, event, &sample);
2334                         if (err) {
2335                                 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2336                                 goto next_event;
2337                         }
2338
2339                         trace__handle_event(trace, event, &sample);
2340 next_event:
2341                         perf_evlist__mmap_consume(evlist, i);
2342
2343                         if (interrupted)
2344                                 goto out_disable;
2345
2346                         if (done && !draining) {
2347                                 perf_evlist__disable(evlist);
2348                                 draining = true;
2349                         }
2350                 }
2351         }
2352
2353         if (trace->nr_events == before) {
2354                 int timeout = done ? 100 : -1;
2355
2356                 if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2357                         if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2358                                 draining = true;
2359
2360                         goto again;
2361                 }
2362         } else {
2363                 goto again;
2364         }
2365
2366 out_disable:
2367         thread__zput(trace->current);
2368
2369         perf_evlist__disable(evlist);
2370
2371         if (!err) {
2372                 if (trace->summary)
2373                         trace__fprintf_thread_summary(trace, trace->output);
2374
2375                 if (trace->show_tool_stats) {
2376                         fprintf(trace->output, "Stats:\n "
2377                                                " vfs_getname : %" PRIu64 "\n"
2378                                                " proc_getname: %" PRIu64 "\n",
2379                                 trace->stats.vfs_getname,
2380                                 trace->stats.proc_getname);
2381                 }
2382         }
2383
2384 out_delete_evlist:
2385         perf_evlist__delete(evlist);
2386         trace->evlist = NULL;
2387         trace->live = false;
2388         return err;
2389 {
2390         char errbuf[BUFSIZ];
2391
2392 out_error_sched_stat_runtime:
2393         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2394         goto out_error;
2395
2396 out_error_raw_syscalls:
2397         tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2398         goto out_error;
2399
2400 out_error_mmap:
2401         perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2402         goto out_error;
2403
2404 out_error_open:
2405         perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2406
2407 out_error:
2408         fprintf(trace->output, "%s\n", errbuf);
2409         goto out_delete_evlist;
2410
2411 out_error_apply_filters:
2412         fprintf(trace->output,
2413                 "Failed to set filter \"%s\" on event %s with %d (%s)\n",
2414                 evsel->filter, perf_evsel__name(evsel), errno,
2415                 str_error_r(errno, errbuf, sizeof(errbuf)));
2416         goto out_delete_evlist;
2417 }
2418 out_error_mem:
2419         fprintf(trace->output, "Not enough memory to run!\n");
2420         goto out_delete_evlist;
2421
2422 out_errno:
2423         fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2424         goto out_delete_evlist;
2425 }
2426
2427 static int trace__replay(struct trace *trace)
2428 {
2429         const struct perf_evsel_str_handler handlers[] = {
2430                 { "probe:vfs_getname",       trace__vfs_getname, },
2431         };
2432         struct perf_data_file file = {
2433                 .path  = input_name,
2434                 .mode  = PERF_DATA_MODE_READ,
2435                 .force = trace->force,
2436         };
2437         struct perf_session *session;
2438         struct perf_evsel *evsel;
2439         int err = -1;
2440
2441         trace->tool.sample        = trace__process_sample;
2442         trace->tool.mmap          = perf_event__process_mmap;
2443         trace->tool.mmap2         = perf_event__process_mmap2;
2444         trace->tool.comm          = perf_event__process_comm;
2445         trace->tool.exit          = perf_event__process_exit;
2446         trace->tool.fork          = perf_event__process_fork;
2447         trace->tool.attr          = perf_event__process_attr;
2448         trace->tool.tracing_data  = perf_event__process_tracing_data;
2449         trace->tool.build_id      = perf_event__process_build_id;
2450         trace->tool.namespaces    = perf_event__process_namespaces;
2451
2452         trace->tool.ordered_events = true;
2453         trace->tool.ordering_requires_timestamps = true;
2454
2455         /* add tid to output */
2456         trace->multiple_threads = true;
2457
2458         session = perf_session__new(&file, false, &trace->tool);
2459         if (session == NULL)
2460                 return -1;
2461
2462         if (trace->opts.target.pid)
2463                 symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2464
2465         if (trace->opts.target.tid)
2466                 symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2467
2468         if (symbol__init(&session->header.env) < 0)
2469                 goto out;
2470
2471         trace->host = &session->machines.host;
2472
2473         err = perf_session__set_tracepoints_handlers(session, handlers);
2474         if (err)
2475                 goto out;
2476
2477         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2478                                                      "raw_syscalls:sys_enter");
2479         /* older kernels have syscalls tp versus raw_syscalls */
2480         if (evsel == NULL)
2481                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2482                                                              "syscalls:sys_enter");
2483
2484         if (evsel &&
2485             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2486             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2487                 pr_err("Error initializing raw_syscalls:sys_enter event\n");
2488                 goto out;
2489         }
2490
2491         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2492                                                      "raw_syscalls:sys_exit");
2493         if (evsel == NULL)
2494                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2495                                                              "syscalls:sys_exit");
2496         if (evsel &&
2497             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2498             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2499                 pr_err("Error initializing raw_syscalls:sys_exit event\n");
2500                 goto out;
2501         }
2502
2503         evlist__for_each_entry(session->evlist, evsel) {
2504                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2505                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2506                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2507                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2508                         evsel->handler = trace__pgfault;
2509         }
2510
2511         setup_pager();
2512
2513         err = perf_session__process_events(session);
2514         if (err)
2515                 pr_err("Failed to process events, error %d\n", err);
2516
2517         else if (trace->summary)
2518                 trace__fprintf_thread_summary(trace, trace->output);
2519
2520 out:
2521         perf_session__delete(session);
2522
2523         return err;
2524 }
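
/*
 * Typical replay flow, assuming a previously recorded perf.data in the
 * current directory:
 *
 *	perf trace record -- <workload>
 *	perf trace -i perf.data --summary
 *
 * the second invocation takes this trace__replay() path instead of
 * trace__run().
 */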
2525
2526 static size_t trace__fprintf_threads_header(FILE *fp)
2527 {
2528         size_t printed;
2529
2530         printed  = fprintf(fp, "\n Summary of events:\n\n");
2531
2532         return printed;
2533 }
2534
2535 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2536         struct stats    *stats;
2537         double          msecs;
2538         int             syscall;
2539 )
2540 {
2541         struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2542         struct stats *stats = source->priv;
2543
2544         entry->syscall = source->i;
2545         entry->stats   = stats;
2546         entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2547 }
2548
2549 static size_t thread__dump_stats(struct thread_trace *ttrace,
2550                                  struct trace *trace, FILE *fp)
2551 {
2552         size_t printed = 0;
2553         struct syscall *sc;
2554         struct rb_node *nd;
2555         DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2556
2557         if (syscall_stats == NULL)
2558                 return 0;
2559
2560         printed += fprintf(fp, "\n");
2561
2562         printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2563         printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2564         printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2565
2566         resort_rb__for_each_entry(nd, syscall_stats) {
2567                 struct stats *stats = syscall_stats_entry->stats;
2568                 if (stats) {
2569                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2570                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2571                         double avg = avg_stats(stats);
2572                         double pct;
2573                         u64 n = (u64) stats->n;
2574
2575                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2576                         avg /= NSEC_PER_MSEC;
2577
2578                         sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2579                         printed += fprintf(fp, "   %-15s", sc->name);
2580                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2581                                            n, syscall_stats_entry->msecs, min, avg);
2582                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2583                 }
2584         }
2585
2586         resort_rb__delete(syscall_stats);
2587         printed += fprintf(fp, "\n\n");
2588
2589         return printed;
2590 }
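
/*
 * Which renders, per thread, something like (numbers invented):
 *
 *	   syscall            calls    total       min       avg       max      stddev
 *	                               (msec)    (msec)    (msec)    (msec)        (%)
 *	   --------------- -------- --------- --------- --------- ---------     ------
 *	   read                  12     0.042     0.001     0.004     0.012     23.45%
 *	   openat                 3     0.015     0.004     0.005     0.007     11.02%
 */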
2591
2592 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2593 {
2594         size_t printed = 0;
2595         struct thread_trace *ttrace = thread__priv(thread);
2596         double ratio;
2597
2598         if (ttrace == NULL)
2599                 return 0;
2600
2601         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2602
2603         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2604         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2605         printed += fprintf(fp, "%.1f%%", ratio);
2606         if (ttrace->pfmaj)
2607                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2608         if (ttrace->pfmin)
2609                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2610         if (trace->sched)
2611                 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2612         else if (fputc('\n', fp) != EOF)
2613                 ++printed;
2614
2615         printed += thread__dump_stats(ttrace, trace, fp);
2616
2617         return printed;
2618 }
2619
2620 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2621 {
2622         return ttrace ? ttrace->nr_events : 0;
2623 }
2624
2625 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2626         struct thread *thread;
2627 )
2628 {
2629         entry->thread = rb_entry(nd, struct thread, rb_node);
2630 }
2631
2632 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2633 {
2634         DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2635         size_t printed = trace__fprintf_threads_header(fp);
2636         struct rb_node *nd;
2637
2638         if (threads == NULL) {
2639                 fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2640                 return 0;
2641         }
2642
2643         resort_rb__for_each_entry(nd, threads)
2644                 printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2645
2646         resort_rb__delete(threads);
2647
2648         return printed;
2649 }
2650
2651 static int trace__set_duration(const struct option *opt, const char *str,
2652                                int unset __maybe_unused)
2653 {
2654         struct trace *trace = opt->value;
2655
2656         trace->duration_filter = atof(str);
2657         return 0;
2658 }
2659
2660 static int trace__set_filter_pids(const struct option *opt, const char *str,
2661                                   int unset __maybe_unused)
2662 {
2663         int ret = -1;
2664         size_t i;
2665         struct trace *trace = opt->value;
2666         /*
2667          * FIXME: introduce a intarray class, plain parse csv and create a
2668          * { int nr, int entries[] } struct...
2669          */
2670         struct intlist *list = intlist__new(str);
2671
2672         if (list == NULL)
2673                 return -1;
2674
2675         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2676         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2677
2678         if (trace->filter_pids.entries == NULL)
2679                 goto out;
2680
2681         trace->filter_pids.entries[0] = getpid();
2682
2683         for (i = 1; i < trace->filter_pids.nr; ++i)
2684                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2685
2686         intlist__delete(list);
2687         ret = 0;
2688 out:
2689         return ret;
2690 }
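
/*
 * E.g. a "1234,5678" csv string yields entries = { getpid(), 1234, 5678 },
 * i.e. the tool's own pid is always filtered out along with the requested
 * ones.
 */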
2691
2692 static int trace__open_output(struct trace *trace, const char *filename)
2693 {
2694         struct stat st;
2695
2696         if (!stat(filename, &st) && st.st_size) {
2697                 char oldname[PATH_MAX];
2698
2699                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2700                 unlink(oldname);
2701                 rename(filename, oldname);
2702         }
2703
2704         trace->output = fopen(filename, "w");
2705
2706         return trace->output == NULL ? -errno : 0;
2707 }
2708
2709 static int parse_pagefaults(const struct option *opt, const char *str,
2710                             int unset __maybe_unused)
2711 {
2712         int *trace_pgfaults = opt->value;
2713
2714         if (strcmp(str, "all") == 0)
2715                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2716         else if (strcmp(str, "maj") == 0)
2717                 *trace_pgfaults |= TRACE_PFMAJ;
2718         else if (strcmp(str, "min") == 0)
2719                 *trace_pgfaults |= TRACE_PFMIN;
2720         else
2721                 return -1;
2722
2723         return 0;
2724 }
2725
2726 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2727 {
2728         struct perf_evsel *evsel;
2729
2730         evlist__for_each_entry(evlist, evsel)
2731                 evsel->handler = handler;
2732 }
2733
2734 /*
2735  * XXX: Hackish, just splitting the combined -e/--event list into syscalls
2736  * (raw_syscalls:sys_{enter,exit}) + events (tracepoints, HW, SW, etc.) to use
2737  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2738  *
2739  * It'd be better to introduce a parse_options() variant that would return a
2740  * list with the terms it didn't match to an event...
2741  */
2742 static int trace__parse_events_option(const struct option *opt, const char *str,
2743                                       int unset __maybe_unused)
2744 {
2745         struct trace *trace = (struct trace *)opt->value;
2746         const char *s = str;
2747         char *sep = NULL, *lists[2] = { NULL, NULL, };
2748         int len = strlen(str), err = -1, list;
2749         char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2750         char group_name[PATH_MAX];
2751
2752         if (strace_groups_dir == NULL)
2753                 return -1;
2754
2755         if (*s == '!') {
2756                 ++s;
2757                 trace->not_ev_qualifier = true;
2758         }
2759
2760         while (1) {
2761                 if ((sep = strchr(s, ',')) != NULL)
2762                         *sep = '\0';
2763
2764                 list = 0;
2765                 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2766                         list = 1;
2767                 } else {
2768                         path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2769                         if (access(group_name, R_OK) == 0)
2770                                 list = 1;
2771                 }
2772
2773                 if (lists[list]) {
2774                         sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2775                 } else {
2776                         lists[list] = malloc(len + 1); /* len excludes the NUL terminator */
2777                         if (lists[list] == NULL)
2778                                 goto out;
2779                         strcpy(lists[list], s);
2780                 }
2781
2782                 if (!sep)
2783                         break;
2784
2785                 *sep = ',';
2786                 s = sep + 1;
2787         }
2788
2789         if (lists[1] != NULL) {
2790                 struct strlist_config slist_config = {
2791                         .dirname = strace_groups_dir,
2792                 };
2793
2794                 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2795                 if (trace->ev_qualifier == NULL) {
2796                         fputs("Not enough memory to parse event qualifier\n", trace->output);
2797                         goto out;
2798                 }
2799
2800                 if (trace__validate_ev_qualifier(trace))
2801                         goto out;
2802         }
2803
2804         err = 0;
2805
2806         if (lists[0]) {
2807                 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2808                                                "event selector. use 'perf list' to list available events",
2809                                                parse_events_option);
2810                 err = parse_events_option(&o, lists[0], 0);
2811         }
2812 out:
2813         if (sep)
2814                 *sep = ',';
2815
2816         return err;
2817 }
2818
2819 int cmd_trace(int argc, const char **argv)
2820 {
2821         const char *trace_usage[] = {
2822                 "perf trace [<options>] [<command>]",
2823                 "perf trace [<options>] -- <command> [<options>]",
2824                 "perf trace record [<options>] [<command>]",
2825                 "perf trace record [<options>] -- <command> [<options>]",
2826                 NULL
2827         };
2828         struct trace trace = {
2829                 .syscalls = {
2830                         .max = -1,
2831                 },
2832                 .opts = {
2833                         .target = {
2834                                 .uid       = UINT_MAX,
2835                                 .uses_mmap = true,
2836                         },
2837                         .user_freq     = UINT_MAX,
2838                         .user_interval = ULLONG_MAX,
2839                         .no_buffering  = true,
2840                         .mmap_pages    = UINT_MAX,
2841                         .proc_map_timeout  = 500,
2842                 },
2843                 .output = stderr,
2844                 .show_comm = true,
2845                 .trace_syscalls = true,
2846                 .kernel_syscallchains = false,
2847                 .max_stack = UINT_MAX,
2848         };
2849         const char *output_name = NULL;
2850         const struct option trace_options[] = {
2851         OPT_CALLBACK('e', "event", &trace, "event",
2852                      "event/syscall selector. use 'perf list' to list available events",
2853                      trace__parse_events_option),
2854         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2855                     "show the thread COMM next to its id"),
2856         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2857         OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2858                      trace__parse_events_option),
2859         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2860         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2861         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2862                     "trace events on existing process id"),
2863         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2864                     "trace events on existing thread id"),
2865         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2866                      "pids to filter (by the kernel)", trace__set_filter_pids),
2867         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2868                     "system-wide collection from all CPUs"),
2869         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2870                     "list of cpus to monitor"),
2871         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2872                     "child tasks do not inherit counters"),
2873         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2874                      "number of mmap data pages",
2875                      perf_evlist__parse_mmap_pages),
2876         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2877                    "user to profile"),
2878         OPT_CALLBACK(0, "duration", &trace, "float",
2879                      "show only events with duration > N.M ms",
2880                      trace__set_duration),
2881         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2882         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2883         OPT_BOOLEAN('T', "time", &trace.full_time,
2884                     "Show full timestamp, not time relative to first start"),
2885         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2886                     "Show only syscall summary with statistics"),
2887         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2888                     "Show all syscalls and summary with statistics"),
2889         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2890                      "Trace pagefaults", parse_pagefaults, "maj"),
2891         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2892         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2893         OPT_CALLBACK(0, "call-graph", &trace.opts,
2894                      "record_mode[,record_size]", record_callchain_help,
2895                      &record_parse_callchain_opt),
2896         OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2897                     "Show the kernel callchains on the syscall exit path"),
2898         OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2899                      "Set the minimum stack depth when parsing the callchain, "
2900                      "anything below the specified depth will be ignored."),
2901         OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2902                      "Set the maximum stack depth when parsing the callchain, "
2903                      "anything beyond the specified depth will be ignored. "
2904                      "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2905         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2906                         "per thread proc mmap processing timeout in ms"),
2907         OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2908                      "ms to wait before starting measurement after program "
2909                      "start"),
2910         OPT_END()
2911         };
2912         bool __maybe_unused max_stack_user_set = true;
2913         bool mmap_pages_user_set = true;
2914         const char * const trace_subcommands[] = { "record", NULL };
2915         int err;
2916         char bf[BUFSIZ];
2917
2918         signal(SIGSEGV, sighandler_dump_stack);
2919         signal(SIGFPE, sighandler_dump_stack);
2920
2921         trace.evlist = perf_evlist__new();
2922         trace.sctbl = syscalltbl__new();
2923
2924         if (trace.evlist == NULL || trace.sctbl == NULL) {
2925                 pr_err("Not enough memory to run!\n");
2926                 err = -ENOMEM;
2927                 goto out;
2928         }
2929
2930         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2931                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2932
2933         err = bpf__setup_stdout(trace.evlist);
2934         if (err) {
2935                 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2936                 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2937                 goto out;
2938         }
2939
2940         err = -1;
2941
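        /* Page fault samples need the faulting address and a timestamp. */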
2942         if (trace.trace_pgfaults) {
2943                 trace.opts.sample_address = true;
2944                 trace.opts.sample_time = true;
2945         }
2946
2947         if (trace.opts.mmap_pages == UINT_MAX)
2948                 mmap_pages_user_set = false;
2949
2950         if (trace.max_stack == UINT_MAX) {
2951                 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2952                 max_stack_user_set = false;
2953         }
2954
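        /*
         * --min-stack/--max-stack imply callchains: if the user asked for a
         * stack depth but not for a callchain mode, default to DWARF unwinding
         * for the syscall events.
         */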
2955 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2956         if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2957                 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2958 #endif
2959
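        /*
         * Callchain samples are large; if running as root and --mmap-pages was
         * not given, grow the ring buffer to 4x the mlock limit.
         */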
2960         if (callchain_param.enabled) {
2961                 if (!mmap_pages_user_set && geteuid() == 0)
2962                         trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2963
2964                 symbol_conf.use_callchain = true;
2965         }
2966
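        /* Non-syscall events added via -e/--event get the generic event handler. */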
2967         if (trace.evlist->nr_entries > 0)
2968                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2969
2970         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2971                 return trace__record(&trace, argc-1, &argv[1]);
2972
2973         /* summary_only implies summary option, but don't overwrite summary if set */
2974         if (trace.summary_only)
2975                 trace.summary = trace.summary_only;
2976
2977         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2978             trace.evlist->nr_entries == 0 /* Was -e/--event used? */) {
2979                 pr_err("Please specify something to trace.\n");
2980                 return -1;
2981         }
2982
2983         if (!trace.trace_syscalls && trace.ev_qualifier) {
2984                 pr_err("The -e option can't be used with --no-syscalls.\n");
2985                 goto out;
2986         }
2987
2988         if (output_name != NULL) {
2989                 err = trace__open_output(&trace, output_name);
2990                 if (err < 0) {
2991                         perror("failed to create output file");
2992                         goto out;
2993                 }
2994         }
2995
2996         trace.open_id = syscalltbl__id(trace.sctbl, "open");
2997
2998         err = target__validate(&trace.opts.target);
2999         if (err) {
3000                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3001                 fprintf(trace.output, "%s", bf);
3002                 goto out_close;
3003         }
3004
3005         err = target__parse_uid(&trace.opts.target);
3006         if (err) {
3007                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3008                 fprintf(trace.output, "%s", bf);
3009                 goto out_close;
3010         }
3011
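        /* No workload and no target given: default to system-wide tracing. */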
3012         if (!argc && target__none(&trace.opts.target))
3013                 trace.opts.target.system_wide = true;
3014
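        /* With an input file, replay a recorded session; otherwise trace live. */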
3015         if (input_name)
3016                 err = trace__replay(&trace);
3017         else
3018                 err = trace__run(&trace, argc, argv);
3019
3020 out_close:
3021         if (output_name != NULL)
3022                 fclose(trace.output);
3023 out:
3024         return err;
3025 }