4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include "util/parse-options.h"
15 #include "util/parse-events.h"
17 #include "util/header.h"
18 #include "util/event.h"
19 #include "util/evlist.h"
20 #include "util/evsel.h"
21 #include "util/debug.h"
22 #include "util/session.h"
23 #include "util/tool.h"
24 #include "util/symbol.h"
25 #include "util/cpumap.h"
26 #include "util/thread_map.h"
27 #include "util/data.h"
/*
 * Members of the record state object (the opening line of the enclosing
 * declaration is not visible in this view). Elsewhere in this file they are
 * reached through a struct record pointer: tool is the member used with
 * container_of() in process_synthesized_event(), opts/file/evlist/session are
 * read by record__open() and __cmd_record(), and no_buildid_cache is tested in
 * cmd_record().
 */
35 struct perf_tool tool;
36 struct record_opts opts;
38 struct perf_data_file file;
39 struct perf_evlist *evlist;
40 struct perf_session *session;
44 bool no_buildid_cache;
/*
 * Write size bytes starting at bf to the data file of the session owned by
 * rec, using perf_data_file__write(). A negative result from that call is
 * logged with pr_err(). The byte count is accumulated in rec->bytes_written.
 * NOTE(review): the return statements are not visible in this view; callers
 * in this file treat a negative return value as failure.
 */
48 static int record__write(struct record *rec, void *bf, size_t size)
50 if (perf_data_file__write(rec->session->file, bf, size) < 0) {
51 pr_err("failed to write perf data, error: %m\n");
55 rec->bytes_written += size;
/*
 * Callback handed to the perf_event__synthesize_*() helpers in this file.
 * Recovers the enclosing struct record from the tool pointer with
 * container_of() and writes the event, event->header.size bytes long, through
 * record__write(). The sample and machine arguments are unused.
 * Returns whatever record__write() returns.
 */
59 static int process_synthesized_event(struct perf_tool *tool,
60 union perf_event *event,
61 struct perf_sample *sample __maybe_unused,
62 struct machine *machine __maybe_unused)
64 struct record *rec = container_of(tool, struct record, tool);
65 return record__write(rec, event, event->header.size);
/*
 * Copy the pending contents of the mmap ring buffer with index idx in
 * rec->evlist to the output via record__write().
 *
 * head comes from perf_mmap__read_head() and old from md->prev; the data area
 * starts page_size bytes past md->base, and positions are reduced with
 * md->mask before indexing.
 * NOTE(review): the declarations of buf and size and the error/exit paths are
 * not visible in this view.
 */
68 static int record__mmap_read(struct record *rec, int idx)
70 struct perf_mmap *md = &rec->evlist->mmap[idx];
71 unsigned int head = perf_mmap__read_head(md);
72 unsigned int old = md->prev;
73 unsigned char *data = md->base + page_size;
/*
 * When the masked start plus size does not land on the masked head, the
 * pending region appears to wrap around the end of the buffer: the piece from
 * old up to the end of the buffer (md->mask + 1) is written first.
 */
85 if ((old & md->mask) + size != (head & md->mask)) {
86 buf = &data[old & md->mask];
87 size = md->mask + 1 - (old & md->mask);
90 if (record__write(rec, buf, size) < 0) {
/* Write the remaining (or only) piece starting at the masked old offset. */
96 buf = &data[old & md->mask];
100 if (record__write(rec, buf, size) < 0) {
/* Tell the evlist that the data of this ring has been consumed. */
106 perf_evlist__mmap_consume(rec->evlist, idx);
/*
 * File-scope state tied to signal handling.
 * done: tested by the main loop of __cmd_record() to stop recording.
 * signr: passed to signal() in record__sig_exit() and assigned from
 *        WTERMSIG() in __cmd_record(); starts at -1.
 * child_finished: no reader or writer is visible in this view.
 */
111 static volatile int done = 0;
112 static volatile int signr = -1;
113 static volatile int child_finished = 0;
/*
 * Signal handler that __cmd_record() installs for SIGCHLD, SIGINT and
 * SIGTERM. The body is not visible in this view.
 */
115 static void sig_handler(int sig)
/*
 * Exit hook registered with atexit() in __cmd_record(). Restores the default
 * disposition (SIG_DFL) for the signal number stored in signr.
 * NOTE(review): any guard on signr and any statements after the signal() call
 * are not visible in this view.
 */
125 static void record__sig_exit(void)
130 signal(signr, SIG_DFL);
/*
 * Configure, open and mmap all events of rec->evlist and attach the evlist to
 * the session.
 *
 * Visible steps, in order:
 *  - perf_evlist__config() applies the record options to the evlist;
 *  - each evsel is opened with perf_evsel__open() on the cpus and threads of
 *    the evlist; on failure perf_evsel__fallback() is tried and its message
 *    is shown as a warning, otherwise perf_evsel__open_strerror() formats the
 *    error that is shown with ui__error();
 *  - perf_evlist__apply_filters() failures are reported with errno text;
 *  - perf_evlist__mmap() maps opts->mmap_pages pages; EPERM gets a dedicated
 *    hint, other errors print errno text;
 *  - session->evlist is set and perf_session__set_id_hdr_size() is called.
 * NOTE(review): the msg buffer declaration, retry logic and return values are
 * not visible in this view; __cmd_record() treats a non-zero return as
 * failure.
 */
134 static int record__open(struct record *rec)
137 struct perf_evsel *pos;
138 struct perf_evlist *evlist = rec->evlist;
139 struct perf_session *session = rec->session;
140 struct record_opts *opts = &rec->opts;
143 perf_evlist__config(evlist, opts);
145 evlist__for_each(evlist, pos) {
147 if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
148 if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
150 ui__warning("%s\n", msg);
155 perf_evsel__open_strerror(pos, &opts->target,
156 errno, msg, sizeof(msg));
157 ui__error("%s\n", msg);
162 if (perf_evlist__apply_filters(evlist)) {
163 error("failed to set filter with %d (%s)\n", errno,
164 strerror_r(errno, msg, sizeof(msg)));
169 if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
170 if (errno == EPERM) {
171 pr_err("Permission error mapping pages.\n"
172 "Consider increasing "
173 "/proc/sys/kernel/perf_event_mlock_kb,\n"
174 "or try again with a smaller value of -m/--mmap_pages.\n"
175 "(current value: %u)\n", opts->mmap_pages);
178 pr_err("failed to mmap with %d (%s)\n", errno,
179 strerror_r(errno, msg, sizeof(msg)));
185 session->evlist = evlist;
186 perf_session__set_id_hdr_size(session);
/*
 * Run the recorded events back through the session with
 * build_id__mark_dso_hit_ops, starting at session->header.data_offset.
 * The current offset of the output file descriptor, read with
 * lseek(fd, 0, SEEK_CUR), bounds the region that is processed.
 * Returns the result of __perf_session__process_events().
 * NOTE(review): lseek() reports failure with -1, which becomes a very large
 * value once stored in a u64; no check for that is visible in this view.
 */
191 static int process_buildids(struct record *rec)
193 struct perf_data_file *file = &rec->file;
194 struct perf_session *session = rec->session;
195 u64 start = session->header.data_offset;
197 u64 size = lseek(file->fd, 0, SEEK_CUR);
201 return __perf_session__process_events(session, start,
203 size, &build_id__mark_dso_hit_ops);
/*
 * Per-guest callback passed to machines__process_guests() by __cmd_record().
 * data is the struct perf_tool pointer supplied there. For the given guest
 * machine it synthesizes the module mmap events first and the kernel mmap
 * event second, both delivered through process_synthesized_event(). Failures
 * are only logged, tagged with machine->pid; the function returns void.
 * NOTE(review): the message printed after perf_event__synthesize_modules() is
 * the same reference relocation symbol text used for the kernel mmap step.
 */
206 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
209 struct perf_tool *tool = data;
211 *As for guest kernel when processing subcommand record&report,
212 *we arrange module mmap prior to guest kernel mmap and trigger
213 *a preload dso because default guest module symbols are loaded
214 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
215 *method is used to avoid symbol missing when the first addr is
216 *in module instead of in guest kernel.
218 err = perf_event__synthesize_modules(tool, process_synthesized_event,
221 pr_err("Couldn't record guest kernel [%d]'s reference"
222 " relocation symbol.\n", machine->pid);
225 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
226 * have no _text sometimes.
228 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
231 pr_err("Couldn't record guest kernel [%d]'s reference"
232 " relocation symbol.\n", machine->pid);
/*
 * Header-only event of type PERF_RECORD_FINISHED_ROUND; its size is just the
 * header size. record__mmap_read_all() appends it after a pass over the ring
 * buffers that wrote at least some data.
 */
235 static struct perf_event_header finished_round_event = {
236 .size = sizeof(struct perf_event_header),
237 .type = PERF_RECORD_FINISHED_ROUND,
/*
 * Drain every ring buffer of rec->evlist: each of the nr_mmaps entries that
 * has a non-NULL base is passed to record__mmap_read(), and a non-zero result
 * from that call is treated as an error. rec->bytes_written is sampled before
 * the pass; if it changed, finished_round_event is written to mark the round.
 * NOTE(review): declarations of i and rc and the return statement are not
 * visible in this view; __cmd_record() treats a negative return as failure.
 */
240 static int record__mmap_read_all(struct record *rec)
242 u64 bytes_written = rec->bytes_written;
246 for (i = 0; i < rec->evlist->nr_mmaps; i++) {
247 if (rec->evlist->mmap[i].base) {
248 if (record__mmap_read(rec, i) != 0) {
256 * Mark the round finished in case we wrote
257 * at least one event.
259 if (bytes_written != rec->bytes_written)
260 rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
/*
 * Choose which header features the session advertises. Every feature from
 * HEADER_FIRST_FEATURE up to, but excluding, HEADER_LAST_FEATURE is set, then
 * the ones that do not apply are cleared:
 *  - HEADER_BUILD_ID (the guarding condition is not visible in this view);
 *  - HEADER_TRACING_DATA when the evlist has no tracepoint events;
 *  - HEADER_BRANCH_STACK when rec->opts.branch_stack is zero.
 */
266 static void record__init_features(struct record *rec)
268 struct perf_session *session = rec->session;
271 for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
272 perf_header__set_feat(&session->header, feat);
275 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
277 if (!have_tracepoints(&rec->evlist->entries))
278 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
280 if (!rec->opts.branch_stack)
281 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
/*
 * workload_exec_errno receives the error code carried by the signal that
 * reports a failed workload start; the handler below copies it from
 * info->si_value.sival_int. __cmd_record() passes this handler to
 * perf_evlist__prepare_workload() and checks workload_exec_errno after its
 * main loop. The signo and ucontext arguments are unused.
 * NOTE(review): the parameter line declaring info is not visible in this view.
 */
284 static volatile int workload_exec_errno;
287 * perf_evlist__prepare_workload will send a SIGUSR1
288 * if the fork fails, since we asked by setting its
289 * want_signal to true.
291 static void workload_exec_failed_signal(int signo __maybe_unused,
293 void *ucontext __maybe_unused)
295 workload_exec_errno = info->si_value.sival_int;
/*
 * Core of perf record: set up the session, open the events, synthesize the
 * initial metadata events, run the capture loop and finalize the output.
 *
 * rec  - record state (options, evlist, output file, session).
 * argc - number of workload arguments; a positive value means a workload is
 *        forked (see forks below).
 * argv - workload command line; argv[0] is stored as rec->progname.
 *
 * Visible phases, in order:
 *  - register record__sig_exit() with atexit() and install sig_handler() for
 *    SIGCHLD, SIGINT and SIGTERM;
 *  - create the session, store it in rec->session and pick header features;
 *  - prepare the workload and open the events with record__open();
 *  - write the header (pipe or regular file variant);
 *  - synthesize attrs, tracing data, kernel mmap, modules, guest OS data and
 *    existing threads through process_synthesized_event();
 *  - optionally switch to SCHED_FIFO with rec->realtime_prio;
 *  - enable events, start the workload, honour opts->initial_delay;
 *  - loop: drain the ring buffers, poll when nothing new arrived, stop once
 *    done or draining is set;
 *  - report a failed workload exec, print statistics, collect the workload
 *    exit status, then finalize buildids and the header for non-pipe output;
 *  - delete the session.
 * NOTE(review): many lines of this function (declarations of err and status,
 * several conditions, labels and the return) are not visible in this view.
 */
300 static int __cmd_record(struct record *rec, int argc, const char **argv)
304 unsigned long waking = 0;
305 const bool forks = argc > 0;
306 struct machine *machine;
307 struct perf_tool *tool = &rec->tool;
308 struct record_opts *opts = &rec->opts;
309 struct perf_data_file *file = &rec->file;
310 struct perf_session *session;
311 bool disabled = false, draining = false;
313 rec->progname = argv[0];
315 atexit(record__sig_exit);
316 signal(SIGCHLD, sig_handler);
317 signal(SIGINT, sig_handler);
318 signal(SIGTERM, sig_handler);
320 session = perf_session__new(file, false, NULL);
321 if (session == NULL) {
322 pr_err("Perf session creation failed.\n");
326 rec->session = session;
328 record__init_features(rec);
331 err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
333 workload_exec_failed_signal);
335 pr_err("Couldn't run the workload!\n");
337 goto out_delete_session;
341 if (record__open(rec) != 0) {
346 if (!rec->evlist->nr_groups)
347 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
350 err = perf_header__write_pipe(file->fd);
354 err = perf_session__write_header(session, rec->evlist,
361 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
362 pr_err("Couldn't generate buildids. "
363 "Use --no-buildid to profile anyway.\n");
368 machine = &session->machines.host;
371 err = perf_event__synthesize_attrs(tool, session,
372 process_synthesized_event);
374 pr_err("Couldn't synthesize attrs.\n");
378 if (have_tracepoints(&rec->evlist->entries)) {
380 * FIXME err <= 0 here actually means that
381 * there were no tracepoints so its not really
382 * an error, just that we don't need to
383 * synthesize anything. We really have to
384 * return this more properly and also
385 * propagate errors that now are calling die()
387 err = perf_event__synthesize_tracing_data(tool, file->fd, rec->evlist,
388 process_synthesized_event);
390 pr_err("Couldn't record tracing data.\n");
393 rec->bytes_written += err;
397 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
400 pr_err("Couldn't record kernel reference relocation symbol\n"
401 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
402 "Check /proc/kallsyms permission or run as root.\n");
404 err = perf_event__synthesize_modules(tool, process_synthesized_event,
407 pr_err("Couldn't record kernel module information.\n"
408 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
409 "Check /proc/modules permission or run as root.\n");
412 machines__process_guests(&session->machines,
413 perf_event__synthesize_guest_os, tool);
416 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
417 process_synthesized_event, opts->sample_address);
421 if (rec->realtime_prio) {
422 struct sched_param param;
424 param.sched_priority = rec->realtime_prio;
/* sched_setscheduler() takes a pointer to the parameter block. */
425 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
426 pr_err("Could not set realtime priority.\n");
433 * When perf is starting the traced process, all the events
434 * (apart from group members) have enable_on_exec=1 set,
435 * so don't spoil it by prematurely enabling them.
437 if (!target__none(&opts->target) && !opts->initial_delay)
438 perf_evlist__enable(rec->evlist);
444 perf_evlist__start_workload(rec->evlist);
/* initial_delay is multiplied by 1000 to obtain the usleep() argument. */
446 if (opts->initial_delay) {
447 usleep(opts->initial_delay * 1000);
448 perf_evlist__enable(rec->evlist);
452 int hits = rec->samples;
454 if (record__mmap_read_all(rec) < 0) {
/* No new samples since the last pass: either stop or wait in poll. */
459 if (hits == rec->samples) {
460 if (done || draining)
462 err = perf_evlist__poll(rec->evlist, -1);
464 * Propagate error, only if there's any. Ignore positive
465 * number of returned events and interrupt error.
467 if (err > 0 || (err < 0 && errno == EINTR))
471 if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
476 * When perf is starting the traced process, at the end events
477 * die with the process and we wait for that. Thus no need to
478 * disable events in this case.
480 if (done && !disabled && !target__none(&opts->target)) {
481 perf_evlist__disable(rec->evlist);
486 if (forks && workload_exec_errno) {
487 char msg[STRERR_BUFSIZE];
488 const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
489 pr_err("Workload failed: %s\n", emsg);
/* waking is unsigned long, so it is printed with %lu. */
495 fprintf(stderr, "[ perf record: Woken up %lu times to write data ]\n", waking);
498 * Approximate RIP event size: 24 bytes.
501 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
502 (double)rec->bytes_written / 1024.0 / 1024.0,
504 rec->bytes_written / 24);
512 kill(rec->evlist->workload.pid, SIGTERM);
518 else if (WIFEXITED(exit_status))
519 status = WEXITSTATUS(exit_status);
520 else if (WIFSIGNALED(exit_status))
521 signr = WTERMSIG(exit_status);
/* Only a regular (non-pipe) output file gets its header finalized. */
525 if (!err && !file->is_pipe) {
526 rec->session->header.data_size += rec->bytes_written;
528 if (!rec->no_buildid)
529 process_buildids(rec);
530 perf_session__write_header(rec->session, rec->evlist,
535 perf_session__delete(session);
/*
 * Helpers for the branch_modes table: BRANCH_OPT(n, m) expands to an
 * initializer with name n and mode m; BRANCH_END is the entry whose name is
 * NULL.
 */
539 #define BRANCH_OPT(n, m) \
540 { .name = n, .mode = (m) }
542 #define BRANCH_END { .name = NULL }
/*
 * Branch filter names and the PERF_SAMPLE_BRANCH_* bit each one selects.
 * parse_branch_stack() walks this table until it reaches an entry with a NULL
 * name and compares names with strcasecmp().
 */
549 static const struct branch_mode branch_modes[] = {
550 BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
551 BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
552 BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
553 BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
554 BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
555 BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
556 BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
557 BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
558 BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
559 BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
560 BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
/*
 * Option callback for the branch sampling options (-b and -j in
 * record_options). opt->value points at the uint64_t mode mask to fill in,
 * str is the option argument and may be NULL when -b is given without one.
 *
 * The argument is duplicated with strdup() because str is read-only, and each
 * name is looked up in branch_modes with strcasecmp(); an unknown name
 * produces a warning. If no bit outside the user/kernel/hv group is set in
 * the end, the mode is assigned PERF_SAMPLE_BRANCH_ANY.
 * NOTE(review): the three flags or-ed together below are presumably the body
 * of the ONLY_PLM macro; its define line, the tokenizing of str, the freeing
 * of os and the return values are not visible in this view.
 */
565 parse_branch_stack(const struct option *opt, const char *str, int unset)
568 (PERF_SAMPLE_BRANCH_USER |\
569 PERF_SAMPLE_BRANCH_KERNEL |\
570 PERF_SAMPLE_BRANCH_HV)
572 uint64_t *mode = (uint64_t *)opt->value;
573 const struct branch_mode *br;
574 char *s, *os = NULL, *p;
581 * cannot set it twice, -b + --branch-filter for instance
586 /* str may be NULL in case no arg is passed to -b */
588 /* because str is read-only */
589 s = os = strdup(str);
598 for (br = branch_modes; br->name; br++) {
599 if (!strcasecmp(s, br->name))
603 ui__warning("unknown branch filter %s,"
604 " check man page\n", s);
618 /* default to any branch */
619 if ((*mode & ~ONLY_PLM) == 0) {
620 *mode = PERF_SAMPLE_BRANCH_ANY;
/*
 * Emit debug output describing the callchain configuration: the name of
 * callchain_param.record_mode (NONE, FP or DWARF) and, in DWARF mode, the
 * configured stack dump size.
 */
627 static void callchain_debug(void)
629 static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF" };
631 pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
633 if (callchain_param.record_mode == CALLCHAIN_DWARF)
634 pr_debug("callchain: stack dump size %d\n",
635 callchain_param.dump_size);
/*
 * Option callback for --call-graph (see record_options). Sets
 * callchain_param.enabled to the negation of unset. For --no-call-graph the
 * record mode becomes CALLCHAIN_NONE and a debug message is printed; otherwise
 * the argument is handed to parse_callchain_record_opt().
 * NOTE(review): the remaining parameters, the unset test and the return
 * statements are not visible in this view.
 */
638 int record_parse_callchain_opt(const struct option *opt __maybe_unused,
644 callchain_param.enabled = !unset;
646 /* --no-call-graph */
648 callchain_param.record_mode = CALLCHAIN_NONE;
649 pr_debug("callchain: disabled\n");
653 ret = parse_callchain_record_opt(arg);
/*
 * Option callback for -g (see record_options); all three parameters are
 * unused. Turns callchain recording on and, if no record mode has been chosen
 * yet (CALLCHAIN_NONE), selects CALLCHAIN_FP.
 * NOTE(review): the return statement is not visible in this view.
 */
660 int record_callchain_opt(const struct option *opt __maybe_unused,
661 const char *arg __maybe_unused,
662 int unset __maybe_unused)
664 callchain_param.enabled = true;
666 if (callchain_param.record_mode == CALLCHAIN_NONE)
667 callchain_param.record_mode = CALLCHAIN_FP;
/*
 * Config callback passed to perf_config() by cmd_record(). The key
 * record.call-graph is treated as an alias of call-graph.record-mode; every
 * key is then forwarded to perf_default_config(), whose result is returned.
 */
673 static int perf_record_config(const char *var, const char *value, void *cb)
675 if (!strcmp(var, "record.call-graph"))
676 var = "call-graph.record-mode"; /* fall-through */
678 return perf_default_config(var, value, cb);
/* Usage lines handed to parse_options() and usage_with_options(). */
681 static const char * const record_usage[] = {
682 "perf record [<options>] [<command>]",
683 "perf record [<options>] -- <command> [<options>]",
688 * XXX Ideally would be local to cmd_record() and passed to a record__new
689 * because we need to have access to it in record__exit, that is called
690 * after cmd_record() exits, but since record_options need to be accessible to
691 * builtin-script, leave it here.
693 * At least we don't touch it in all the other functions here directly.
695 * Just say no to tons of global variables, sigh.
/*
 * The single global record instance; record_options stores parsed values
 * directly into its members and cmd_record() operates on it.
 * Visible defaults: mmap_pages and user_freq start at UINT_MAX, user_interval
 * at ULLONG_MAX, and default_per_cpu is true.
 * NOTE(review): the MAX values presumably mean not set by the user, and the
 * nesting of these initializers is not visible in this view; confirm against
 * the full declaration.
 */
697 static struct record record = {
700 .mmap_pages = UINT_MAX,
701 .user_freq = UINT_MAX,
702 .user_interval = ULLONG_MAX,
706 .default_per_cpu = true,
/*
 * Help text for --call-graph. CALLCHAIN_HELP is the common prefix; the list of
 * modes appended to it depends on HAVE_DWARF_UNWIND_SUPPORT: fp dwarf when it
 * is defined, fp otherwise.
 * NOTE(review): the else and endif lines of the conditional are not visible
 * in this view.
 */
711 #define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "
713 #ifdef HAVE_DWARF_UNWIND_SUPPORT
714 const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf";
716 const char record_callchain_help[] = CALLCHAIN_HELP "fp";
720 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
721 * with it and switch to use the library functions in perf_evlist that came
722 * from builtin-record.c, i.e. use record_opts,
723 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
/*
 * Command line option table for perf record, consumed by parse_options() in
 * cmd_record(). Every entry stores its value into the global record object
 * (or into the verbose and quiet globals); callback entries delegate parsing
 * to the named function, for example parse_events_option for -e and
 * record_callchain_opt for -g.
 * NOTE(review): several entries are only partly visible in this view (help
 * strings or callback names on missing lines), as is the table terminator.
 */
726 const struct option record_options[] = {
727 OPT_CALLBACK('e', "event", &record.evlist, "event",
728 "event selector. use 'perf list' to list available events",
729 parse_events_option),
730 OPT_CALLBACK(0, "filter", &record.evlist, "filter",
731 "event filter", parse_filter),
732 OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
733 "record events on existing process id"),
734 OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
735 "record events on existing thread id"),
736 OPT_INTEGER('r', "realtime", &record.realtime_prio,
737 "collect data with this RT SCHED_FIFO priority"),
738 OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
739 "collect data without buffering"),
740 OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
741 "collect raw sample records from all opened counters"),
742 OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
743 "system-wide collection from all CPUs"),
744 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
745 "list of cpus to monitor"),
746 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
747 OPT_STRING('o', "output", &record.file.path, "file",
749 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
750 &record.opts.no_inherit_set,
751 "child tasks do not inherit counters"),
752 OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
753 OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
754 "number of mmap data pages",
755 perf_evlist__parse_mmap_pages),
756 OPT_BOOLEAN(0, "group", &record.opts.group,
757 "put the counters into a counter group"),
758 OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
759 NULL, "enables call-graph recording" ,
760 &record_callchain_opt),
761 OPT_CALLBACK(0, "call-graph", &record.opts,
762 "mode[,dump_size]", record_callchain_help,
763 &record_parse_callchain_opt),
764 OPT_INCR('v', "verbose", &verbose,
765 "be more verbose (show counter open errors, etc)"),
766 OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
767 OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
768 "per thread counts"),
769 OPT_BOOLEAN('d', "data", &record.opts.sample_address,
771 OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
772 OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
773 OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
775 OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
776 "do not update the buildid cache"),
777 OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
778 "do not collect buildids in perf.data"),
779 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
780 "monitor event in cgroup name only",
782 OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
783 "ms to wait before starting measurement after program start"),
784 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
787 OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
788 "branch any", "sample any taken branches",
791 OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
792 "branch filter mask", "branch stack filter modes",
794 OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
795 "sample by weight (on special events only)"),
796 OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
797 "sample transaction flags (special events only)"),
798 OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
799 "use per-thread mmaps"),
/*
 * Entry point of the record builtin. prefix is unused.
 *
 * Visible steps, in order:
 *  - allocate the event list and load configuration via perf_record_config();
 *  - parse the command line, stopping at the first non-option argument, and
 *    show usage when neither a workload nor a target was given;
 *  - reject cgroup monitoring outside system-wide mode;
 *  - warn when symbol_conf.kptr_restrict is set;
 *  - disable the buildid cache when either no-buildid option is set;
 *  - add the default event when none was selected;
 *  - when a tid target is given and no-inherit was not set explicitly, turn
 *    no_inherit on;
 *  - validate the target (warning) and parse the uid (error);
 *  - create the cpu and thread maps, apply record_opts__config();
 *  - run __cmd_record() and delete the event list.
 * NOTE(review): declarations of err and errbuf, several conditions, the
 * cleanup labels and the return are not visible in this view.
 */
803 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
806 struct record *rec = &record;
809 rec->evlist = perf_evlist__new();
810 if (rec->evlist == NULL)
813 perf_config(perf_record_config, rec);
815 argc = parse_options(argc, argv, record_options, record_usage,
816 PARSE_OPT_STOP_AT_NON_OPTION);
817 if (!argc && target__none(&rec->opts.target))
818 usage_with_options(record_usage, record_options);
820 if (nr_cgroups && !rec->opts.target.system_wide) {
821 ui__error("cgroup monitoring only available in"
822 " system-wide mode\n");
823 usage_with_options(record_usage, record_options);
828 if (symbol_conf.kptr_restrict)
830 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
831 "check /proc/sys/kernel/kptr_restrict.\n\n"
832 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
833 "file is not found in the buildid cache or in the vmlinux path.\n\n"
834 "Samples in kernel modules won't be resolved at all.\n\n"
835 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
836 "even with a suitable vmlinux or kallsyms file.\n\n");
838 if (rec->no_buildid_cache || rec->no_buildid)
839 disable_buildid_cache();
841 if (rec->evlist->nr_entries == 0 &&
842 perf_evlist__add_default(rec->evlist) < 0) {
843 pr_err("Not enough memory for event selector list\n");
844 goto out_symbol_exit;
847 if (rec->opts.target.tid && !rec->opts.no_inherit_set)
848 rec->opts.no_inherit = true;
850 err = target__validate(&rec->opts.target);
852 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
853 ui__warning("%s", errbuf);
856 err = target__parse_uid(&rec->opts.target);
858 int saved_errno = errno;
860 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
861 ui__error("%s", errbuf);
864 goto out_symbol_exit;
868 if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
869 usage_with_options(record_usage, record_options);
871 if (record_opts__config(&rec->opts)) {
873 goto out_symbol_exit;
876 err = __cmd_record(&record, argc, argv);
878 perf_evlist__delete(rec->evlist);