On 20/07/22 04:06, Ian Rogers wrote: > On Mon, Jul 11, 2022 at 2:33 AM Adrian Hunter <adrian.hunter@xxxxxxxxx> wrote: >> >> Inject events from a perf.data file recorded in a virtual machine into >> a perf.data file recorded on the host at the same time. >> >> Only side band events (e.g. mmap, comm, fork, exit etc) and build IDs are >> injected. Additionally, the guest kcore_dir is copied as kcore_dir__ >> appended to the machine PID. >> >> This is non-trivial because: >> o It is not possible to process 2 sessions simultaneously so instead >> events are first written to a temporary file. >> o To avoid conflict, guest sample IDs are replaced with new unused sample >> IDs. >> o Guest event's CPU is changed to be the host CPU because it is more >> useful for reporting and analysis. >> o Sample ID is mapped to machine PID which is recorded with VCPU in the >> id index. This is important to allow guest events to be related to the >> guest machine and VCPU. >> o Timestamps must be converted. >> o Events are inserted to obey finished-round ordering. >> >> The anticipated use-case is: >> - start recording sideband events in a guest machine >> - start recording an AUX area trace on the host which can trace also the >> guest (e.g. Intel PT) >> - run test case on the guest >> - stop recording on the host >> - stop recording on the guest >> - copy the guest perf.data file to the host >> - inject the guest perf.data file sideband events into the host perf.data >> file using perf inject >> - the resulting perf.data file can now be used >> >> Subsequent patches provide Intel PT support for this. 
>> >> Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx> >> --- >> tools/perf/Documentation/perf-inject.txt | 17 + >> tools/perf/builtin-inject.c | 1043 +++++++++++++++++++++- >> 2 files changed, 1059 insertions(+), 1 deletion(-) >> >> diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt >> index 0570a1ccd344..646aa31586ed 100644 >> --- a/tools/perf/Documentation/perf-inject.txt >> +++ b/tools/perf/Documentation/perf-inject.txt >> @@ -85,6 +85,23 @@ include::itrace.txt[] >> without updating it. Currently this option is supported only by >> Intel PT, refer linkperf:perf-intel-pt[1] >> >> +--guest-data=<path>,<pid>[,<time offset>[,<time scale>]]:: >> + Insert events from a perf.data file recorded in a virtual machine at >> + the same time as the input perf.data file was recorded on the host. >> + The Process ID (PID) of the QEMU hypervisor process must be provided, >> + and the time offset and time scale (multiplier) will likely be needed >> + to convert guest time stamps into host time stamps. For example, for >> + x86 the TSC Offset and Multiplier could be provided for a virtual machine >> + using Linux command line option no-kvmclock. >> + Currently only mmap, mmap2, comm, task, context_switch, ksymbol, >> + and text_poke events are inserted, as well as build ID information. >> + The QEMU option -name debug-threads=on is needed so that thread names >> + can be used to determine which thread is running which VCPU. Note >> + libvirt seems to use this by default. >> + When using perf record in the guest, option --sample-identifier >> + should be used, and also --buildid-all and --switch-events may be >> + useful. >> + > > Would other hypervisors based on kvm like gVisor work if they > implemented name-debug-threads? As far as I can tell, gVisor is not a machine-level hypervisor, so the issue does not arise. 
> >> SEE ALSO >> -------- >> linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1], >> diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c >> index c800911f68e7..fd4547bb75f7 100644 >> --- a/tools/perf/builtin-inject.c >> +++ b/tools/perf/builtin-inject.c >> @@ -26,6 +26,7 @@ >> #include "util/thread.h" >> #include "util/namespaces.h" >> #include "util/util.h" >> +#include "util/tsc.h" >> >> #include <internal/lib.h> >> >> @@ -35,8 +36,70 @@ >> >> #include <linux/list.h> >> #include <linux/string.h> >> +#include <linux/zalloc.h> >> +#include <linux/hash.h> >> #include <errno.h> >> #include <signal.h> >> +#include <inttypes.h> >> + >> +struct guest_event { >> + struct perf_sample sample; >> + union perf_event *event; >> + char event_buf[PERF_SAMPLE_MAX_SIZE]; >> +}; >> + >> +struct guest_id { >> + /* hlist_node must be first, see free_hlist() */ >> + struct hlist_node node; >> + u64 id; >> + u64 host_id; >> + u32 vcpu; >> +}; >> + >> +struct guest_tid { >> + /* hlist_node must be first, see free_hlist() */ >> + struct hlist_node node; >> + /* Thread ID of QEMU thread */ >> + u32 tid; >> + u32 vcpu; >> +}; >> + >> +struct guest_vcpu { >> + /* Current host CPU */ >> + u32 cpu; >> + /* Thread ID of QEMU thread */ >> + u32 tid; >> +}; >> + >> +struct guest_session { >> + char *perf_data_file; >> + u32 machine_pid; >> + u64 time_offset; >> + double time_scale; >> + struct perf_tool tool; >> + struct perf_data data; >> + struct perf_session *session; >> + char *tmp_file_name; >> + int tmp_fd; >> + struct perf_tsc_conversion host_tc; >> + struct perf_tsc_conversion guest_tc; >> + bool copy_kcore_dir; >> + bool have_tc; >> + bool fetched; >> + bool ready; >> + u16 dflt_id_hdr_size; >> + u64 dflt_id; >> + u64 highest_id; >> + /* Array of guest_vcpu */ >> + struct guest_vcpu *vcpu; >> + size_t vcpu_cnt; >> + /* Hash table for guest_id */ >> + struct hlist_head heads[PERF_EVLIST__HLIST_SIZE]; >> + /* Hash table for guest_tid */ >> + 
struct hlist_head tids[PERF_EVLIST__HLIST_SIZE]; >> + /* Place to stash next guest event */ >> + struct guest_event ev; >> +}; >> >> struct perf_inject { >> struct perf_tool tool; >> @@ -59,6 +122,7 @@ struct perf_inject { >> struct itrace_synth_opts itrace_synth_opts; >> char event_copy[PERF_SAMPLE_MAX_SIZE]; >> struct perf_file_section secs[HEADER_FEAT_BITS]; >> + struct guest_session guest_session; >> }; >> >> struct event_entry { >> @@ -698,6 +762,841 @@ static int perf_inject__sched_stat(struct perf_tool *tool, >> return perf_event__repipe(tool, event_sw, &sample_sw, machine); >> } >> >> +static struct guest_vcpu *guest_session__vcpu(struct guest_session *gs, u32 vcpu) >> +{ >> + if (realloc_array_as_needed(gs->vcpu, gs->vcpu_cnt, vcpu, NULL)) >> + return NULL; >> + return &gs->vcpu[vcpu]; >> +} >> + >> +static int guest_session__output_bytes(struct guest_session *gs, void *buf, size_t sz) >> +{ >> + ssize_t ret = writen(gs->tmp_fd, buf, sz); >> + >> + return ret < 0 ? ret : 0; >> +} >> + >> +static int guest_session__repipe(struct perf_tool *tool, >> + union perf_event *event, >> + struct perf_sample *sample __maybe_unused, >> + struct machine *machine __maybe_unused) >> +{ >> + struct guest_session *gs = container_of(tool, struct guest_session, tool); >> + >> + return guest_session__output_bytes(gs, event, event->header.size); >> +} >> + >> +static int guest_session__map_tid(struct guest_session *gs, u32 tid, u32 vcpu) >> +{ >> + struct guest_tid *guest_tid = zalloc(sizeof(*guest_tid)); >> + int hash; >> + >> + if (!guest_tid) >> + return -ENOMEM; >> + >> + guest_tid->tid = tid; >> + guest_tid->vcpu = vcpu; >> + hash = hash_32(guest_tid->tid, PERF_EVLIST__HLIST_BITS); >> + hlist_add_head(&guest_tid->node, &gs->tids[hash]); >> + >> + return 0; >> +} >> + >> +static int host_peek_vm_comms_cb(struct perf_session *session __maybe_unused, >> + union perf_event *event, >> + u64 offset __maybe_unused, void *data) >> +{ >> + struct guest_session *gs = data; >> + 
unsigned int vcpu; >> + struct guest_vcpu *guest_vcpu; >> + int ret; >> + >> + if (event->header.type != PERF_RECORD_COMM || >> + event->comm.pid != gs->machine_pid) >> + return 0; >> + >> + /* >> + * QEMU option -name debug-threads=on, causes thread names formatted as >> + * below, although it is not an ABI. Also libvirt seems to use this by >> + * default. Here we rely on it to tell us which thread is which VCPU. >> + */ >> + ret = sscanf(event->comm.comm, "CPU %u/KVM", &vcpu); >> + if (ret <= 0) >> + return ret; >> + pr_debug("Found VCPU: tid %u comm %s vcpu %u\n", >> + event->comm.tid, event->comm.comm, vcpu); >> + if (vcpu > INT_MAX) { >> + pr_err("Invalid VCPU %u\n", vcpu); >> + return -EINVAL; >> + } >> + guest_vcpu = guest_session__vcpu(gs, vcpu); >> + if (!guest_vcpu) >> + return -ENOMEM; >> + if (guest_vcpu->tid && guest_vcpu->tid != event->comm.tid) { >> + pr_err("Fatal error: Two threads found with the same VCPU\n"); >> + return -EINVAL; >> + } >> + guest_vcpu->tid = event->comm.tid; >> + >> + return guest_session__map_tid(gs, event->comm.tid, vcpu); >> +} >> + >> +static int host_peek_vm_comms(struct perf_session *session, struct guest_session *gs) >> +{ >> + return perf_session__peek_events(session, session->header.data_offset, >> + session->header.data_size, >> + host_peek_vm_comms_cb, gs); >> +} >> + >> +static bool evlist__is_id_used(struct evlist *evlist, u64 id) >> +{ >> + return evlist__id2sid(evlist, id); >> +} >> + >> +static u64 guest_session__allocate_new_id(struct guest_session *gs, struct evlist *host_evlist) >> +{ >> + do { >> + gs->highest_id += 1; >> + } while (!gs->highest_id || evlist__is_id_used(host_evlist, gs->highest_id)); >> + >> + return gs->highest_id; >> +} >> + >> +static int guest_session__map_id(struct guest_session *gs, u64 id, u64 host_id, u32 vcpu) >> +{ >> + struct guest_id *guest_id = zalloc(sizeof(*guest_id)); >> + int hash; >> + >> + if (!guest_id) >> + return -ENOMEM; >> + >> + guest_id->id = id; >> + 
guest_id->host_id = host_id; >> + guest_id->vcpu = vcpu; >> + hash = hash_64(guest_id->id, PERF_EVLIST__HLIST_BITS); >> + hlist_add_head(&guest_id->node, &gs->heads[hash]); >> + >> + return 0; >> +} >> + >> +static u64 evlist__find_highest_id(struct evlist *evlist) >> +{ >> + struct evsel *evsel; >> + u64 highest_id = 1; >> + >> + evlist__for_each_entry(evlist, evsel) { >> + u32 j; >> + >> + for (j = 0; j < evsel->core.ids; j++) { >> + u64 id = evsel->core.id[j]; >> + >> + if (id > highest_id) >> + highest_id = id; >> + } >> + } >> + >> + return highest_id; >> +} >> + >> +static int guest_session__map_ids(struct guest_session *gs, struct evlist *host_evlist) >> +{ >> + struct evlist *evlist = gs->session->evlist; >> + struct evsel *evsel; >> + int ret; >> + >> + evlist__for_each_entry(evlist, evsel) { >> + u32 j; >> + >> + for (j = 0; j < evsel->core.ids; j++) { >> + struct perf_sample_id *sid; >> + u64 host_id; >> + u64 id; >> + >> + id = evsel->core.id[j]; >> + sid = evlist__id2sid(evlist, id); >> + if (!sid || sid->cpu.cpu == -1) >> + continue; >> + host_id = guest_session__allocate_new_id(gs, host_evlist); >> + ret = guest_session__map_id(gs, id, host_id, sid->cpu.cpu); >> + if (ret) >> + return ret; >> + } >> + } >> + >> + return 0; >> +} >> + >> +static struct guest_id *guest_session__lookup_id(struct guest_session *gs, u64 id) >> +{ >> + struct hlist_head *head; >> + struct guest_id *guest_id; >> + int hash; >> + >> + hash = hash_64(id, PERF_EVLIST__HLIST_BITS); >> + head = &gs->heads[hash]; >> + >> + hlist_for_each_entry(guest_id, head, node) >> + if (guest_id->id == id) >> + return guest_id; >> + >> + return NULL; >> +} >> + >> +static int process_attr(struct perf_tool *tool, union perf_event *event, >> + struct perf_sample *sample __maybe_unused, >> + struct machine *machine __maybe_unused) >> +{ >> + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); >> + >> + return perf_event__process_attr(tool, event, &inject->session->evlist); 
>> +} >> + >> +static int guest_session__add_attr(struct guest_session *gs, struct evsel *evsel) >> +{ >> + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); >> + struct perf_event_attr attr = evsel->core.attr; >> + u64 *id_array; >> + u32 *vcpu_array; >> + int ret = -ENOMEM; >> + u32 i; >> + >> + id_array = calloc(evsel->core.ids, sizeof(*id_array)); >> + if (!id_array) >> + return -ENOMEM; >> + >> + vcpu_array = calloc(evsel->core.ids, sizeof(*vcpu_array)); >> + if (!vcpu_array) >> + goto out; >> + >> + for (i = 0; i < evsel->core.ids; i++) { >> + u64 id = evsel->core.id[i]; >> + struct guest_id *guest_id = guest_session__lookup_id(gs, id); >> + >> + if (!guest_id) { >> + pr_err("Failed to find guest id %"PRIu64"\n", id); >> + ret = -EINVAL; >> + goto out; >> + } >> + id_array[i] = guest_id->host_id; >> + vcpu_array[i] = guest_id->vcpu; >> + } >> + >> + attr.sample_type |= PERF_SAMPLE_IDENTIFIER; >> + attr.exclude_host = 1; >> + attr.exclude_guest = 0; >> + >> + ret = perf_event__synthesize_attr(&inject->tool, &attr, evsel->core.ids, >> + id_array, process_attr); >> + if (ret) >> + pr_err("Failed to add guest attr.\n"); >> + >> + for (i = 0; i < evsel->core.ids; i++) { >> + struct perf_sample_id *sid; >> + u32 vcpu = vcpu_array[i]; >> + >> + sid = evlist__id2sid(inject->session->evlist, id_array[i]); >> + /* Guest event is per-thread from the host point of view */ >> + sid->cpu.cpu = -1; >> + sid->tid = gs->vcpu[vcpu].tid; >> + sid->machine_pid = gs->machine_pid; >> + sid->vcpu.cpu = vcpu; >> + } >> +out: >> + free(vcpu_array); >> + free(id_array); >> + return ret; >> +} >> + >> +static int guest_session__add_attrs(struct guest_session *gs) >> +{ >> + struct evlist *evlist = gs->session->evlist; >> + struct evsel *evsel; >> + int ret; >> + >> + evlist__for_each_entry(evlist, evsel) { >> + ret = guest_session__add_attr(gs, evsel); >> + if (ret) >> + return ret; >> + } >> + >> + return 0; >> +} >> + >> +static int 
synthesize_id_index(struct perf_inject *inject, size_t new_cnt) >> +{ >> + struct perf_session *session = inject->session; >> + struct evlist *evlist = session->evlist; >> + struct machine *machine = &session->machines.host; >> + size_t from = evlist->core.nr_entries - new_cnt; >> + >> + return __perf_event__synthesize_id_index(&inject->tool, perf_event__repipe, >> + evlist, machine, from); >> +} >> + >> +static struct guest_tid *guest_session__lookup_tid(struct guest_session *gs, u32 tid) >> +{ >> + struct hlist_head *head; >> + struct guest_tid *guest_tid; >> + int hash; >> + >> + hash = hash_32(tid, PERF_EVLIST__HLIST_BITS); >> + head = &gs->tids[hash]; >> + >> + hlist_for_each_entry(guest_tid, head, node) >> + if (guest_tid->tid == tid) >> + return guest_tid; >> + >> + return NULL; >> +} >> + >> +static bool dso__is_in_kernel_space(struct dso *dso) >> +{ >> + if (dso__is_vdso(dso)) >> + return false; >> + >> + return dso__is_kcore(dso) || >> + dso->kernel || >> + is_kernel_module(dso->long_name, PERF_RECORD_MISC_CPUMODE_UNKNOWN); >> +} >> + >> +static u64 evlist__first_id(struct evlist *evlist) >> +{ >> + struct evsel *evsel; >> + >> + evlist__for_each_entry(evlist, evsel) { >> + if (evsel->core.ids) >> + return evsel->core.id[0]; >> + } >> + return 0; >> +} >> + >> +static int process_build_id(struct perf_tool *tool, >> + union perf_event *event, >> + struct perf_sample *sample __maybe_unused, >> + struct machine *machine __maybe_unused) >> +{ >> + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); >> + >> + return perf_event__process_build_id(inject->session, event); >> +} >> + >> +static int synthesize_build_id(struct perf_inject *inject, struct dso *dso, pid_t machine_pid) >> +{ >> + struct machine *machine = perf_session__findnew_machine(inject->session, machine_pid); >> + u8 cpumode = dso__is_in_kernel_space(dso) ? 
>> + PERF_RECORD_MISC_GUEST_KERNEL : >> + PERF_RECORD_MISC_GUEST_USER; >> + >> + if (!machine) >> + return -ENOMEM; >> + >> + dso->hit = 1; >> + >> + return perf_event__synthesize_build_id(&inject->tool, dso, cpumode, >> + process_build_id, machine); >> +} >> + >> +static int guest_session__add_build_ids(struct guest_session *gs) >> +{ >> + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); >> + struct machine *machine = &gs->session->machines.host; >> + struct dso *dso; >> + int ret; >> + >> + /* Build IDs will be put in the Build ID feature section */ >> + perf_header__set_feat(&inject->session->header, HEADER_BUILD_ID); >> + >> + dsos__for_each_with_build_id(dso, &machine->dsos.head) { >> + ret = synthesize_build_id(inject, dso, gs->machine_pid); >> + if (ret) >> + return ret; >> + } >> + >> + return 0; >> +} >> + >> +static int guest_session__ksymbol_event(struct perf_tool *tool, >> + union perf_event *event, >> + struct perf_sample *sample __maybe_unused, >> + struct machine *machine __maybe_unused) >> +{ >> + struct guest_session *gs = container_of(tool, struct guest_session, tool); >> + >> + /* Only support out-of-line i.e. 
no BPF support */ >> + if (event->ksymbol.ksym_type != PERF_RECORD_KSYMBOL_TYPE_OOL) >> + return 0; >> + >> + return guest_session__output_bytes(gs, event, event->header.size); >> +} >> + >> +static int guest_session__start(struct guest_session *gs, const char *name, bool force) >> +{ >> + char tmp_file_name[] = "/tmp/perf-inject-guest_session-XXXXXX"; >> + struct perf_session *session; >> + int ret; >> + >> + /* Only these events will be injected */ >> + gs->tool.mmap = guest_session__repipe; >> + gs->tool.mmap2 = guest_session__repipe; >> + gs->tool.comm = guest_session__repipe; >> + gs->tool.fork = guest_session__repipe; >> + gs->tool.exit = guest_session__repipe; >> + gs->tool.lost = guest_session__repipe; >> + gs->tool.context_switch = guest_session__repipe; >> + gs->tool.ksymbol = guest_session__ksymbol_event; >> + gs->tool.text_poke = guest_session__repipe; >> + /* >> + * Processing a build ID creates a struct dso with that build ID. Later, >> + * all guest dsos are iterated and the build IDs processed into the host >> + * session where they will be output to the Build ID feature section >> + * when the perf.data file header is written. >> + */ >> + gs->tool.build_id = perf_event__process_build_id; >> + /* Process the id index to know what VCPU an ID belongs to */ >> + gs->tool.id_index = perf_event__process_id_index; >> + >> + gs->tool.ordered_events = true; >> + gs->tool.ordering_requires_timestamps = true; >> + >> + gs->data.path = name; >> + gs->data.force = force; >> + gs->data.mode = PERF_DATA_MODE_READ; >> + >> + session = perf_session__new(&gs->data, &gs->tool); >> + if (IS_ERR(session)) >> + return PTR_ERR(session); >> + gs->session = session; >> + >> + /* >> + * Initial events have zero'd ID samples. Get default ID sample size >> + * used for removing them. 
>> + */ >> + gs->dflt_id_hdr_size = session->machines.host.id_hdr_size; >> + /* And default ID for adding back a host-compatible ID sample */ >> + gs->dflt_id = evlist__first_id(session->evlist); >> + if (!gs->dflt_id) { >> + pr_err("Guest data has no sample IDs"); >> + return -EINVAL; >> + } >> + >> + /* Temporary file for guest events */ >> + gs->tmp_file_name = strdup(tmp_file_name); >> + if (!gs->tmp_file_name) >> + return -ENOMEM; >> + gs->tmp_fd = mkstemp(gs->tmp_file_name); >> + if (gs->tmp_fd < 0) >> + return -errno; >> + >> + if (zstd_init(&gs->session->zstd_data, 0) < 0) >> + pr_warning("Guest session decompression initialization failed.\n"); >> + >> + /* >> + * perf does not support processing 2 sessions simultaneously, so output >> + * guest events to a temporary file. >> + */ >> + ret = perf_session__process_events(gs->session); >> + if (ret) >> + return ret; >> + >> + if (lseek(gs->tmp_fd, 0, SEEK_SET)) >> + return -errno; >> + >> + return 0; >> +} >> + >> +/* Free hlist nodes assuming hlist_node is the first member of hlist entries */ >> +static void free_hlist(struct hlist_head *heads, size_t hlist_sz) >> +{ >> + struct hlist_node *pos, *n; >> + size_t i; >> + >> + for (i = 0; i < hlist_sz; ++i) { >> + hlist_for_each_safe(pos, n, &heads[i]) { >> + hlist_del(pos); >> + free(pos); >> + } >> + } >> +} >> + >> +static void guest_session__exit(struct guest_session *gs) >> +{ >> + if (gs->session) { >> + perf_session__delete(gs->session); >> + free_hlist(gs->heads, PERF_EVLIST__HLIST_SIZE); >> + free_hlist(gs->tids, PERF_EVLIST__HLIST_SIZE); >> + } >> + if (gs->tmp_file_name) { >> + if (gs->tmp_fd >= 0) >> + close(gs->tmp_fd); >> + unlink(gs->tmp_file_name); >> + free(gs->tmp_file_name); >> + } >> + free(gs->vcpu); >> + free(gs->perf_data_file); >> +} >> + >> +static void get_tsc_conv(struct perf_tsc_conversion *tc, struct perf_record_time_conv *time_conv) >> +{ >> + tc->time_shift = time_conv->time_shift; >> + tc->time_mult = time_conv->time_mult; >> + 
tc->time_zero = time_conv->time_zero; >> + tc->time_cycles = time_conv->time_cycles; >> + tc->time_mask = time_conv->time_mask; >> + tc->cap_user_time_zero = time_conv->cap_user_time_zero; >> + tc->cap_user_time_short = time_conv->cap_user_time_short; >> +} >> + >> +static void guest_session__get_tc(struct guest_session *gs) >> +{ >> + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); >> + >> + get_tsc_conv(&gs->host_tc, &inject->session->time_conv); >> + get_tsc_conv(&gs->guest_tc, &gs->session->time_conv); >> +} >> + >> +static void guest_session__convert_time(struct guest_session *gs, u64 guest_time, u64 *host_time) >> +{ >> + u64 tsc; >> + >> + if (!guest_time) { >> + *host_time = 0; >> + return; >> + } >> + >> + if (gs->guest_tc.cap_user_time_zero) >> + tsc = perf_time_to_tsc(guest_time, &gs->guest_tc); >> + else >> + tsc = guest_time; >> + >> + /* >> + * This is the correct order of operations for x86 if the TSC Offset and >> + * Multiplier values are used. 
>> + */ >> + tsc -= gs->time_offset; >> + tsc /= gs->time_scale; >> + >> + if (gs->host_tc.cap_user_time_zero) >> + *host_time = tsc_to_perf_time(tsc, &gs->host_tc); >> + else >> + *host_time = tsc; >> +} >> + >> +static int guest_session__fetch(struct guest_session *gs) >> +{ >> + void *buf = gs->ev.event_buf; >> + struct perf_event_header *hdr = buf; >> + size_t hdr_sz = sizeof(*hdr); >> + ssize_t ret; >> + >> + ret = readn(gs->tmp_fd, buf, hdr_sz); >> + if (ret < 0) >> + return ret; >> + >> + if (!ret) { >> + /* Zero size means EOF */ >> + hdr->size = 0; >> + return 0; >> + } >> + >> + buf += hdr_sz; >> + >> + ret = readn(gs->tmp_fd, buf, hdr->size - hdr_sz); >> + if (ret < 0) >> + return ret; >> + >> + gs->ev.event = (union perf_event *)gs->ev.event_buf; >> + gs->ev.sample.time = 0; >> + >> + if (hdr->type >= PERF_RECORD_USER_TYPE_START) { >> + pr_err("Unexpected type fetching guest event"); >> + return 0; >> + } >> + >> + ret = evlist__parse_sample(gs->session->evlist, gs->ev.event, &gs->ev.sample); >> + if (ret) { >> + pr_err("Parse failed fetching guest event"); >> + return ret; >> + } >> + >> + if (!gs->have_tc) { >> + guest_session__get_tc(gs); >> + gs->have_tc = true; >> + } >> + >> + guest_session__convert_time(gs, gs->ev.sample.time, &gs->ev.sample.time); >> + >> + return 0; >> +} >> + >> +static int evlist__append_id_sample(struct evlist *evlist, union perf_event *ev, >> + const struct perf_sample *sample) >> +{ >> + struct evsel *evsel; >> + void *array; >> + int ret; >> + >> + evsel = evlist__id2evsel(evlist, sample->id); >> + array = ev; >> + >> + if (!evsel) { >> + pr_err("No evsel for id %"PRIu64"\n", sample->id); >> + return -EINVAL; >> + } >> + >> + array += ev->header.size; >> + ret = perf_event__synthesize_id_sample(array, evsel->core.attr.sample_type, sample); >> + if (ret < 0) >> + return ret; >> + >> + if (ret & 7) { >> + pr_err("Bad id sample size %d\n", ret); >> + return -EINVAL; >> + } >> + >> + ev->header.size += ret; >> + >> + return 
0; >> +} >> + >> +static int guest_session__inject_events(struct guest_session *gs, u64 timestamp) >> +{ >> + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); >> + int ret; >> + >> + if (!gs->ready) >> + return 0; >> + >> + while (1) { >> + struct perf_sample *sample; >> + struct guest_id *guest_id; >> + union perf_event *ev; >> + u16 id_hdr_size; >> + u8 cpumode; >> + u64 id; >> + >> + if (!gs->fetched) { >> + ret = guest_session__fetch(gs); >> + if (ret) >> + return ret; >> + gs->fetched = true; >> + } >> + >> + ev = gs->ev.event; >> + sample = &gs->ev.sample; >> + >> + if (!ev->header.size) >> + return 0; /* EOF */ >> + >> + if (sample->time > timestamp) >> + return 0; >> + >> + /* Change cpumode to guest */ >> + cpumode = ev->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; >> + if (cpumode & PERF_RECORD_MISC_USER) >> + cpumode = PERF_RECORD_MISC_GUEST_USER; >> + else >> + cpumode = PERF_RECORD_MISC_GUEST_KERNEL; >> + ev->header.misc &= ~PERF_RECORD_MISC_CPUMODE_MASK; >> + ev->header.misc |= cpumode; >> + >> + id = sample->id; >> + if (!id) { >> + id = gs->dflt_id; >> + id_hdr_size = gs->dflt_id_hdr_size; >> + } else { >> + struct evsel *evsel = evlist__id2evsel(gs->session->evlist, id); >> + >> + id_hdr_size = evsel__id_hdr_size(evsel); >> + } >> + >> + if (id_hdr_size & 7) { >> + pr_err("Bad id_hdr_size %u\n", id_hdr_size); >> + return -EINVAL; >> + } >> + >> + if (ev->header.size & 7) { >> + pr_err("Bad event size %u\n", ev->header.size); >> + return -EINVAL; >> + } >> + >> + /* Remove guest id sample */ >> + ev->header.size -= id_hdr_size; >> + >> + if (ev->header.size & 7) { >> + pr_err("Bad raw event size %u\n", ev->header.size); >> + return -EINVAL; >> + } >> + >> + guest_id = guest_session__lookup_id(gs, id); >> + if (!guest_id) { >> + pr_err("Guest event with unknown id %llu\n", >> + (unsigned long long)id); >> + return -EINVAL; >> + } >> + >> + /* Change to host ID to avoid conflicting ID values */ >> + sample->id = 
guest_id->host_id; >> + sample->stream_id = guest_id->host_id; >> + >> + if (sample->cpu != (u32)-1) { >> + if (sample->cpu >= gs->vcpu_cnt) { >> + pr_err("Guest event with unknown VCPU %u\n", >> + sample->cpu); >> + return -EINVAL; >> + } >> + /* Change to host CPU instead of guest VCPU */ >> + sample->cpu = gs->vcpu[sample->cpu].cpu; >> + } >> + >> + /* New id sample with new ID and CPU */ >> + ret = evlist__append_id_sample(inject->session->evlist, ev, sample); >> + if (ret) >> + return ret; >> + >> + if (ev->header.size & 7) { >> + pr_err("Bad new event size %u\n", ev->header.size); >> + return -EINVAL; >> + } >> + >> + gs->fetched = false; >> + >> + ret = output_bytes(inject, ev, ev->header.size); >> + if (ret) >> + return ret; >> + } >> +} >> + >> +static int guest_session__flush_events(struct guest_session *gs) >> +{ >> + return guest_session__inject_events(gs, -1); >> +} >> + >> +static int host__repipe(struct perf_tool *tool, >> + union perf_event *event, >> + struct perf_sample *sample, >> + struct machine *machine) >> +{ >> + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); >> + int ret; >> + >> + ret = guest_session__inject_events(&inject->guest_session, sample->time); >> + if (ret) >> + return ret; >> + >> + return perf_event__repipe(tool, event, sample, machine); >> +} >> + >> +static int host__finished_init(struct perf_session *session, union perf_event *event) >> +{ >> + struct perf_inject *inject = container_of(session->tool, struct perf_inject, tool); >> + struct guest_session *gs = &inject->guest_session; >> + int ret; >> + >> + /* >> + * Peek through host COMM events to find QEMU threads and the VCPU they >> + * are running. >> + */ >> + ret = host_peek_vm_comms(session, gs); >> + if (ret) >> + return ret; >> + >> + if (!gs->vcpu_cnt) { >> + pr_err("No VCPU theads found for pid %u\n", gs->machine_pid); >> + return -EINVAL; >> + } >> + >> + /* >> + * Allocate new (unused) host sample IDs and map them to the guest IDs. 
>> + */ >> + gs->highest_id = evlist__find_highest_id(session->evlist); >> + ret = guest_session__map_ids(gs, session->evlist); >> + if (ret) >> + return ret; >> + >> + ret = guest_session__add_attrs(gs); >> + if (ret) >> + return ret; >> + >> + ret = synthesize_id_index(inject, gs->session->evlist->core.nr_entries); >> + if (ret) { >> + pr_err("Failed to synthesize id_index\n"); >> + return ret; >> + } >> + >> + ret = guest_session__add_build_ids(gs); >> + if (ret) { >> + pr_err("Failed to add guest build IDs\n"); >> + return ret; >> + } >> + >> + gs->ready = true; >> + >> + ret = guest_session__inject_events(gs, 0); >> + if (ret) >> + return ret; >> + >> + return perf_event__repipe_op2_synth(session, event); >> +} >> + >> +/* >> + * Obey finished-round ordering. The FINISHED_ROUND event is first processed >> + * which flushes host events to file up until the last flush time. Then inject >> + * guest events up to the same time. Finally write out the FINISHED_ROUND event >> + * itself. >> + */ >> +static int host__finished_round(struct perf_tool *tool, >> + union perf_event *event, >> + struct ordered_events *oe) >> +{ >> + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); >> + int ret = perf_event__process_finished_round(tool, event, oe); >> + u64 timestamp = ordered_events__last_flush_time(oe); >> + >> + if (ret) >> + return ret; >> + >> + ret = guest_session__inject_events(&inject->guest_session, timestamp); >> + if (ret) >> + return ret; >> + >> + return perf_event__repipe_oe_synth(tool, event, oe); >> +} >> + >> +static int host__context_switch(struct perf_tool *tool, >> + union perf_event *event, >> + struct perf_sample *sample, >> + struct machine *machine) >> +{ >> + struct perf_inject *inject = container_of(tool, struct perf_inject, tool); >> + bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT; >> + struct guest_session *gs = &inject->guest_session; >> + u32 pid = event->context_switch.next_prev_pid; >> + u32 tid = 
event->context_switch.next_prev_tid; >> + struct guest_tid *guest_tid; >> + u32 vcpu; >> + >> + if (out || pid != gs->machine_pid) >> + goto out; >> + >> + guest_tid = guest_session__lookup_tid(gs, tid); >> + if (!guest_tid) >> + goto out; >> + >> + if (sample->cpu == (u32)-1) { >> + pr_err("Switch event does not have CPU\n"); >> + return -EINVAL; >> + } >> + >> + vcpu = guest_tid->vcpu; >> + if (vcpu >= gs->vcpu_cnt) >> + return -EINVAL; >> + >> + /* Guest is switching in, record which CPU the VCPU is now running on */ >> + gs->vcpu[vcpu].cpu = sample->cpu; >> +out: >> + return host__repipe(tool, event, sample, machine); >> +} >> + >> static void sig_handler(int sig __maybe_unused) >> { >> session_done = 1; >> @@ -767,6 +1666,61 @@ static int parse_vm_time_correlation(const struct option *opt, const char *str, >> return inject->itrace_synth_opts.vm_tm_corr_args ? 0 : -ENOMEM; >> } >> >> +static int parse_guest_data(const struct option *opt, const char *str, int unset) >> +{ >> + struct perf_inject *inject = opt->value; >> + struct guest_session *gs = &inject->guest_session; >> + char *tok; >> + char *s; >> + >> + if (unset) >> + return 0; >> + >> + if (!str) >> + goto bad_args; >> + >> + s = strdup(str); >> + if (!s) >> + return -ENOMEM; >> + >> + gs->perf_data_file = strsep(&s, ","); >> + if (!gs->perf_data_file) >> + goto bad_args; >> + >> + gs->copy_kcore_dir = has_kcore_dir(gs->perf_data_file); >> + if (gs->copy_kcore_dir) >> + inject->output.is_dir = true; >> + >> + tok = strsep(&s, ","); >> + if (!tok) >> + goto bad_args; >> + gs->machine_pid = strtoul(tok, NULL, 0); >> + if (!inject->guest_session.machine_pid) >> + goto bad_args; >> + >> + gs->time_scale = 1; >> + >> + tok = strsep(&s, ","); >> + if (!tok) >> + goto out; >> + gs->time_offset = strtoull(tok, NULL, 0); >> + >> + tok = strsep(&s, ","); >> + if (!tok) >> + goto out; >> + gs->time_scale = strtod(tok, NULL); >> + if (!gs->time_scale) >> + goto bad_args; >> +out: >> + return 0; >> + >> +bad_args: 
>> + pr_err("--guest-data option requires guest perf.data file name, " >> + "guest machine PID, and optionally guest timestamp offset, " >> + "and guest timestamp scale factor, separated by commas.\n"); >> + return -1; >> +} >> + >> static int save_section_info_cb(struct perf_file_section *section, >> struct perf_header *ph __maybe_unused, >> int feat, int fd __maybe_unused, void *data) >> @@ -896,6 +1850,22 @@ static int copy_kcore_dir(struct perf_inject *inject) >> return ret; >> } >> >> +static int guest_session__copy_kcore_dir(struct guest_session *gs) >> +{ >> + struct perf_inject *inject = container_of(gs, struct perf_inject, guest_session); >> + char *cmd; >> + int ret; >> + >> + ret = asprintf(&cmd, "cp -r -n %s/kcore_dir %s/kcore_dir__%u >/dev/null 2>&1", >> + gs->perf_data_file, inject->output.path, gs->machine_pid); >> + if (ret < 0) >> + return ret; >> + pr_debug("%s\n", cmd); >> + ret = system(cmd); >> + free(cmd); >> + return ret; >> +} >> + >> static int output_fd(struct perf_inject *inject) >> { >> return inject->in_place_update ? -1 : perf_data__fd(&inject->output); >> @@ -904,6 +1874,7 @@ static int output_fd(struct perf_inject *inject) >> static int __cmd_inject(struct perf_inject *inject) >> { >> int ret = -EINVAL; >> + struct guest_session *gs = &inject->guest_session; >> struct perf_session *session = inject->session; >> int fd = output_fd(inject); >> u64 output_data_offset; >> @@ -968,6 +1939,47 @@ static int __cmd_inject(struct perf_inject *inject) >> output_data_offset = roundup(8192 + session->header.data_offset, 4096); >> if (inject->strip) >> strip_init(inject); >> + } else if (gs->perf_data_file) { >> + char *name = gs->perf_data_file; >> + >> + /* >> + * Not strictly necessary, but keep these events in order wrt >> + * guest events. 
>> + */ >> + inject->tool.mmap = host__repipe; >> + inject->tool.mmap2 = host__repipe; >> + inject->tool.comm = host__repipe; >> + inject->tool.fork = host__repipe; >> + inject->tool.exit = host__repipe; >> + inject->tool.lost = host__repipe; >> + inject->tool.context_switch = host__repipe; >> + inject->tool.ksymbol = host__repipe; >> + inject->tool.text_poke = host__repipe; >> + /* >> + * Once the host session has initialized, set up sample ID >> + * mapping and feed in guest attrs, build IDs and initial >> + * events. >> + */ >> + inject->tool.finished_init = host__finished_init; >> + /* Obey finished round ordering */ >> + inject->tool.finished_round = host__finished_round, >> + /* Keep track of which CPU a VCPU is running on */ >> + inject->tool.context_switch = host__context_switch; >> + /* >> + * Must order events to be able to obey finished round >> + * ordering. >> + */ >> + inject->tool.ordered_events = true; >> + inject->tool.ordering_requires_timestamps = true; >> + /* Set up a separate session to process guest perf.data file */ >> + ret = guest_session__start(gs, name, session->data->force); >> + if (ret) { >> + pr_err("Failed to process %s, error %d\n", name, ret); >> + return ret; >> + } >> + /* Allow space in the header for guest attributes */ >> + output_data_offset += gs->session->header.data_offset; >> + output_data_offset = roundup(output_data_offset, 4096); >> } >> >> if (!inject->itrace_synth_opts.set) >> @@ -980,6 +1992,18 @@ static int __cmd_inject(struct perf_inject *inject) >> if (ret) >> return ret; >> >> + if (gs->session) { >> + /* >> + * Remaining guest events have later timestamps. Flush them >> + * out to file. 
>> + */ >> + ret = guest_session__flush_events(gs); >> + if (ret) { >> + pr_err("Failed to flush guest events\n"); >> + return ret; >> + } >> + } >> + >> if (!inject->is_pipe && !inject->in_place_update) { >> struct inject_fc inj_fc = { >> .fc.copy = feat_copy_cb, >> @@ -1014,8 +2038,17 @@ static int __cmd_inject(struct perf_inject *inject) >> >> if (inject->copy_kcore_dir) { >> ret = copy_kcore_dir(inject); >> - if (ret) >> + if (ret) { >> + pr_err("Failed to copy kcore\n"); >> return ret; >> + } >> + } >> + if (gs->copy_kcore_dir) { >> + ret = guest_session__copy_kcore_dir(gs); >> + if (ret) { >> + pr_err("Failed to copy guest kcore\n"); >> + return ret; >> + } >> } >> } >> >> @@ -1113,6 +2146,12 @@ int cmd_inject(int argc, const char **argv) >> OPT_CALLBACK_OPTARG(0, "vm-time-correlation", &inject, NULL, "opts", >> "correlate time between VM guests and the host", >> parse_vm_time_correlation), >> + OPT_CALLBACK_OPTARG(0, "guest-data", &inject, NULL, "opts", >> + "inject events from a guest perf.data file", >> + parse_guest_data), >> + OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", >> + "guest mount directory under which every guest os" >> + " instance has a subdir"), > > Should guestmount also be in the man page? Also should it have a > hyphen like guest-data? Sent a patch to update the man page. It is "guestmount" in other tools so we should stick with that. > > Thanks, > Ian > >> OPT_END() >> }; >> const char * const inject_usage[] = { >> @@ -1243,6 +2282,8 @@ int cmd_inject(int argc, const char **argv) >> >> ret = __cmd_inject(&inject); >> >> + guest_session__exit(&inject.guest_session); >> + >> out_delete: >> zstd_fini(&(inject.session->zstd_data)); >> perf_session__delete(inject.session); >> -- >> 2.25.1 >>