The formula, used in the Linux kernel for guest TSC clock is: guest_tsc = tsc-offset + (host_tsc * tsc-scaling-ratio) >> tsc-scaling-ratio-frac-bits The KVM hypervisor exports these parameters in the debugfs, if clock scaling is supported: /sys/kernel/debug/kvm: tsc-offset tsc-scaling-ratio tsc-scaling-ratio-frac-bits Current trace-cmd implementation does not take into account fraction bits, when calculating guest clock offset in case of a clock scaling. The TRACECMD_OPTION_TIME_SHIFT option is extended with per-cpu array with fraction bits. KVM and PTP algorithms are extended to get fraction: - in PTP, there are fixed scaling=1 and fraction=0; - KVM gets these from KVM debugfs, if available. The formula from the kernel is implemented in trace-cmd library, when calculating timestamps of the guest events. This change is backward compatible with trace-cmd 2.9, where the TRACECMD_OPTION_TIME_SHIFT option is introduced. Signed-off-by: Tzvetomir Stoyanov (VMware) <tz.stoyanov@xxxxxxxxx> --- .../include/private/trace-cmd-private.h | 4 +- lib/trace-cmd/include/trace-tsync-local.h | 7 ++- lib/trace-cmd/trace-timesync-kvm.c | 56 ++++++++++++++----- lib/trace-cmd/trace-timesync-ptp.c | 4 +- lib/trace-cmd/trace-timesync.c | 55 ++++++++++++++---- 5 files changed, 96 insertions(+), 30 deletions(-) diff --git a/lib/trace-cmd/include/private/trace-cmd-private.h b/lib/trace-cmd/include/private/trace-cmd-private.h index c58b09e9..f40babef 100644 --- a/lib/trace-cmd/include/private/trace-cmd-private.h +++ b/lib/trace-cmd/include/private/trace-cmd-private.h @@ -454,8 +454,8 @@ tracecmd_tsync_with_guest(unsigned long long trace_id, int loop_interval, int guest_cpus, const char *proto_name, const char *clock); int tracecmd_tsync_with_guest_stop(struct tracecmd_time_sync *tsync); int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu, - int *count, long long **ts, - long long **offsets, long long **scalings); + int *count, long long **ts, + long long **offsets, long long **scalings, long long **frac); int tracecmd_tsync_get_session_params(struct tracecmd_time_sync *tsync, char **selected_proto, unsigned int *tsync_port); diff --git a/lib/trace-cmd/include/trace-tsync-local.h b/lib/trace-cmd/include/trace-tsync-local.h index 7e56d187..b357662c 100644 --- a/lib/trace-cmd/include/trace-tsync-local.h +++ b/lib/trace-cmd/include/trace-tsync-local.h @@ -27,14 +27,15 @@ struct tracecmd_time_sync { struct clock_sync_offsets { /* Arrays with calculated time offsets at given time */ int sync_size; /* Allocated size of sync_ts, - * sync_offsets and sync_scalings + * sync_offsets, sync_scalings and sync_frac */ int sync_count; /* Number of elements in sync_ts, - * sync_offsets and sync_scalings + * sync_offsets, sync_scalings and sync_frac */ long long *sync_ts; long long *sync_offsets; long long *sync_scalings; + long long *sync_frac; }; struct clock_sync_context { @@ -60,7 +61,7 @@ int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int role int (*init)(struct tracecmd_time_sync *), int (*free)(struct tracecmd_time_sync *), int (*calc)(struct tracecmd_time_sync *, - long long *, long long *, + long long *, long long *, long long*, long long *, unsigned int)); int tracecmd_tsync_proto_unregister(char *proto_name); diff --git a/lib/trace-cmd/trace-timesync-kvm.c b/lib/trace-cmd/trace-timesync-kvm.c index 6f76fe1f..8d531922 100644 --- a/lib/trace-cmd/trace-timesync-kvm.c +++ b/lib/trace-cmd/trace-timesync-kvm.c @@ -19,6 +19,7 @@ #define KVM_DEBUG_FS "/sys/kernel/debug/kvm" #define KVM_DEBUG_OFFSET_FILE "tsc-offset" #define KVM_DEBUG_SCALING_FILE "tsc-scaling-ratio" +#define KVM_DEBUG_FRACTION_FILE "tsc-scaling-ratio-frac-bits" #define KVM_DEBUG_VCPU_DIR "vcpu" /* default KVM scaling values, taken from the Linux kernel */ @@ -37,6 +38,7 @@ struct kvm_clock_sync { int vcpu_count; char **vcpu_offsets; char **vcpu_scalings; + char **vcpu_frac; int marker_fd; struct tep_handle *tep; int raw_id; @@ -47,6 +49,7 @@ struct kvm_clock_offset_msg { s64 ts; s64 offset; s64 scaling; + s64 frac; }; static int read_ll_from_file(char *file, long long *res) @@ -72,21 +75,27 @@ static int read_ll_from_file(char *file, long long *res) static bool kvm_scaling_check_vm_cpu(char *vname, char *cpu) { - long long scaling; + long long scaling, frac; + bool has_scaling = false; + bool has_frac = false; char *path; int ret; if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_SCALING_FILE) < 0) - return true; + return false; ret = read_ll_from_file(path, &scaling); free(path); + if (!ret) + has_scaling = true; + + if (asprintf(&path, "%s/%s/%s", vname, cpu, KVM_DEBUG_FRACTION_FILE) < 0) + return false; + ret = read_ll_from_file(path, &frac); + free(path); + if (!ret) + has_frac = true; - /* - * If there is a scaling, different from the default - - * return false, not supported. - */ - if (!ret && - scaling != KVM_SCALING_AMD_DEFAULT && scaling != KVM_SCALING_INTEL_DEFAULT) + if (has_scaling != has_frac) return false; return true; @@ -173,6 +182,11 @@ static int kvm_open_vcpu_dir(struct kvm_clock_sync *kvm, int cpu, char *dir_str) dir_str, entry->d_name); kvm->vcpu_scalings[cpu] = strdup(path); } + if (!strcmp(entry->d_name, KVM_DEBUG_FRACTION_FILE)) { + snprintf(path, sizeof(path), "%s/%s", + dir_str, entry->d_name); + kvm->vcpu_frac[cpu] = strdup(path); + } } } if (!kvm->vcpu_offsets[cpu]) @@ -187,6 +201,8 @@ error: kvm->vcpu_offsets[cpu] = NULL; free(kvm->vcpu_scalings[cpu]); kvm->vcpu_scalings[cpu] = NULL; + free(kvm->vcpu_frac[cpu]); + kvm->vcpu_frac[cpu] = NULL; return -1; } @@ -252,7 +268,8 @@ static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync, kvm->vcpu_count = tsync->vcpu_count; kvm->vcpu_offsets = calloc(kvm->vcpu_count, sizeof(char *)); kvm->vcpu_scalings = calloc(kvm->vcpu_count, sizeof(char *)); - if (!kvm->vcpu_offsets || !kvm->vcpu_scalings) + kvm->vcpu_frac = calloc(kvm->vcpu_count, sizeof(char *)); + if (!kvm->vcpu_offsets || !kvm->vcpu_scalings || !kvm->vcpu_frac) goto error; if (kvm_open_debug_files(kvm, tsync->guest_pid) < 0) goto error; @@ -261,6 +278,7 @@ static int kvm_clock_sync_init_host(struct tracecmd_time_sync *tsync, error: free(kvm->vcpu_offsets); free(kvm->vcpu_scalings); + free(kvm->vcpu_frac); return -1; } @@ -351,6 +369,8 @@ static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync) kvm->vcpu_offsets[i] = NULL; free(kvm->vcpu_scalings[i]); kvm->vcpu_scalings[i] = NULL; + free(kvm->vcpu_frac[i]); + kvm->vcpu_frac[i] = NULL; } if (kvm->tep) tep_free(kvm->tep); @@ -362,7 +382,7 @@ static int kvm_clock_sync_free(struct tracecmd_time_sync *tsync) } static int kvm_clock_host(struct tracecmd_time_sync *tsync, - long long *offset, long long *scaling, + long long *offset, long long *scaling, long long *frac, long long *timestamp, unsigned int cpu) { char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH]; @@ -372,6 +392,7 @@ static int kvm_clock_host(struct tracecmd_time_sync *tsync, long long kvm_scaling = 1; unsigned int sync_msg; long long kvm_offset; + long long kvm_frac = 0; unsigned int size; char *msg; int ret; @@ -386,12 +407,16 @@ static int kvm_clock_host(struct tracecmd_time_sync *tsync, ret = read_ll_from_file(kvm->vcpu_offsets[cpu], &kvm_offset); if (ret < 0) return -1; + if (kvm->vcpu_scalings && kvm->vcpu_scalings[cpu]) { read_ll_from_file(kvm->vcpu_scalings[cpu], &kvm_scaling); if (kvm_scaling == KVM_SCALING_AMD_DEFAULT || kvm_scaling == KVM_SCALING_INTEL_DEFAULT) kvm_scaling = 1; } + + if (kvm->vcpu_frac && kvm->vcpu_frac[cpu]) + ret = read_ll_from_file(kvm->vcpu_frac[cpu], &kvm_frac); msg = (char *)&packet; size = sizeof(packet); ret = tracecmd_msg_recv_time_sync(tsync->msg_handle, @@ -403,6 +428,7 @@ static int kvm_clock_host(struct tracecmd_time_sync *tsync, packet.offset = -kvm_offset; packet.scaling = kvm_scaling; + packet.frac = kvm_frac; ret = tracecmd_msg_send_time_sync(tsync->msg_handle, KVM_NAME, KVM_SYNC_PKT_RESPONSE, sizeof(packet), (char *)&packet); @@ -411,6 +437,7 @@ static int kvm_clock_host(struct tracecmd_time_sync *tsync, *scaling = packet.scaling; *offset = packet.offset; + *frac = kvm_frac; *timestamp = packet.ts; return 0; @@ -442,10 +469,10 @@ static int kvm_marker_find(struct tep_event *event, struct tep_record *record, return 0; } - static int kvm_clock_guest(struct tracecmd_time_sync *tsync, long long *offset, long long *scaling, + long long *frac, long long *timestamp) { char sync_proto[TRACECMD_TSYNC_PNAME_LENGTH]; @@ -486,12 +513,13 @@ static int kvm_clock_guest(struct tracecmd_time_sync *tsync, *scaling = packet.scaling; *offset = packet.offset; + *frac = packet.frac; *timestamp = packet.ts; return 0; } static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync, - long long *offset, long long *scaling, + long long *offset, long long *scaling, long long *frac, long long *timestamp, unsigned int cpu) { struct clock_sync_context *clock_context; @@ -503,9 +531,9 @@ static int kvm_clock_sync_calc(struct tracecmd_time_sync *tsync, clock_context = (struct clock_sync_context *)tsync->context; if (clock_context->is_guest) - ret = kvm_clock_guest(tsync, offset, scaling, timestamp); + ret = kvm_clock_guest(tsync, offset, scaling, frac, timestamp); else - ret = kvm_clock_host(tsync, offset, scaling, timestamp, cpu); + ret = kvm_clock_host(tsync, offset, scaling, frac, timestamp, cpu); return ret; } diff --git a/lib/trace-cmd/trace-timesync-ptp.c b/lib/trace-cmd/trace-timesync-ptp.c index b05f1cd0..70242ee3 100644 --- a/lib/trace-cmd/trace-timesync-ptp.c +++ b/lib/trace-cmd/trace-timesync-ptp.c @@ -663,7 +663,7 @@ static int ptp_clock_server(struct tracecmd_time_sync *tsync, } static int ptp_clock_sync_calc(struct tracecmd_time_sync *tsync, - long long *offset, long long *scaling, + long long *offset, long long *scaling, long long *frac, long long *timestamp, unsigned int cpu) { struct clock_sync_context *clock_context; @@ -689,6 +689,8 @@ static int ptp_clock_sync_calc(struct tracecmd_time_sync *tsync, if (scaling) *scaling = 1; + if (frac) + *frac = 0; if (clock_context->is_server) ret = ptp_clock_server(tsync, offset, timestamp); else diff --git a/lib/trace-cmd/trace-timesync.c b/lib/trace-cmd/trace-timesync.c index 19ca19d7..f8ec15a3 100644 --- a/lib/trace-cmd/trace-timesync.c +++ b/lib/trace-cmd/trace-timesync.c @@ -36,7 +36,7 @@ struct tsync_proto { int (*clock_sync_init)(struct tracecmd_time_sync *clock_context); int (*clock_sync_free)(struct tracecmd_time_sync *clock_context); int (*clock_sync_calc)(struct tracecmd_time_sync *clock_context, - long long *offset, long long *scaling, + long long *offset, long long *scaling, long long *frac, long long *timestamp, unsigned int cpu); }; @@ -76,7 +76,7 @@ int tracecmd_tsync_proto_register(const char *proto_name, int accuracy, int role int (*init)(struct tracecmd_time_sync *), int (*free)(struct tracecmd_time_sync *), int (*calc)(struct tracecmd_time_sync *, - long long *, long long *, + long long *, long long *, long long *, long long *, unsigned int)) { struct tsync_proto *proto = NULL; @@ -137,12 +137,13 @@ bool __hidden tsync_proto_is_supported(const char *proto_name) * @ts: Array of size @count containing timestamps of callculated offsets * @offsets: array of size @count, containing offsets for each timestamp * @scalings: array of size @count, containing scaling ratios for each timestamp + * @frac: array of size @count, containing fraction bits for each timestamp * * Retuns -1 in case of an error, or 0 otherwise */ int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu, int *count, long long **ts, - long long **offsets, long long **scalings) + long long **offsets, long long **scalings, long long **frac) { struct clock_sync_context *tsync_context; @@ -159,6 +160,8 @@ int tracecmd_tsync_get_offsets(struct tracecmd_time_sync *tsync, int cpu, *offsets = tsync_context->offsets[cpu].sync_offsets; if (scalings) *scalings = tsync_context->offsets[cpu].sync_scalings; + if (frac) + *frac = tsync_context->offsets[cpu].sync_frac; return 0; } @@ -564,9 +567,11 @@ void tracecmd_tsync_free(struct tracecmd_time_sync *tsync) free(tsync_context->offsets[i].sync_ts); free(tsync_context->offsets[i].sync_offsets); free(tsync_context->offsets[i].sync_scalings); + free(tsync_context->offsets[i].sync_frac); tsync_context->offsets[i].sync_ts = NULL; tsync_context->offsets[i].sync_offsets = NULL; tsync_context->offsets[i].sync_scalings = NULL; + tsync_context->offsets[i].sync_frac = NULL; tsync_context->offsets[i].sync_count = 0; tsync_context->offsets[i].sync_size = 0; } @@ -643,10 +648,11 @@ static int tsync_send(struct tracecmd_time_sync *tsync, long long timestamp = 0; long long scaling = 0; long long offset = 0; + long long frac = 0; int ret; old_set = pin_to_cpu(cpu); - ret = proto->clock_sync_calc(tsync, &offset, &scaling, ×tamp, cpu); + ret = proto->clock_sync_calc(tsync, &offset, &scaling, &frac, ×tamp, cpu); if (old_set) restore_pin_to_cpu(old_set); @@ -685,10 +691,11 @@ static void tsync_with_host(struct tracecmd_time_sync *tsync) } static int record_sync_sample(struct clock_sync_offsets *offsets, int array_step, - long long offset, long long scaling, long long ts) + long long offset, long long scaling, long long frac, long long ts) { long long *sync_scalings = NULL; long long *sync_offsets = NULL; + long long *sync_frac = NULL; long long *sync_ts = NULL; if (offsets->sync_count >= offsets->sync_size) { @@ -698,22 +705,27 @@ static int record_sync_sample(struct clock_sync_offsets *offsets, int array_step (offsets->sync_size + array_step) * sizeof(long long)); sync_scalings = realloc(offsets->sync_scalings, (offsets->sync_size + array_step) * sizeof(long long)); + sync_frac = realloc(offsets->sync_frac, + (offsets->sync_size + array_step) * sizeof(long long)); - if (!sync_ts || !sync_offsets || !sync_scalings) { + if (!sync_ts || !sync_offsets || !sync_scalings || !sync_frac) { free(sync_ts); free(sync_offsets); free(sync_scalings); + free(sync_frac); return -1; } offsets->sync_size += array_step; offsets->sync_ts = sync_ts; offsets->sync_offsets = sync_offsets; offsets->sync_scalings = sync_scalings; + offsets->sync_frac = sync_frac; } offsets->sync_ts[offsets->sync_count] = ts; offsets->sync_offsets[offsets->sync_count] = offset; offsets->sync_scalings[offsets->sync_count] = scaling; + offsets->sync_frac[offsets->sync_count] = frac; offsets->sync_count++; return 0; @@ -726,9 +738,10 @@ static int tsync_get_sample(struct tracecmd_time_sync *tsync, unsigned int cpu, long long timestamp = 0; long long scaling = 0; long long offset = 0; + long long frac = 0; int ret; - ret = proto->clock_sync_calc(tsync, &offset, &scaling, ×tamp, cpu); + ret = proto->clock_sync_calc(tsync, &offset, &scaling, &frac, ×tamp, cpu); if (ret) { tracecmd_warning("Failed to synchronize timestamps with guest"); return -1; @@ -739,7 +752,7 @@ static int tsync_get_sample(struct tracecmd_time_sync *tsync, unsigned int cpu, if (!clock || cpu >= clock->cpu_count || !clock->offsets) return -1; return record_sync_sample(&clock->offsets[cpu], array_step, - offset, scaling, timestamp); + offset, scaling, frac, timestamp); } #define TIMER_SEC_NANO 1000000000LL @@ -928,6 +941,7 @@ int tracecmd_write_guest_time_shift(struct tracecmd_output *handle, unsigned int flags; long long *scalings = NULL; long long *offsets = NULL; + long long *frac = NULL; long long *ts = NULL; int vcount; int count; @@ -936,7 +950,7 @@ int tracecmd_write_guest_time_shift(struct tracecmd_output *handle, if (!tsync->vcpu_count) return -1; - vcount = 3 + (4 * tsync->vcpu_count); + vcount = 3 + (5 * tsync->vcpu_count); vector = calloc(vcount, sizeof(struct iovec)); if (!vector) return -1; @@ -955,7 +969,7 @@ int tracecmd_write_guest_time_shift(struct tracecmd_output *handle, if (j >= vcount) break; ret = tracecmd_tsync_get_offsets(tsync, i, &count, - &ts, &offsets, &scalings); + &ts, &offsets, &scalings, NULL); if (ret < 0 || !count || !ts || !offsets || !scalings) break; vector[j].iov_len = 4; @@ -971,6 +985,27 @@ int tracecmd_write_guest_time_shift(struct tracecmd_output *handle, ret = -1; goto out; } + /* + * Writing fraction bits into the option is implemented in a separate loop for + * backward compatibility. In the trace-cmd 2.9 release, this option has only offset + * and scaling. That legacy code must work with the new extended option. + * + */ + for (i = 0; i < tsync->vcpu_count; i++) { + if (j >= vcount) + break; + ret = tracecmd_tsync_get_offsets(tsync, i, NULL, + NULL, NULL, NULL, &frac); + if (ret < 0) + break; + vector[j].iov_len = 8 * count; + vector[j++].iov_base = frac; + } + if (i < tsync->vcpu_count) { + ret = -1; + goto out; + } + tracecmd_add_option_v(handle, TRACECMD_OPTION_TIME_SHIFT, vector, vcount); #ifdef TSYNC_DEBUG if (count > 1) -- 2.31.1