On Tue, 2021-05-11 at 11:16 +0000, Stamatis, Ilias wrote: > On Mon, 2021-05-10 at 16:59 +0300, Maxim Levitsky wrote: > > On Thu, 2021-05-06 at 10:32 +0000, ilstam@xxxxxxxxxxx wrote: > > > From: Ilias Stamatis <ilstam@xxxxxxxxxx> > > > > > > Test that nested TSC scaling works as expected with both L1 and L2 > > > scaled. > > > > > > Signed-off-by: Ilias Stamatis <ilstam@xxxxxxxxxx> > > > --- > > > tools/testing/selftests/kvm/.gitignore | 1 + > > > tools/testing/selftests/kvm/Makefile | 1 + > > > .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 209 ++++++++++++++++++ > > > 3 files changed, 211 insertions(+) > > > create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c > > > > > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore > > > index bd83158e0e0b..cc02022f9951 100644 > > > --- a/tools/testing/selftests/kvm/.gitignore > > > +++ b/tools/testing/selftests/kvm/.gitignore > > > @@ -29,6 +29,7 @@ > > > /x86_64/vmx_preemption_timer_test > > > /x86_64/vmx_set_nested_state_test > > > /x86_64/vmx_tsc_adjust_test > > > +/x86_64/vmx_nested_tsc_scaling_test > > > /x86_64/xapic_ipi_test > > > /x86_64/xen_shinfo_test > > > /x86_64/xen_vmcall_test > > > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile > > > index e439d027939d..1078240b1313 100644 > > > --- a/tools/testing/selftests/kvm/Makefile > > > +++ b/tools/testing/selftests/kvm/Makefile > > > @@ -60,6 +60,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test > > > TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test > > > TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test > > > TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test > > > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test > > > TEST_GEN_PROGS_x86_64 += x86_64/xapic_ipi_test > > > TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test > > > TEST_GEN_PROGS_x86_64 += x86_64/debug_regs > > > diff --git 
a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c > > > new file mode 100644 > > > index 000000000000..b05f5151ecbe > > > --- /dev/null > > > +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c > > > @@ -0,0 +1,209 @@ > > > +// SPDX-License-Identifier: GPL-2.0-only > > > +/* > > > + * vmx_nested_tsc_scaling_test > > > + * > > > + * Copyright (C) 2021 Amazon.com, Inc. or its affiliates. > > > + * > > > + * This test case verifies that nested TSC scaling behaves as expected when > > > + * both L1 and L2 are scaled using different ratios. For this test we scale > > > + * L1 down and scale L2 up. > > > + */ > > > + > > > + > > > +#include "kvm_util.h" > > > +#include "vmx.h" > > > +#include "kselftest.h" > > > + > > > + > > > +#define VCPU_ID 0 > > > + > > > +/* L1 is scaled down by this factor */ > > > +#define L1_SCALE_FACTOR 2ULL > > > +/* L2 is scaled up (from L1's perspective) by this factor */ > > > +#define L2_SCALE_FACTOR 4ULL > > > > For fun, I might have randomized these factors as well. > > So L2_SCALE_FACTOR (or rather TSC_MULTIPLIER_L2 that depends on it) is > referenced from within l1_guest_code(). If we change this to a static variable > we won't be able to access it from there. How could this be done? I also had this thought after I wrote the reply. I don't have much experience yet with KVM selftests so this might indeed not be possible. > > For the L1 factor it's easy as we only use it in main(). > > > > + > > > +#define TSC_OFFSET_L2 (1UL << 32) > > > +#define TSC_MULTIPLIER_L2 (L2_SCALE_FACTOR << 48) > > > > It would be fun to use a negative offset here (also randomly). > > Do you mean a random offset that is always negative or a random offset that > sometimes is positive and sometimes is negative? Yep, to test the special case for negative numbers.
> > > > + > > > +#define L2_GUEST_STACK_SIZE 64 > > > + > > > +enum { USLEEP, UCHECK_L1, UCHECK_L2 }; > > > +#define GUEST_SLEEP(sec) ucall(UCALL_SYNC, 2, USLEEP, sec) > > > +#define GUEST_CHECK(level, freq) ucall(UCALL_SYNC, 2, level, freq) > > > + > > > + > > > +/* > > > + * This function checks whether the "actual" TSC frequency of a guest matches > > > + * its expected frequency. In order to account for delays in taking the TSC > > > + * measurements, a difference of 1% between the actual and the expected value > > > + * is tolerated. > > > + */ > > > +static void compare_tsc_freq(uint64_t actual, uint64_t expected) > > > +{ > > > + uint64_t tolerance, thresh_low, thresh_high; > > > + > > > + tolerance = expected / 100; > > > + thresh_low = expected - tolerance; > > > + thresh_high = expected + tolerance; > > > + > > > + TEST_ASSERT(thresh_low < actual, > > > + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 > > > + " but it actually is %"PRIu64, > > > + thresh_low, thresh_high, actual); > > > + TEST_ASSERT(thresh_high > actual, > > > + "TSC freq is expected to be between %"PRIu64" and %"PRIu64 > > > + " but it actually is %"PRIu64, > > > + thresh_low, thresh_high, actual); > > > +} > > > + > > > +static void check_tsc_freq(int level) > > > +{ > > > + uint64_t tsc_start, tsc_end, tsc_freq; > > > + > > > + /* > > > + * Reading the TSC twice with about a second's difference should give > > > + * us an approximation of the TSC frequency from the guest's > > > + * perspective. Now, this won't be completely accurate, but it should > > > + * be good enough for the purposes of this test. > > > + */ > > > > It would be nice to know if the host has stable TSC (you can obtain this via > > KVM_GET_CLOCK, the KVM_CLOCK_TSC_STABLE flag). > > > > And if not stable skip the test, to avoid false positives. > > (Yes I have a laptop I just bought that has an unstable TSC....) 
> > > > Hmm, this is a vm ioctl but I noticed that one of its vcpus needs to have been > run at least once otherwise it won't return KVM_CLOCK_TSC_STABLE in the flags. > > So... Yes, now I remember that this thing relies on the TSC sync logic, master clock thing, etc... Oh well... To be honest, we really need the kernel to export the information it knows about the TSC, because it is useful to many users and not limited to virtualization. Currently, other than KVM's KVM_GET_TSC_KHZ, there is no clean way to know even the TSC frequency, let alone whether the kernel considers the TSC to be stable, AFAIK. Another more or less reliable (but hacky) way to know if the TSC is stable is to check whether the kernel is using it as its clocksource (/sys/devices/system/clocksource/clocksource0/current_clocksource = tsc). Oh well... Best regards, Maxim Levitsky > > > > + tsc_start = rdmsr(MSR_IA32_TSC); > > > + GUEST_SLEEP(1); > > > + tsc_end = rdmsr(MSR_IA32_TSC); > > > + > > > + tsc_freq = tsc_end - tsc_start; > > > + > > > + GUEST_CHECK(level, tsc_freq); > > > +} > > > + > > > +static void l2_guest_code(void) > > > +{ > > > + check_tsc_freq(UCHECK_L2); > > > + > > > + /* exit to L1 */ > > > + __asm__ __volatile__("vmcall"); > > > +} > > > + > > > +static void l1_guest_code(struct vmx_pages *vmx_pages) > > > +{ > > > + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; > > > + uint32_t control; > > > + > > > + /* check that L1's frequency looks alright before launching L2 */ > > > + check_tsc_freq(UCHECK_L1); > > > + > > > + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); > > > + GUEST_ASSERT(load_vmcs(vmx_pages)); > > > + > > > + /* prepare the VMCS for L2 execution */ > > > + prepare_vmcs(vmx_pages, l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); > > > + > > > + /* enable TSC offsetting and TSC scaling for L2 */ > > > + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); > > > + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; > > > + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); > > > + > > >
+ control = vmreadz(SECONDARY_VM_EXEC_CONTROL); > > > + control |= SECONDARY_EXEC_TSC_SCALING; > > > + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); > > > + > > > + vmwrite(TSC_OFFSET, TSC_OFFSET_L2); > > > + vmwrite(TSC_MULTIPLIER, TSC_MULTIPLIER_L2); > > > + vmwrite(TSC_MULTIPLIER_HIGH, TSC_MULTIPLIER_L2 >> 32); > > > + > > > + /* launch L2 */ > > > + GUEST_ASSERT(!vmlaunch()); > > > + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); > > > + > > > + /* check that L1's frequency still looks good */ > > > + check_tsc_freq(UCHECK_L1); > > > + > > > + GUEST_DONE(); > > > +} > > > + > > > +static void tsc_scaling_check_supported(void) > > > +{ > > > + if (!kvm_check_cap(KVM_CAP_TSC_CONTROL)) { > > > + print_skip("TSC scaling not supported by the HW"); > > > + exit(KSFT_SKIP); > > > + } > > > +} > > > + > > > +int main(int argc, char *argv[]) > > > +{ > > > + struct kvm_vm *vm; > > > + vm_vaddr_t vmx_pages_gva; > > > + > > > + uint64_t tsc_start, tsc_end; > > > + uint64_t tsc_khz; > > > + uint64_t l0_tsc_freq = 0; > > > + uint64_t l1_tsc_freq = 0; > > > + uint64_t l2_tsc_freq = 0; > > > + > > > + nested_vmx_check_supported(); > > > + tsc_scaling_check_supported(); > > I can't add the check here > > > > + > > > + tsc_start = rdtsc(); > > > + sleep(1); > > > + tsc_end = rdtsc(); > > > + > > > + l0_tsc_freq = tsc_end - tsc_start; > > > + printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); > > > + > > > + vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code); > > > + vcpu_alloc_vmx(vm, &vmx_pages_gva); > > > + vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva); > > nor here > > > > + > > > + tsc_khz = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_TSC_KHZ, NULL); > > > + TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); > > > + > > > + /* scale down L1's TSC frequency */ > > > + vcpu_ioctl(vm, VCPU_ID, KVM_SET_TSC_KHZ, > > > + (void *) (tsc_khz / L1_SCALE_FACTOR)); > > > + > > > + for (;;) { > > > + volatile struct kvm_run *run = vcpu_state(vm, 
VCPU_ID); > > > + struct ucall uc; > > > + > > > + vcpu_run(vm, VCPU_ID); > > > + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, > > > + "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", > > > + run->exit_reason, > > > + exit_reason_str(run->exit_reason)); > > should I add it here? > > > > + > > > + switch (get_ucall(vm, VCPU_ID, &uc)) { > > > + case UCALL_ABORT: > > > + TEST_FAIL("%s", (const char *) uc.args[0]); > > > + case UCALL_SYNC: > > > + switch (uc.args[0]) { > > > + case USLEEP: > > > + sleep(uc.args[1]); > > > + break; > > > + case UCHECK_L1: > > > + l1_tsc_freq = uc.args[1]; > > > + printf("L1's TSC frequency is around: %"PRIu64 > > > + "\n", l1_tsc_freq); > > > + > > > + compare_tsc_freq(l1_tsc_freq, > > > + l0_tsc_freq / L1_SCALE_FACTOR); > > > + break; > > > + case UCHECK_L2: > > > + l2_tsc_freq = uc.args[1]; > > > + printf("L2's TSC frequency is around: %"PRIu64 > > > + "\n", l2_tsc_freq); > > > + > > > + compare_tsc_freq(l2_tsc_freq, > > > + l1_tsc_freq * L2_SCALE_FACTOR); > > > + break; > > > + } > > > + break; > > > + case UCALL_DONE: > > > + goto done; > > > + default: > > > + TEST_FAIL("Unknown ucall %lu", uc.cmd); > > > + } > > > + } > > > + > > > +done: > > > + kvm_vm_free(vm); > > > + return 0; > > > +} > > > > Overall looks OK to me. > > > > I can't test it, since the most recent Intel laptop I have (i7-7600U) > > still lacks TSC scaling (or did Intel cripple this feature on clients like what > > they did with APICv ?) > > > > Best regards, > > Maxim Levitsky > > > > > >