From: Jim Mattson <jmattson@xxxxxxxxxx> Test if posted interrupt delivery race with migration. Add a selftest to demonstrate a race condition between migration and posted interrupt for a nested VM. The consequence of this race condition causes the loss of a posted interrupt for a nested vCPU after migration and triggers a warning for unpatched kernel. The selftest demonstrates that if a L2 vCPU is in halted state before migration, then after migration, it is not able to receive a posted interrupt from another vCPU within the same VM. The fundamental problem is deeply buried in the kernel logic where vcpu_block() will directly check vmcs12 related mappings before having a valid vmcs12 ready. Because of that, it fails to process the posted interrupt and triggers the warning in vmx_guest_apic_has_interrupt() static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { ... if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || !nested_cpu_has_vid(get_vmcs12(vcpu)) || WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) <= HERE return false; ... } Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx> Co-developed-by: Mingwei Zhang <mizhang@xxxxxxxxxx> Signed-off-by: Mingwei Zhang <mizhang@xxxxxxxxxx> --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/vmx_migrate_pi_pending.c | 291 ++++++++++++++++++ 3 files changed, 293 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_migrate_pi_pending.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index d625a3f83780..749b2be5b23c 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -51,6 +51,7 @@ /x86_64/vmx_exception_with_invalid_guest_state /x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_msrs_test +/x86_64/vmx_migrate_pi_pending /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test /x86_64/vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 4c122f1b1737..5e830c65c068 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -110,6 +110,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86_64 += x86_64/vmx_migrate_pi_pending TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_migrate_pi_pending.c b/tools/testing/selftests/kvm/x86_64/vmx_migrate_pi_pending.c new file mode 100644 index 000000000000..6f1a9f284754 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_migrate_pi_pending.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vmx_migrate_pi_pending + * + * Copyright (C) 2022, Google, LLC. + * + * Deliver a nested posted interrupt between migration and the first + * KVM_RUN on the target. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#include <string.h> +#include <sys/ioctl.h> +#include <linux/bitmap.h> +#include <pthread.h> +#include <signal.h> + +#include "kselftest.h" + +#define VCPU_ID0 0 +#define VCPU_ID1 1 +#define PI_ON_BIT 256 +#define PI_NV 0x42 +#define L2_INTR 0x71 + +enum { + PORT_L0_EXIT = 0x2000, +}; + +/* The virtual machine object. */ +struct vmx_pages *vmx; + +static struct kvm_vm *vm; +static struct kvm_vcpu *vcpu0; +static struct kvm_vcpu *vcpu1; +bool vcpu0_can_run = true; +bool vcpu0_running; +bool pi_executed; +pthread_t pthread_cpu0; +pthread_t pthread_cpu1; + +static void vcpu0_ipi_handler(struct ex_regs *regs) +{ + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); + asm volatile("vmcall"); +} + +static void l2_vcpu0_guest_code(void) +{ + asm volatile("cli"); + asm volatile("sti; nop; hlt"); +} + +static void l1_vcpu0_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_vcpu0_guest_stack[L2_GUEST_STACK_SIZE]; + uint32_t control; + + x2apic_enable(); + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + /* Prepare the VMCS for L2 execution. */ + prepare_vmcs(vmx_pages, l2_vcpu0_guest_code, + &l2_vcpu0_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(PIN_BASED_VM_EXEC_CONTROL); + control |= (PIN_BASED_EXT_INTR_MASK | + PIN_BASED_POSTED_INTR); + vmwrite(PIN_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= (CPU_BASED_TPR_SHADOW | + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS); + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + control = vmreadz(SECONDARY_VM_EXEC_CONTROL); + control |= (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + vmwrite(SECONDARY_VM_EXEC_CONTROL, control); + control = vmreadz(VM_EXIT_CONTROLS); + control |= VM_EXIT_ACK_INTR_ON_EXIT; + vmwrite(VM_EXIT_CONTROLS, control); + vmwrite(VIRTUAL_APIC_PAGE_ADDR, vmx_pages->virtual_apic_gpa); + vmwrite(POSTED_INTR_DESC_ADDR, vmx_pages->posted_intr_desc_gpa); + vmwrite(POSTED_INTR_NV, PI_NV); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + GUEST_ASSERT(!test_bit(PI_ON_BIT, (void *)vmx_pages->posted_intr_desc)); + GUEST_DONE(); +} + +static void post_intr(u8 vector, void *pi_desc) +{ + set_bit(vector, pi_desc); + set_bit(PI_ON_BIT, pi_desc); +} + +static void l1_vcpu1_guest_code(void *vcpu0_pi_desc) +{ + post_intr(L2_INTR, vcpu0_pi_desc); + x2apic_enable(); + x2apic_write_reg(APIC_ICR, ((u64)VCPU_ID0 << 32) | + APIC_DEST_PHYSICAL | APIC_DM_FIXED | PI_NV); + GUEST_DONE(); +} + +static void save_restore_vm(struct kvm_vm *vm) +{ + struct kvm_regs regs1 = {}, regs2 = {}; + struct kvm_x86_state *state; + + state = vcpu_save_state(vcpu0); + vcpu_regs_get(vcpu0, ®s1); + + kvm_vm_release(vm); + + /* Restore state in a new VM. */ + vcpu0 = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu0, state); + kvm_x86_state_cleanup(state); + + vcpu_regs_get(vcpu0, ®s2); + TEST_ASSERT(!memcmp(®s1, ®s2, sizeof(regs2)), + "vcpu0: Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx", + (ulong) regs2.rdi, (ulong) regs2.rsi); +} + +void *create_and_run_vcpu1(void *arg) +{ + struct ucall uc; + struct kvm_run *run; + struct kvm_mp_state vcpu0_mp_state; + + pthread_cpu1 = pthread_self(); + + /* Keep trying to kick out vcpu0 until it is in halted state. */ + for (;;) { + WRITE_ONCE(vcpu0_can_run, true); + sleep(1); + WRITE_ONCE(vcpu0_can_run, false); + pthread_kill(pthread_cpu0, SIGUSR1); + printf("vcpu1: Sent SIGUSR1 to vcpu0\n"); + + while (READ_ONCE(vcpu0_running)) + ; + + vcpu_mp_state_get(vcpu0, &vcpu0_mp_state); + if (vcpu0_mp_state.mp_state == KVM_MP_STATE_HALTED) + break; + } + + printf("vcpu1: Kicked out vcpu0 and ensure vcpu0 is halted\n"); + + /* Use save_restore_vm() to simulate a VM migration. */ + save_restore_vm(vm); + + printf("vcpu1: Finished save and restore vm.\n"); + vcpu1 = vm_vcpu_add(vm, VCPU_ID1, l1_vcpu1_guest_code); + vcpu_args_set(vcpu1, 1, vmx->posted_intr_desc); + + /* Start an L1 in vcpu1 and send a posted interrupt to halted L2 in vcpu0. */ + for (;;) { + run = vcpu1->run; + vcpu_run(vcpu1); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "vcpu1: Got exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu1, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_DONE: + printf("vcpu1: Successfully send a posted interrupt to vcpu0\n"); + goto done; + default: + TEST_FAIL("vcpu1: Unknown ucall %lu", uc.cmd); + } + } + +done: + /* + * Allow vcpu0 resume execution from L0 userspace and check if the + * posted interrupt get executed. + */ + WRITE_ONCE(vcpu0_can_run, true); + sleep(1); + TEST_ASSERT(READ_ONCE(pi_executed), + "vcpu0 did not execute the posted interrupt.\n"); + + return NULL; +} + +void sig_handler(int signum, siginfo_t *info, void *context) +{ + TEST_ASSERT(pthread_self() == pthread_cpu0, + "Incorrect receiver of the signal, expect pthread_cpu0: " + "%lu, but get: %lu\n", pthread_cpu0, pthread_self()); + printf("vcpu0: Execute sighandler for signal: %d\n", signum); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva; + struct sigaction sig_action; + struct sigaction old_action; + + memset(&sig_action, 0, sizeof(sig_action)); + sig_action.sa_sigaction = sig_handler; + sig_action.sa_flags = SA_RESTART | SA_SIGINFO; + sigemptyset(&sig_action.sa_mask); + sigaction(SIGUSR1, &sig_action, &old_action); + + pthread_cpu0 = pthread_self(); + printf("vcpu0: Finish setup signal handler for SIGUSR1\n"); + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + + vm = vm_create_with_one_vcpu(&vcpu0, (void *)l1_vcpu0_guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpu0); + vm_install_exception_handler(vm, L2_INTR, vcpu0_ipi_handler); + + /* Allocate VMX pages and shared descriptors (vmx_pages). */ + vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva); + prepare_virtual_apic(vmx, vm); + prepare_posted_intr_desc(vmx, vm); + vcpu_args_set(vcpu0, 1, vmx_pages_gva); + + pthread_create(&pthread_cpu1, NULL, create_and_run_vcpu1, NULL); + + for (;;) { + struct kvm_run *run = vcpu0->run; + struct ucall uc; + int rc; + + while (!READ_ONCE(vcpu0_can_run)) + ; + + WRITE_ONCE(vcpu0_running, true); + + rc = __vcpu_run(vcpu0); + + vcpu0->run->immediate_exit = 0; + + /* + * When vCPU is kicked out by a signal, ensure a consistent vCPU + * state to prepare for migration before setting the + * vcpu_running flag to false. + */ + if (rc == -1 && run->exit_reason == KVM_EXIT_INTR) { + vcpu_run_complete_io(vcpu0); + + WRITE_ONCE(vcpu0_running, false); + + continue; + } + + WRITE_ONCE(vcpu0_running, false); + + if (run->io.port == PORT_L0_EXIT) { + printf("vcpu0: Executed the posted interrupt\n"); + WRITE_ONCE(pi_executed, true); + continue; + } + + switch (get_ucall(vcpu0, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s", (const char *)uc.args[0]); + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("vcpu0: Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} -- 2.37.2.672.g94769d06f0-goog