Thanks, David. On Fri, May 20, 2022 at 2:08 PM David Matlack <dmatlack@xxxxxxxxxx> wrote: > > On Fri, May 20, 2022 at 10:37 AM Jue Wang <juew@xxxxxxxxxx> wrote: > > > > This patch add a self test that verifies user space can inject > > UnCorrectable No Action required (UCNA) memory errors to the guest. > > > > Signed-off-by: Jue Wang <juew@xxxxxxxxxx> > > --- > > tools/testing/selftests/kvm/.gitignore | 1 + > > tools/testing/selftests/kvm/Makefile | 1 + > > .../selftests/kvm/include/x86_64/apic.h | 1 + > > .../selftests/kvm/include/x86_64/mce.h | 25 ++ > > .../selftests/kvm/include/x86_64/processor.h | 1 + > > .../selftests/kvm/lib/x86_64/processor.c | 2 +- > > .../kvm/x86_64/ucna_injection_test.c | 287 ++++++++++++++++++ > > 7 files changed, 317 insertions(+), 1 deletion(-) > > create mode 100644 tools/testing/selftests/kvm/include/x86_64/mce.h > > create mode 100644 tools/testing/selftests/kvm/x86_64/ucna_injection_test.c > > > > diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore > > index 0b0e4402bba6..a8eb9d3f082c 100644 > > --- a/tools/testing/selftests/kvm/.gitignore > > +++ b/tools/testing/selftests/kvm/.gitignore > > @@ -37,6 +37,7 @@ > > /x86_64/tsc_scaling_sync > > /x86_64/sync_regs_test > > /x86_64/tsc_msrs_test > > +/x86_64/ucna_injection_test > > /x86_64/userspace_io_test > > /x86_64/userspace_msr_exit_test > > /x86_64/vmx_apic_access_test > > diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile > > index 681b173aa87c..8d9858b54b98 100644 > > --- a/tools/testing/selftests/kvm/Makefile > > +++ b/tools/testing/selftests/kvm/Makefile > > @@ -66,6 +66,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test > > TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test > > TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test > > TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test > > +TEST_GEN_PROGS_x86_64 += x86_64/ucna_injection_test > > TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test > > TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test > > TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test > > diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h > > index ac88557dcc9a..bed316fdecd5 100644 > > --- a/tools/testing/selftests/kvm/include/x86_64/apic.h > > +++ b/tools/testing/selftests/kvm/include/x86_64/apic.h > > @@ -35,6 +35,7 @@ > > #define APIC_SPIV_APIC_ENABLED (1 << 8) > > #define APIC_IRR 0x200 > > #define APIC_ICR 0x300 > > +#define APIC_LVTCMCI 0x2f0 > > #define APIC_DEST_SELF 0x40000 > > #define APIC_DEST_ALLINC 0x80000 > > #define APIC_DEST_ALLBUT 0xC0000 > > diff --git a/tools/testing/selftests/kvm/include/x86_64/mce.h b/tools/testing/selftests/kvm/include/x86_64/mce.h > > new file mode 100644 > > index 000000000000..6119321f3f5d > > --- /dev/null > > +++ b/tools/testing/selftests/kvm/include/x86_64/mce.h > > @@ -0,0 +1,25 @@ > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > +/* > > + * tools/testing/selftests/kvm/include/x86_64/mce.h > > + * > > + * Copyright (C) 2022, Google LLC. > > + */ > > + > > +#ifndef SELFTEST_KVM_MCE_H > > +#define SELFTEST_KVM_MCE_H > > + > > +#define MCG_CTL_P BIT_ULL(8) /* MCG_CTL register available */ > > +#define MCG_SER_P BIT_ULL(24) /* MCA recovery/new status bits */ > > +#define MCG_LMCE_P BIT_ULL(27) /* Local machine check supported */ > > +#define MCG_CMCI_P BIT_ULL(10) /* CMCI supported */ > > +#define KVM_MAX_MCE_BANKS 32 > > +#define MCG_CAP_BANKS_MASK 0xff /* Bit 0-7 of the MCG_CAP register are #banks */ > > +#define MCI_STATUS_VAL (1ULL << 63) /* valid error */ > > +#define MCI_STATUS_UC (1ULL << 61) /* uncorrected error */ > > +#define MCI_STATUS_EN (1ULL << 60) /* error enabled */ > > +#define MCI_STATUS_MISCV (1ULL << 59) /* misc error reg. valid */ > > +#define MCI_STATUS_ADDRV (1ULL << 58) /* addr reg. valid */ > > +#define MCM_ADDR_PHYS 2 /* physical address */ > > +#define MCI_CTL2_CMCI_EN BIT_ULL(30) > > + > > +#endif /* SELFTEST_KVM_MCE_H */ > > diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h > > index d0d51adec76e..fa316c3b7725 100644 > > --- a/tools/testing/selftests/kvm/include/x86_64/processor.h > > +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h > > @@ -481,6 +481,7 @@ struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void); > > void vcpu_set_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); > > struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpuid); > > void vm_xsave_req_perm(int bit); > > +void vcpu_setup(struct kvm_vm *vm, int vcpuid); > > > > enum x86_page_size { > > X86_PAGE_SIZE_4K = 0, > > diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c > > index 33ea5e9955d9..bb1ef665cd10 100644 > > --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c > > +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c > > @@ -580,7 +580,7 @@ static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp, > > kvm_seg_fill_gdt_64bit(vm, segp); > > } > > > > -static void vcpu_setup(struct kvm_vm *vm, int vcpuid) > > +void vcpu_setup(struct kvm_vm *vm, int vcpuid) > > { > > struct kvm_sregs sregs; > > > > diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c > > new file mode 100644 > > index 000000000000..104a9f116b23 > > --- /dev/null > > +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c > > @@ -0,0 +1,287 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * ucna_injection_test > > + * > > + * Copyright (C) 2022, Google LLC. > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. > > + * > > + * Test that user space can inject UnCorrectable No Action required (UCNA) > > + * memory errors to the guest. > > + * > > + * The test starts one vCPU with the MCG_CMCI_P enabled. It verifies that > > + * proper UCNA errors can be injected to a vCPU with MCG_CMCI_P and > > + * corresponding per-bank control register (MCI_CTL2) bit enabled. > > + * The test also checks that the UCNA errors get recorded in the > > + * Machine Check bank registers no matter the error signal interrupts get > > + * delivered into the guest or not. > > + * > > + */ > > + > > +#define _GNU_SOURCE /* for program_invocation_short_name */ > > +#include <pthread.h> > > +#include <inttypes.h> > > +#include <string.h> > > +#include <time.h> > > + > > +#include "kvm_util_base.h" > > +#include "kvm_util.h" > > +#include "mce.h" > > +#include "processor.h" > > +#include "test_util.h" > > +#include "apic.h" > > + > > +#define UCNA_VCPU_ID 0 > > +#define SYNC_FIRST_UCNA 9 > > +#define SYNC_SECOND_UCNA 10 > > +#define FIRST_UCNA_ADDR 0xdeadbeef > > +#define SECOND_UCNA_ADDR 0xcafeb0ba > > + > > +/* > > + * Vector for the CMCI interrupt. > > + * Value is arbitrary. Any value in 0x20-0xFF should work: > > + * https://wiki.osdev.org/Interrupt_Vector_Table > > + */ > > +#define CMCI_VECTOR 0xa9 > > + > > +#define UCNA_BANK 0x7 // IMC0 bank > > + > > +/* > > + * Record states about the injected UCNA. > > + * The variables started with the 'i_' prefixes are recorded in interrupt > > + * handler. Variables without the 'i_' prefixes are recorded in guest main > > + * execution thread. > > + */ > > +static volatile uint64_t i_ucna_rcvd; > > +static volatile uint64_t i_ucna_addr; > > +static volatile uint64_t ucna_addr; > > +static volatile uint64_t ucna_addr2; > > + > > +struct thread_params { > > + struct kvm_vm *vm; > > + uint32_t vcpu_id; > > +}; > > + > > +static void verify_apic_base_addr(void) > > +{ > > + uint64_t msr = rdmsr(MSR_IA32_APICBASE); > > + uint64_t base = GET_APIC_BASE(msr); > > + > > + GUEST_ASSERT(base == APIC_DEFAULT_GPA); > > +} > > + > > +static void guest_code(void) > > +{ > > + uint64_t ctl2; > > + verify_apic_base_addr(); > > + xapic_enable(); > > + > > + /* Sets up the interrupt vector and enables per-bank CMCI sigaling. */ > > + xapic_write_reg(APIC_LVTCMCI, CMCI_VECTOR | APIC_DM_FIXED); > > + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); > > + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 | MCI_CTL2_CMCI_EN); > > + > > + /* Enables interrupt in guest. */ > > + asm volatile("sti"); > > + > > + /* Let user space inject the first UCNA */ > > + GUEST_SYNC(SYNC_FIRST_UCNA); > > + > > + ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); > > + > > + /* Disables the per-bank CMCI signaling. */ > > + ctl2 = rdmsr(MSR_IA32_MCx_CTL2(UCNA_BANK)); > > + wrmsr(MSR_IA32_MCx_CTL2(UCNA_BANK), ctl2 & ~MCI_CTL2_CMCI_EN); > > + > > + /* Let the user space inject the second UCNA */ > > + GUEST_SYNC(SYNC_SECOND_UCNA); > > + > > + ucna_addr2 = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); > > + GUEST_DONE(); > > +} > > + > > +static void guest_cmci_handler(struct ex_regs *regs) > > +{ > > + i_ucna_rcvd++; > > + i_ucna_addr = rdmsr(MSR_IA32_MCx_ADDR(UCNA_BANK)); > > + xapic_write_reg(APIC_EOI, 0); > > +} > > + > > +static void inject_ucna(struct kvm_vm *vm, uint32_t vcpu_id, uint64_t addr) { > > + /* > > + * A UCNA error is indicated with VAL=1, UC=1, PCC=0, S=0 and AR=0 in > > + * the IA32_MCi_STATUS register. > > + * MSCOD=1 (BIT[16] - MscodDataRdErr). > > + * MCACOD=0x0090 (Memory controller error format, channel 0) > > + */ > > + uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | > > + MCI_STATUS_MISCV | MCI_STATUS_ADDRV | 0x10090; > > + struct kvm_x86_mce mce = {}; > > + mce.status = status; > > + mce.mcg_status = 0; > > + /* > > + * MCM_ADDR_PHYS indicates the reported address is a physical address. > > + * Lowest 6 bits is the recoverable address LSB, i.e., the injected MCE > > + * is at 4KB granularity. > > + */ > > + mce.misc = (MCM_ADDR_PHYS << 6) | 0xc; > > + mce.addr = addr; > > + mce.bank = UCNA_BANK; > > + > > + TEST_ASSERT(_vcpu_ioctl(vm, vcpu_id, KVM_X86_SET_MCE, &mce) != -1, > > + "Inject UCNA"); > > +} > > + > > +static void *vcpu_thread(void *arg) > > +{ > > + struct thread_params *params = (struct thread_params *)arg; > > + struct ucall uc; > > + int old; > > + int r; > > + unsigned int exit_reason; > > + > > + r = pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, &old); > > + TEST_ASSERT(r == 0, > > + "pthread_setcanceltype failed on vcpu_id=%u with errno=%d", > > + params->vcpu_id, r); > > + > > + fprintf(stderr, "vCPU thread running vCPU %u\n", params->vcpu_id); > > + vcpu_run(params->vm, params->vcpu_id); > > + > > + exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason; > > + TEST_ASSERT(exit_reason == KVM_EXIT_IO, > > + "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", > > + params->vcpu_id, exit_reason, exit_reason_str(exit_reason)); > > + TEST_ASSERT(get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_SYNC, > > + "Expect UCALL_SYNC\n"); > > + TEST_ASSERT(uc.args[1] == SYNC_FIRST_UCNA, "Injecting first UCNA."); > > + > > + fprintf(stderr, "Injecting first UCNA at %#x.\n", FIRST_UCNA_ADDR); > > + > > + inject_ucna(params->vm, params->vcpu_id, FIRST_UCNA_ADDR); > > + vcpu_run(params->vm, params->vcpu_id); > > + > > + exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason; > > + TEST_ASSERT(exit_reason == KVM_EXIT_IO, > > + "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", > > + params->vcpu_id, exit_reason, exit_reason_str(exit_reason)); > > + TEST_ASSERT(get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_SYNC, > > + "Expect UCALL_SYNC\n"); > > + TEST_ASSERT(uc.args[1] == SYNC_SECOND_UCNA, "Injecting second UCNA."); > > + > > + fprintf(stderr, "Injecting second UCNA at %#x.\n", SECOND_UCNA_ADDR); > > + > > + inject_ucna(params->vm, params->vcpu_id, SECOND_UCNA_ADDR); > > + vcpu_run(params->vm, params->vcpu_id); > > + > > + exit_reason = vcpu_state(params->vm, params->vcpu_id)->exit_reason; > > + TEST_ASSERT(exit_reason == KVM_EXIT_IO, > > + "vCPU %u exited with unexpected exit reason %u-%s, expected KVM_EXIT_IO", > > + params->vcpu_id, exit_reason, exit_reason_str(exit_reason)); > > + if (get_ucall(params->vm, params->vcpu_id, &uc) == UCALL_ABORT) { > > + TEST_ASSERT(false, > > + "vCPU %u exited with error: %s.\n", > > + params->vcpu_id, (const char *)uc.args[0]); > > + } > > + > > + return NULL; > > +} > > + > > +static void setup_mce_cap(struct kvm_vm *vm, uint32_t vcpuid) > > +{ > > + uint64_t supported_mcg_caps = 0; > > + uint64_t mcg_caps = MCG_CMCI_P | MCG_CTL_P | MCG_SER_P | MCG_LMCE_P > > + | KVM_MAX_MCE_BANKS; > > + > > + TEST_ASSERT(_kvm_ioctl(vm, KVM_X86_GET_MCE_CAP_SUPPORTED, > > + &supported_mcg_caps) != -1, > > + "KVM_GET_MCE_CAP_SUPPORTED"); > > + fprintf(stderr, "KVM supported MCG_CAP: %#lx\n", supported_mcg_caps); > > + > > + TEST_ASSERT(supported_mcg_caps & MCG_CMCI_P, "MCG_CMCI_P is not supported"); > > Instead of failing the test on older kernels, mark it as skipped. > > if (!(supported_mcg_caps & MCG_CMCI_P)) { > print_skip(" MCG_CMCI_P is not supported"); > exit(KSFT_SKIP); > } > Thanks, updated. > > + > > + mcg_caps &= supported_mcg_caps | MCG_CAP_BANKS_MASK; > > + TEST_ASSERT(_vcpu_ioctl(vm, vcpuid, KVM_X86_SETUP_MCE, &mcg_caps) != -1, > > + "KVM_X86_SETUP_MCE"); > > +} > > + > > +static void create_vcpu_with_mce_cap(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) > > +{ > > + struct kvm_mp_state mp_state; > > + struct kvm_regs regs; > > + vm_vaddr_t stack_vaddr; > > + stack_vaddr = vm_vaddr_alloc(vm, DEFAULT_STACK_PGS * getpagesize(), > > + DEFAULT_GUEST_STACK_VADDR_MIN); > > + > > + vm_vcpu_add(vm, vcpuid); > > + setup_mce_cap(vm, vcpuid); > > Is some ordering requirement between KVM_X86_SETUP_MCE and other vCPU > ioctls? I'm wondering why this function has to duplicate so much of > the vCPU setup code. Yes, KVM_X86_SETUP_MCE sets up the per vCPU mcg_cap and other MCE related registers. This has to be done before kvm_lapic_reset or kvm_vcpu_after_set_cpuid because otherwise the kvm_apic_set_version call will effectively set a value in APIC_LVR register that the number of supported APIC_LVT* vectors is 5 instead of 6 (including CMCI). In practice, this is how we would configure the per-vCPU KVM_X86_SETUP_MCE in some other user space VMM impl (and the way it has been done). > > + > > + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); > > + vcpu_setup(vm, vcpuid); > > + > > + /* Setup guest general purpose registers */ > > + vcpu_regs_get(vm, vcpuid, ®s); > > + regs.rflags = regs.rflags | 0x2; > > + regs.rsp = stack_vaddr + (DEFAULT_STACK_PGS * getpagesize()); > > + regs.rip = (unsigned long) guest_code; > > + vcpu_regs_set(vm, vcpuid, ®s); > > + > > + /* Setup the MP state */ > > + mp_state.mp_state = 0; > > + vcpu_set_mp_state(vm, vcpuid, &mp_state); > > +} > > + > > +int main(int argc, char *argv[]) > > +{ > > + int r; > > + void *retval; > > + int run_secs = 3; > > + pthread_t threads[2]; > > + struct thread_params params[2]; > > + struct kvm_vm *vm; > > + uint64_t *p_i_ucna_rcvd, *p_i_ucna_addr; > > + uint64_t *p_ucna_addr, *p_ucna_addr2; > > + > > + kvm_check_cap(KVM_CAP_MCE); > > + > > + vm = vm_create_without_vcpus(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES); > > + > > + create_vcpu_with_mce_cap(vm, UCNA_VCPU_ID, guest_code); > > + > > + params[0].vm = vm; > > + > > + vm_init_descriptor_tables(vm); > > + vcpu_init_descriptor_tables(vm, UCNA_VCPU_ID); > > + vm_install_exception_handler(vm, CMCI_VECTOR, guest_cmci_handler); > > + > > + virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA); > > + > > + p_i_ucna_rcvd = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_rcvd); > > + p_i_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&i_ucna_addr); > > + p_ucna_addr = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr); > > + p_ucna_addr2 = (uint64_t *)addr_gva2hva(vm, (uint64_t)&ucna_addr2); > > + > > + /* Start vCPU thread and wait for it to execute first HLT. */ > > Stale comment from xapic_ipi_test.c > > > + params[0].vcpu_id = UCNA_VCPU_ID; > > + r = pthread_create(&threads[0], NULL, vcpu_thread, ¶ms[0]); > > + TEST_ASSERT(r == 0, > > + "pthread_create vcpu thread failed errno=%d", errno); > > + fprintf(stderr, "vCPU thread started\n"); > > + > > + sleep(run_secs); > > + > > + r = pthread_join(threads[0], &retval); > > + TEST_ASSERT(r == 0, > > + "pthread_join on vcpu_id=%d failed with errno=%d", > > + UCNA_VCPU_ID, r); > > Creating a here is unnecessary, e.g. you could just replace all this > code with a call to vcpu_thread(). Good point, updated. > > > + > > + fprintf(stderr, > > + "Test successful after running for %d seconds.\n" > > + "UCNA CMCI interrupts received: %ld\n" > > + "Last UCNA address received via CMCI: %lx\n" > > + "First UCNA address in vCPU thread: %lx\n" > > + "Second UCNA address in vCPU thread: %lx\n", > > + run_secs, *p_i_ucna_rcvd, *p_i_ucna_addr, *p_ucna_addr, *p_ucna_addr2); > > Instead of printing out all these fields, can you make assertions > about them? e.g. Assert that the expected number of interrupts were > received, etc. Updated. > > > + > > + kvm_vm_free(vm); > > +} > > -- > > 2.36.1.124.g0e6072fb45-goog > >