pKVM needs to handle vmexits from the host OS once the host has been
deprivileged into a VM. Some instructions, such as CPUID and XSETBV,
cause a vmexit unconditionally; for these, the handlers simply carry
out the operation the host VM requested.

Although the MSR bitmap is cleared at this point, the host Linux may
still access an unsupported MSR (e.g. an MSR that only exists on AMD
platforms), which causes a vmexit. pKVM simply ignores such MSR writes
and returns 0 for such MSR reads.
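As an illustration (a sketch, not part of this patch): MSR_K7_HWCR is
an AMD-only MSR that also lies outside the ranges covered by the MSR
bitmap, so accessing it from the deprivileged host always exits to
pKVM, which applies the policy above:

	u64 val;

	/*
	 * This RDMSR would #GP on bare metal since the MSR does not
	 * exist on Intel CPUs; under pKVM it causes a vmexit instead,
	 * and handle_read_msr() returns 0 in RAX:RDX.
	 */
	rdmsrl(MSR_K7_HWCR, val);	/* val reads back as 0 */
	wrmsrl(MSR_K7_HWCR, 0x1);	/* write is silently dropped */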
For MOV-to-CR vmexits, pKVM only takes care of CR4.VMXE: it allows the
host VM to change the VMXE bit by writing the host-provided value
directly into CR4_READ_SHADOW.
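This relies on standard VMX CR virtualization; a minimal sketch of the
semantics assumed here (the guest/host mask is presumed to be set up
during VMCS initialization earlier in the series, and guest_cr4 is a
placeholder for the value the host VM tried to load):

	/*
	 * With VMXE set in the guest/host mask the bit is host-owned:
	 * a host-VM write that toggles CR4.VMXE causes a CR-access
	 * vmexit instead of changing hardware CR4.
	 */
	vmcs_writel(CR4_GUEST_HOST_MASK, X86_CR4_VMXE);

	/*
	 * In the vmexit handler, reflecting the written value into the
	 * read shadow makes a later "mov %cr4, <reg>" in the host VM
	 * read back what it wrote, while the hardware VMXE bit stays
	 * under pKVM's control.
	 */
	vmcs_writel(CR4_READ_SHADOW, guest_cr4);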
Define a pkvm_main() function that runs the vmexit handling loop,
dispatching to a handler based on the vmexit reasons above, with
__pkvm_vmx_vcpu_run() at the tail of the loop to trigger
VMLAUNCH/VMRESUME.
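The resulting contract between pkvm_main() and the assembly helper,
sketched (handle_vmexit() is a hypothetical stand-in for the switch on
the exit reason in the real code):

	int launch = 1;		/* first VM-Enter must use VMLAUNCH */

	for (;;) {
		/* returns 0 on VM-Exit, 1 on VM-Fail */
		if (__pkvm_vmx_vcpu_run(vcpu->arch.regs, launch))
			break;	/* VM_INSTRUCTION_ERROR has the cause */

		handle_vmexit(vcpu);	/* hypothetical dispatch helper */
		launch = 0;	/* every later entry uses VMRESUME */
	}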
The new pKVM files that support the hypervisor runtime on Intel
platforms are placed under arch/x86/kvm/vmx/pkvm/hyp.

Signed-off-by: Chuanxiao Dong <chuanxiao.dong@xxxxxxxxx>
Signed-off-by: Jason Chen CJ <jason.cj.chen@xxxxxxxxx>
---
 arch/x86/kvm/vmx/pkvm/Makefile       |   1 +
 arch/x86/kvm/vmx/pkvm/hyp/Makefile   |   8 ++
 arch/x86/kvm/vmx/pkvm/hyp/vmexit.c   | 154 ++++++++++++++++++++++
 arch/x86/kvm/vmx/pkvm/hyp/vmexit.h   |  11 ++
 arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S  | 186 +++++++++++++++++++++++++++
 arch/x86/kvm/vmx/pkvm/include/pkvm.h |   2 +
 arch/x86/kvm/vmx/pkvm/pkvm_host.c    |   3 +-
 7 files changed, 364 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/pkvm/Makefile b/arch/x86/kvm/vmx/pkvm/Makefile
index 1795d5f9b4b0..ed0629baf449 100644
--- a/arch/x86/kvm/vmx/pkvm/Makefile
+++ b/arch/x86/kvm/vmx/pkvm/Makefile
@@ -6,3 +6,4 @@ ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include
 pkvm-obj := pkvm_host.o
 
 obj-$(CONFIG_PKVM_INTEL) += $(pkvm-obj)
+obj-$(CONFIG_PKVM_INTEL) += hyp/
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile
new file mode 100644
index 000000000000..ea810f09e381
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I $(srctree)/arch/x86/kvm
+ccflags-y += -I $(srctree)/arch/x86/kvm/vmx/pkvm/include
+
+pkvm-hyp-y := vmx_asm.o vmexit.o
+
+obj-$(CONFIG_PKVM_INTEL) += $(pkvm-hyp-y)
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c
new file mode 100644
index 000000000000..19be7ce201df
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#include <pkvm.h>
+#include "vmexit.h"
+#include "debug.h"
+
+#define CR4		4
+
+#define MOV_TO_CR	0
+
+static void skip_emulated_instruction(void)
+{
+	unsigned long rip;
+
+	rip = vmcs_readl(GUEST_RIP);
+	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs_writel(GUEST_RIP, rip);
+}
+
+static void handle_cpuid(struct kvm_vcpu *vcpu)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = vcpu->arch.regs[VCPU_REGS_RAX];
+	ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	vcpu->arch.regs[VCPU_REGS_RAX] = eax;
+	vcpu->arch.regs[VCPU_REGS_RBX] = ebx;
+	vcpu->arch.regs[VCPU_REGS_RCX] = ecx;
+	vcpu->arch.regs[VCPU_REGS_RDX] = edx;
+}
+
+static void handle_cr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long exit_qual, val;
+	int cr;
+	int type;
+	int reg;
+
+	exit_qual = vmx->exit_qualification;
+	cr = exit_qual & 15;
+	type = (exit_qual >> 4) & 3;
+	reg = (exit_qual >> 8) & 15;
+
+	switch (type) {
+	case MOV_TO_CR:
+		switch (cr) {
+		case CR4:
+			/*
+			 * The VMXE bit is owned by the host; all other
+			 * bits are owned by the guest. So only a guest
+			 * write modifying VMXE can vmexit and land here.
+			 */
+			val = vcpu->arch.regs[reg];
+			vmcs_writel(CR4_READ_SHADOW, val);
+			break;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static void handle_read_msr(struct kvm_vcpu *vcpu)
+{
+	/* Simply return 0 for unsupported MSRs. */
+	vcpu->arch.regs[VCPU_REGS_RAX] = 0;
+	vcpu->arch.regs[VCPU_REGS_RDX] = 0;
+}
+
+static void handle_write_msr(struct kvm_vcpu *vcpu)
+{
+	/* No emulation for MSR writes for now; the write is dropped. */
+}
+
+static void handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+	u32 eax = (u32)(vcpu->arch.regs[VCPU_REGS_RAX] & -1u);
+	u32 edx = (u32)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u);
+	u32 ecx = (u32)(vcpu->arch.regs[VCPU_REGS_RCX] & -1u);
+
+	asm volatile(".byte 0x0f,0x01,0xd1"	/* xsetbv */
+			: : "a" (eax), "d" (edx), "c" (ecx));
+}
+
+/* We reuse the kvm_vcpu structure, though not all of its fields are used. */
+int pkvm_main(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int launch = 1;
+
+	do {
+		bool skip_instruction = false;
+
+		if (__pkvm_vmx_vcpu_run(vcpu->arch.regs, launch)) {
+			pkvm_err("%s: CPU%d run_vcpu failed with error 0x%x\n",
+				 __func__, vcpu->cpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+			return -EINVAL;
+		}
+
+		vcpu->arch.cr2 = native_read_cr2();
+
+		vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+		vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+		switch (vmx->exit_reason.full) {
+		case EXIT_REASON_CPUID:
+			handle_cpuid(vcpu);
+			skip_instruction = true;
+			break;
+		case EXIT_REASON_CR_ACCESS:
+			pkvm_dbg("CPU%d vmexit_reason: CR_ACCESS.\n", vcpu->cpu);
+			handle_cr(vcpu);
+			skip_instruction = true;
+			break;
+		case EXIT_REASON_MSR_READ:
+			pkvm_dbg("CPU%d vmexit_reason: MSR_READ 0x%lx\n",
+				 vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]);
+			handle_read_msr(vcpu);
+			skip_instruction = true;
+			break;
+		case EXIT_REASON_MSR_WRITE:
+			pkvm_dbg("CPU%d vmexit_reason: MSR_WRITE 0x%lx\n",
+				 vcpu->cpu, vcpu->arch.regs[VCPU_REGS_RCX]);
+			handle_write_msr(vcpu);
+			skip_instruction = true;
+			break;
+		case EXIT_REASON_XSETBV:
+			handle_xsetbv(vcpu);
+			skip_instruction = true;
+			break;
+		default:
+			pkvm_dbg("CPU%d: Unsupported vmexit reason 0x%x.\n", vcpu->cpu, vmx->exit_reason.full);
+			skip_instruction = true;
+			break;
+		}
+
+		/* Only VMRESUME is needed from now on. */
+		launch = 0;
+
+		if (skip_instruction)
+			skip_emulated_instruction();
+
+		native_write_cr2(vcpu->arch.cr2);
+	} while (1);
+
+	return 0;
+}
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h
new file mode 100644
index 000000000000..5089b87b51b5
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#ifndef _PKVM_VMEXIT_H_
+#define _PKVM_VMEXIT_H_
+
+int __pkvm_vmx_vcpu_run(unsigned long *regs, int launch);
+
+#endif
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S
new file mode 100644
index 000000000000..3a0c9fcd8d9c
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/vmx_asm.S
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Intel Corporation
+ */
+#include <linux/linkage.h>
+#include <asm/kvm_vcpu_regs.h>
+#include <asm/frame.h>
+#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/unwind_hints.h>
+#include <asm/nospec-branch.h>
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+#define VCPU_RAX	(__VCPU_REGS_RAX * WORD_SIZE)
+#define VCPU_RCX	(__VCPU_REGS_RCX * WORD_SIZE)
+#define VCPU_RDX	(__VCPU_REGS_RDX * WORD_SIZE)
+#define VCPU_RBX	(__VCPU_REGS_RBX * WORD_SIZE)
+#define VCPU_RBP	(__VCPU_REGS_RBP * WORD_SIZE)
+#define VCPU_RSI	(__VCPU_REGS_RSI * WORD_SIZE)
+#define VCPU_RDI	(__VCPU_REGS_RDI * WORD_SIZE)
+
+#define VCPU_R8		(__VCPU_REGS_R8 * WORD_SIZE)
+#define VCPU_R9		(__VCPU_REGS_R9 * WORD_SIZE)
+#define VCPU_R10	(__VCPU_REGS_R10 * WORD_SIZE)
+#define VCPU_R11	(__VCPU_REGS_R11 * WORD_SIZE)
+#define VCPU_R12	(__VCPU_REGS_R12 * WORD_SIZE)
+#define VCPU_R13	(__VCPU_REGS_R13 * WORD_SIZE)
+#define VCPU_R14	(__VCPU_REGS_R14 * WORD_SIZE)
+#define VCPU_R15	(__VCPU_REGS_R15 * WORD_SIZE)
+
+#define HOST_RSP	0x6C14
+
+/**
+ * __vmenter - VM-Enter the current loaded VMCS
+ *
+ * Returns:
+ *	%RFLAGS.CF is set on VM-Fail Invalid
+ *	%RFLAGS.ZF is set on VM-Fail Valid
+ *	%RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
+ *
+ * Note that VMRESUME/VMLAUNCH fall through and return directly if
+ * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
+ * to __pkvm_vmx_vmexit.
+ */
+SYM_FUNC_START_LOCAL(__vmenter)
+	/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
+	je 2f
+
+1:	vmresume
+	ANNOTATE_UNRET_SAFE
+	ret
+
+2:	vmlaunch
+	ANNOTATE_UNRET_SAFE
+	ret
+SYM_FUNC_END(__vmenter)
+
+/**
+ * __pkvm_vmx_vmexit - Handle a VMX VM-Exit
+ *
+ * Returns:
+ *	%RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
+ *
+ * This is __vmenter's partner in crime.  On a VM-Exit, control will jump
+ * here after hardware loads the host's state, i.e. this is the destination
+ * referred to by VMCS.HOST_RIP.
+ */
+SYM_FUNC_START(__pkvm_vmx_vmexit)
+	ANNOTATE_UNRET_SAFE
+	ret
+SYM_FUNC_END(__pkvm_vmx_vmexit)
+
+/**
+ * __pkvm_vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @regs:	unsigned long * (to guest registers)
+ * @launch:	%true if VMLAUNCH must be used, %false for VMRESUME
+ *
+ * Returns:
+ *	0 on VM-Exit, 1 on VM-Fail
+ */
+SYM_FUNC_START(__pkvm_vmx_vcpu_run)
+	push %_ASM_BP
+	mov  %_ASM_SP, %_ASM_BP
+	push %r15
+	push %r14
+	push %r13
+	push %r12
+
+	push %_ASM_BX
+
+	push %_ASM_ARG1		/* save @regs for the VM-Exit path */
+
+	/* record host RSP in the VMCS HOST_RSP field (encoding 0x6C14) */
+	mov $HOST_RSP, %_ASM_BX
+	lea -WORD_SIZE(%_ASM_SP), %_ASM_CX
+	vmwrite %_ASM_CX, %_ASM_BX
+
+	mov %_ASM_ARG1, %_ASM_CX
+	cmp $1, %_ASM_ARG2	/* set ZF when VMLAUNCH is needed */
+
+	mov VCPU_RAX(%_ASM_CX), %_ASM_AX
+	mov VCPU_RBX(%_ASM_CX), %_ASM_BX
+	mov VCPU_RDX(%_ASM_CX), %_ASM_DX
+	mov VCPU_RSI(%_ASM_CX), %_ASM_SI
+	mov VCPU_RDI(%_ASM_CX), %_ASM_DI
+	mov VCPU_RBP(%_ASM_CX), %_ASM_BP
+	mov VCPU_R8(%_ASM_CX),  %r8
+	mov VCPU_R9(%_ASM_CX),  %r9
+	mov VCPU_R10(%_ASM_CX), %r10
+	mov VCPU_R11(%_ASM_CX), %r11
+	mov VCPU_R12(%_ASM_CX), %r12
+	mov VCPU_R13(%_ASM_CX), %r13
+	mov VCPU_R14(%_ASM_CX), %r14
+	mov VCPU_R15(%_ASM_CX), %r15
+
+	mov VCPU_RCX(%_ASM_CX), %_ASM_CX	/* guest RCX last, it held @regs */
+
+	call __vmenter
+
+	/* Jump on VM-Fail. */
+	jbe 2f
+
+	push %_ASM_CX
+	mov WORD_SIZE(%_ASM_SP), %_ASM_CX	/* reload @regs */
+
+	mov %_ASM_AX, VCPU_RAX(%_ASM_CX)
+	mov %_ASM_BX, VCPU_RBX(%_ASM_CX)
+	mov %_ASM_DX, VCPU_RDX(%_ASM_CX)
+	mov %_ASM_SI, VCPU_RSI(%_ASM_CX)
+	mov %_ASM_DI, VCPU_RDI(%_ASM_CX)
+	mov %_ASM_BP, VCPU_RBP(%_ASM_CX)
+	mov %r8,  VCPU_R8(%_ASM_CX)
+	mov %r9,  VCPU_R9(%_ASM_CX)
+	mov %r10, VCPU_R10(%_ASM_CX)
+	mov %r11, VCPU_R11(%_ASM_CX)
+	mov %r12, VCPU_R12(%_ASM_CX)
+	mov %r13, VCPU_R13(%_ASM_CX)
+	mov %r14, VCPU_R14(%_ASM_CX)
+	mov %r15, VCPU_R15(%_ASM_CX)
+
+	pop VCPU_RCX(%_ASM_CX)	/* guest RCX, pushed right after VM-Exit */
+
+	/* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
+	xor %eax, %eax
+
+	/*
+	 * Clear all general purpose registers except RSP and RAX to prevent
+	 * speculative use of the guest's values, even those that are reloaded
+	 * via the stack.  In theory, an L1 cache miss when restoring registers
+	 * could lead to speculative execution with the guest's values.
+	 * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+	 * free.  RSP and RAX are exempt as RSP is restored by hardware during
+	 * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
+	 */
+1:	xor %ebx, %ebx
+	xor %ecx, %ecx
+	xor %edx, %edx
+	xor %esi, %esi
+	xor %edi, %edi
+	xor %ebp, %ebp
+	xor %r8d, %r8d
+	xor %r9d, %r9d
+	xor %r10d, %r10d
+	xor %r11d, %r11d
+	xor %r12d, %r12d
+	xor %r13d, %r13d
+	xor %r14d, %r14d
+	xor %r15d, %r15d
+
+	/* "POP" @regs. */
+	add $WORD_SIZE, %_ASM_SP
+	pop %_ASM_BX
+
+	pop %r12
+	pop %r13
+	pop %r14
+	pop %r15
+
+	pop %_ASM_BP
+	ANNOTATE_UNRET_SAFE
+	ret
+
+	/* VM-Fail.  Out-of-line to avoid a taken Jcc after VM-Exit. */
+2:	mov $1, %eax
+	jmp 1b
+SYM_FUNC_END(__pkvm_vmx_vcpu_run)
diff --git a/arch/x86/kvm/vmx/pkvm/include/pkvm.h b/arch/x86/kvm/vmx/pkvm/include/pkvm.h
index 486e631f4254..65583c01574e 100644
--- a/arch/x86/kvm/vmx/pkvm/include/pkvm.h
+++ b/arch/x86/kvm/vmx/pkvm/include/pkvm.h
@@ -47,4 +47,6 @@ struct pkvm_hyp {
 #define PKVM_PCPU_PAGES (ALIGN(sizeof(struct pkvm_pcpu), PAGE_SIZE) >> PAGE_SHIFT)
 #define PKVM_HOST_VCPU_PAGES (ALIGN(sizeof(struct pkvm_host_vcpu), PAGE_SIZE) >> PAGE_SHIFT)
 
+void __pkvm_vmx_vmexit(void);
+
 #endif
diff --git a/arch/x86/kvm/vmx/pkvm/pkvm_host.c b/arch/x86/kvm/vmx/pkvm/pkvm_host.c
index 810e7421f644..d147d6ec7795 100644
--- a/arch/x86/kvm/vmx/pkvm/pkvm_host.c
+++ b/arch/x86/kvm/vmx/pkvm/pkvm_host.c
@@ -277,7 +277,8 @@ static __init void init_host_state_area(struct pkvm_host_vcpu *vcpu)
 
 	_init_host_state_area(pcpu);
 
-	/*TODO: add HOST_RIP */
+	/* host RIP: VM-Exits land in __pkvm_vmx_vmexit() */
+	vmcs_writel(HOST_RIP, (unsigned long)__pkvm_vmx_vmexit);
 }
 
 static __init void init_execution_control(struct vcpu_vmx *vmx,
-- 
2.25.1