The host VM keeps the capability to launch its own guests based on VMX, so pKVM needs to provide VMX emulation for it. This includes emulation of the various VMX instructions: VMXON/VMXOFF, VMPTRLD/VMCLEAR, VMWRITE/VMREAD, and VMRESUME/VMLAUNCH.

This patch introduces nested.c and provides emulation of the VMXON and VMXOFF instructions for the host VM. The emulation simply performs a state check and validates the revision id of the VMXON region whose pointer is passed to VMXON; physical VMX is kept enabled after pKVM initialization. More thorough permission checks are left as TODO.

Signed-off-by: Jason Chen CJ <jason.cj.chen@xxxxxxxxx>
---
 arch/x86/kvm/vmx/pkvm/hyp/Makefile |   2 +-
 arch/x86/kvm/vmx/pkvm/hyp/nested.c | 195 +++++++++++++++++++++++++++++
 arch/x86/kvm/vmx/pkvm/hyp/nested.h |  11 ++
 arch/x86/kvm/vmx/pkvm/hyp/vmexit.c |  12 ++
 4 files changed, 219 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/pkvm/hyp/Makefile b/arch/x86/kvm/vmx/pkvm/hyp/Makefile
index 7c6f71f18676..660fd611395f 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/Makefile
+++ b/arch/x86/kvm/vmx/pkvm/hyp/Makefile
@@ -12,7 +12,7 @@ ccflags-y += -D__PKVM_HYP__
 virt-dir	:= ../../../../../../$(KVM_PKVM)
 
 pkvm-hyp-y := vmx_asm.o vmexit.o memory.o early_alloc.o pgtable.o mmu.o pkvm.o \
-	       init_finalise.o ept.o idt.o irq.o
+	       init_finalise.o ept.o idt.o irq.o nested.o
 
 ifndef CONFIG_PKVM_INTEL_DEBUG
 lib-dir := lib
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.c b/arch/x86/kvm/vmx/pkvm/hyp/nested.c
new file mode 100644
index 000000000000..f5e2eb8f51c8
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Intel Corporation
+ */
+
+#include <pkvm.h>
+
+#include "pkvm_hyp.h"
+#include "debug.h"
+
+enum VMXResult {
+	VMsucceed,
+	VMfailValid,
+	VMfailInvalid,
+};
+
+static void nested_vmx_result(enum VMXResult result, int error_number)
+{
+	u64 rflags = vmcs_readl(GUEST_RFLAGS);
+
+	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+		    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF);
+
+	if (result == VMfailValid) {
+		rflags |= X86_EFLAGS_ZF;
+		vmcs_write32(VM_INSTRUCTION_ERROR, error_number);
+	} else if (result == VMfailInvalid) {
+		rflags |= X86_EFLAGS_CF;
+	} else {
+		/* VMsucceed, do nothing */
+	}
+
+	if (result != VMsucceed)
+		pkvm_err("VMX failed: %d/%d\n", result, error_number);
+
+	vmcs_writel(GUEST_RFLAGS, rflags);
+}
+
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
+			       u32 vmx_instruction_info, gva_t *ret)
+{
+	gva_t off;
+	struct kvm_segment s;
+
+	/*
+	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
+	 * Execution", on an exit, vmx_instruction_info holds most of the
+	 * addressing components of the operand. Only the displacement part
+	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+	 * For how an actual address is calculated from all these components,
+	 * refer to Vol. 1, "Operand Addressing".
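+	 *
+	 * As an illustrative (hypothetical) example of the decoding done
+	 * below: vmx_instruction_info == 0x400100 means scaling 0, 64-bit
+	 * address size, a memory operand, segment ES, no valid index
+	 * register, and RAX as a valid base register, so the operand
+	 * address is RAX plus the displacement from exit_qualification.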
+ */ + int scaling = vmx_instruction_info & 3; + int addr_size = (vmx_instruction_info >> 7) & 7; + bool is_reg = vmx_instruction_info & (1u << 10); + int seg_reg = (vmx_instruction_info >> 15) & 7; + int index_reg = (vmx_instruction_info >> 18) & 0xf; + bool index_is_valid = !(vmx_instruction_info & (1u << 22)); + int base_reg = (vmx_instruction_info >> 23) & 0xf; + bool base_is_valid = !(vmx_instruction_info & (1u << 27)); + + if (is_reg) { + /* TODO: inject #UD */ + return 1; + } + + /* Addr = segment_base + offset */ + /* offset = base + [index * scale] + displacement */ + off = exit_qualification; /* holds the displacement */ + if (addr_size == 1) + off = (gva_t)sign_extend64(off, 31); + else if (addr_size == 0) + off = (gva_t)sign_extend64(off, 15); + if (base_is_valid) + off += vcpu->arch.regs[base_reg]; + if (index_is_valid) + off += vcpu->arch.regs[index_reg] << scaling; + + if (seg_reg == VCPU_SREG_FS) + s.base = vmcs_readl(GUEST_FS_BASE); + if (seg_reg == VCPU_SREG_GS) + s.base = vmcs_readl(GUEST_GS_BASE); + + /* TODO: support more cpu mode beside long mode */ + /* + * The effective address, i.e. @off, of a memory operand is truncated + * based on the address size of the instruction. Note that this is + * the *effective address*, i.e. the address prior to accounting for + * the segment's base. + */ + if (addr_size == 1) /* 32 bit */ + off &= 0xffffffff; + else if (addr_size == 0) /* 16 bit */ + off &= 0xffff; + + /* + * The virtual/linear address is never truncated in 64-bit + * mode, e.g. a 32-bit address size can yield a 64-bit virtual + * address when using FS/GS with a non-zero base. + */ + if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) + *ret = s.base + off; + else + *ret = off; + + /* TODO: check addr is canonical, otherwise inject #GP/#SS */ + + return 0; +} + +static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, + int *ret) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t gva; + struct x86_exception e; + int r; + + if (get_vmx_mem_address(vcpu, vmx->exit_qualification, + vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) { + *ret = 1; + return -EINVAL; + } + + r = read_gva(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); + if (r < 0) { + /*TODO: handle memory failure exception */ + *ret = 1; + return -EINVAL; + } + + return 0; +} + +static int validate_vmcs_revision_id(struct kvm_vcpu *vcpu, gpa_t vmpointer) +{ + struct vmcs_config *vmcs_config = &pkvm_hyp->vmcs_config; + u32 rev_id; + + read_gpa(vcpu, vmpointer, &rev_id, sizeof(rev_id)); + + return (rev_id == vmcs_config->revision_id); +} + +static bool check_vmx_permission(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool permit = true; + + /*TODO: check more env (cr, cpl) and inject #UD/#GP */ + if (!vmx->nested.vmxon) + permit = false; + + return permit; +} + +int handle_vmxon(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t vmptr; + int r; + + /*TODO: check env error(cr, efer, rflags, cpl) */ + if (vmx->nested.vmxon) { + nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + } else { + if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) { + nested_vmx_result(VMfailInvalid, 0); + return r; + } else if (!validate_vmcs_revision_id(vcpu, vmptr)) { + nested_vmx_result(VMfailInvalid, 0); + } else { + vmx->nested.vmxon_ptr = vmptr; + vmx->nested.vmxon = true; + + nested_vmx_result(VMsucceed, 0); + } + } + + return 0; +} + +int handle_vmxoff(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (check_vmx_permission(vcpu)) { + 
+static int validate_vmcs_revision_id(struct kvm_vcpu *vcpu, gpa_t vmpointer)
+{
+	struct vmcs_config *vmcs_config = &pkvm_hyp->vmcs_config;
+	u32 rev_id;
+
+	read_gpa(vcpu, vmpointer, &rev_id, sizeof(rev_id));
+
+	return (rev_id == vmcs_config->revision_id);
+}
+
+static bool check_vmx_permission(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	bool permit = true;
+
+	/* TODO: check more state (CR, CPL) and inject #UD/#GP */
+	if (!vmx->nested.vmxon)
+		permit = false;
+
+	return permit;
+}
+
+int handle_vmxon(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gpa_t vmptr;
+	int r;
+
+	/* TODO: check environment state (CR, EFER, RFLAGS, CPL) */
+	if (vmx->nested.vmxon) {
+		nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+	} else {
+		if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) {
+			nested_vmx_result(VMfailInvalid, 0);
+			return r;
+		} else if (!validate_vmcs_revision_id(vcpu, vmptr)) {
+			nested_vmx_result(VMfailInvalid, 0);
+		} else {
+			vmx->nested.vmxon_ptr = vmptr;
+			vmx->nested.vmxon = true;
+
+			nested_vmx_result(VMsucceed, 0);
+		}
+	}
+
+	return 0;
+}
+
+int handle_vmxoff(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (check_vmx_permission(vcpu)) {
+		vmx->nested.vmxon = false;
+		vmx->nested.vmxon_ptr = INVALID_GPA;
+
+		nested_vmx_result(VMsucceed, 0);
+	}
+
+	return 0;
+}
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/nested.h b/arch/x86/kvm/vmx/pkvm/hyp/nested.h
new file mode 100644
index 000000000000..2d21edaddb25
--- /dev/null
+++ b/arch/x86/kvm/vmx/pkvm/hyp/nested.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 Intel Corporation
+ */
+#ifndef __PKVM_NESTED_H
+#define __PKVM_NESTED_H
+
+int handle_vmxon(struct kvm_vcpu *vcpu);
+int handle_vmxoff(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c
index 6b82b6be612c..fa67cab803a8 100644
--- a/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c
+++ b/arch/x86/kvm/vmx/pkvm/hyp/vmexit.c
@@ -9,6 +9,7 @@
 #include "vmexit.h"
 #include "ept.h"
 #include "pkvm_hyp.h"
+#include "nested.h"
 #include "debug.h"
 
 #define CR4	4
@@ -168,6 +169,7 @@ int pkvm_main(struct kvm_vcpu *vcpu)
 
 		vcpu->arch.cr2 = native_read_cr2();
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+		/* Cache guest RSP so operand decode can use it as a base/index register */
+		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
 
 		vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
 		vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -194,6 +196,16 @@ int pkvm_main(struct kvm_vcpu *vcpu)
 			handle_write_msr(vcpu);
 			skip_instruction = true;
 			break;
+		case EXIT_REASON_VMON:
+			pkvm_dbg("CPU%d vmexit reason: VMXON.\n", vcpu->cpu);
+			handle_vmxon(vcpu);
+			skip_instruction = true;
+			break;
+		case EXIT_REASON_VMOFF:
+			pkvm_dbg("CPU%d vmexit reason: VMXOFF.\n", vcpu->cpu);
+			handle_vmxoff(vcpu);
+			skip_instruction = true;
+			break;
 		case EXIT_REASON_XSETBV:
 			handle_xsetbv(vcpu);
 			skip_instruction = true;
-- 
2.25.1