On Tue, 2021-01-26 at 03:18 -0500, Wei Huang wrote:
> From: Bandan Das <bsd@xxxxxxxxxx>
> 
> While running SVM related instructions (VMRUN/VMSAVE/VMLOAD), some AMD
> CPUs check EAX against reserved memory regions (e.g. SMM memory on host)
> before checking VMCB's instruction intercept. If EAX falls into such
> memory areas, #GP is triggered before VMEXIT. This causes problem under
> nested virtualization. To solve this problem, KVM needs to trap #GP and
> check the instructions triggering #GP. For VM execution instructions,
> KVM emulates these instructions.
> 
> Co-developed-by: Wei Huang <wei.huang2@xxxxxxx>
> Signed-off-by: Wei Huang <wei.huang2@xxxxxxx>
> Signed-off-by: Bandan Das <bsd@xxxxxxxxxx>
> ---
>  arch/x86/kvm/svm/svm.c | 109 ++++++++++++++++++++++++++++++++++-------
>  1 file changed, 91 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 7ef171790d02..e5ca01e25e89 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -200,6 +200,8 @@ module_param(sev_es, int, 0444);
>  bool __read_mostly dump_invalid_vmcb;
>  module_param(dump_invalid_vmcb, bool, 0644);
>  
> +bool svm_gp_erratum_intercept = true;

I'd expect this to be a module param, so that the user can override it,
just like enable_vmware_backdoor.
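Something along these lines, for example (untested sketch, mirroring the
dump_invalid_vmcb declaration just above; the exact permission bits are up
to you):

        bool __read_mostly svm_gp_erratum_intercept = true;
        /* expose the workaround as a module parameter, like enable_vmware_backdoor */
        module_param(svm_gp_erratum_intercept, bool, 0444);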
> +
>  static u8 rsm_ins_bytes[] = "\x0f\xaa";
>  
>  static void svm_complete_interrupts(struct vcpu_svm *svm);
> @@ -288,6 +290,9 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>          if (!(efer & EFER_SVME)) {
>                  svm_leave_nested(svm);
>                  svm_set_gif(svm, true);
> +                /* #GP intercept is still needed in vmware_backdoor */
> +                if (!enable_vmware_backdoor)

I would use
        if (svm_gp_erratum_intercept && !enable_vmware_backdoor)
to document this.

> +                        clr_exception_intercept(svm, GP_VECTOR);
>  
>                  /*
>                   * Free the nested guest state, unless we are in SMM.
> @@ -309,6 +314,10 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
>  
>          svm->vmcb->save.efer = efer | EFER_SVME;
>          vmcb_mark_dirty(svm->vmcb, VMCB_CR);
> +        /* Enable #GP interception for SVM instructions */
> +        if (svm_gp_erratum_intercept)
> +                set_exception_intercept(svm, GP_VECTOR);
> +
>          return 0;
>  }
>  
> @@ -1957,24 +1966,6 @@ static int ac_interception(struct vcpu_svm *svm)
>          return 1;
>  }
>  
> -static int gp_interception(struct vcpu_svm *svm)
> -{
> -        struct kvm_vcpu *vcpu = &svm->vcpu;
> -        u32 error_code = svm->vmcb->control.exit_info_1;
> -
> -        WARN_ON_ONCE(!enable_vmware_backdoor);
> -
> -        /*
> -         * VMware backdoor emulation on #GP interception only handles IN{S},
> -         * OUT{S}, and RDPMC, none of which generate a non-zero error code.
> -         */
> -        if (error_code) {
> -                kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> -                return 1;
> -        }
> -        return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
> -}
> -
>  static bool is_erratum_383(void)
>  {
>          int err, i;
> @@ -2173,6 +2164,88 @@ static int vmrun_interception(struct vcpu_svm *svm)
>          return nested_svm_vmrun(svm);
>  }
>  
> +enum {
> +        NONE_SVM_INSTR,
> +        SVM_INSTR_VMRUN,
> +        SVM_INSTR_VMLOAD,
> +        SVM_INSTR_VMSAVE,
> +};
> +
> +/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */
> +static int svm_instr_opcode(struct kvm_vcpu *vcpu)
> +{
> +        struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
> +
> +        if (ctxt->b != 0x1 || ctxt->opcode_len != 2)
> +                return NONE_SVM_INSTR;
> +
> +        switch (ctxt->modrm) {
> +        case 0xd8: /* VMRUN */
> +                return SVM_INSTR_VMRUN;
> +        case 0xda: /* VMLOAD */
> +                return SVM_INSTR_VMLOAD;
> +        case 0xdb: /* VMSAVE */
> +                return SVM_INSTR_VMSAVE;
> +        default:
> +                break;
> +        }
> +
> +        return NONE_SVM_INSTR;
> +}
> +
> +static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
> +{
> +        int (*const svm_instr_handlers[])(struct vcpu_svm *svm) = {
> +                [SVM_INSTR_VMRUN] = vmrun_interception,
> +                [SVM_INSTR_VMLOAD] = vmload_interception,
> +                [SVM_INSTR_VMSAVE] = vmsave_interception,
> +        };
> +        struct vcpu_svm *svm = to_svm(vcpu);
> +
> +        return svm_instr_handlers[opcode](svm);
> +}
> +
> +/*
> + * #GP handling code. Note that #GP can be triggered under the following two
> + * cases:
> + *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
> + *      some AMD CPUs when EAX of these instructions are in the reserved memory
> + *      regions (e.g. SMM memory on host).
> + *   2) VMware backdoor
> + */
> +static int gp_interception(struct vcpu_svm *svm)
> +{
> +        struct kvm_vcpu *vcpu = &svm->vcpu;
> +        u32 error_code = svm->vmcb->control.exit_info_1;
> +        int opcode;
> +
> +        /* Both #GP cases have zero error_code */
> +        if (error_code)
> +                goto reinject;
> +
> +        /* Decode the instruction for usage later */
> +        if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
> +                goto reinject;
> +
> +        opcode = svm_instr_opcode(vcpu);
> +
> +        if (opcode == NONE_SVM_INSTR) {
> +                WARN_ON_ONCE(!enable_vmware_backdoor);
> +
> +                /*
> +                 * VMware backdoor emulation on #GP interception only handles
> +                 * IN{S}, OUT{S}, and RDPMC.
> +                 */
> +                return kvm_emulate_instruction(vcpu,
> +                                EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
> +        } else

I would check svm_gp_erratum_intercept here: don't do any emulation if it
isn't set, and print a warning.

> +                return emulate_svm_instr(vcpu, opcode);
> +
> +reinject:
> +        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
> +        return 1;
> +}
> +
>  void svm_set_gif(struct vcpu_svm *svm, bool value)
>  {
>          if (value) {

Best regards,
        Maxim Levitsky
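PS: roughly what I have in mind for the gp_interception() part (untested
sketch; pr_warn_once and the message text are just placeholders, the rest
reuses the names from the patch above):

        } else {
                /*
                 * If the erratum workaround is disabled we should never see
                 * a #GP for an SVM instruction here, so warn and reinject
                 * instead of emulating.
                 */
                if (!svm_gp_erratum_intercept) {
                        pr_warn_once("svm: unexpected #GP for SVM instruction\n");
                        goto reinject;
                }
                return emulate_svm_instr(vcpu, opcode);
        }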