On 12/14/20 9:33 AM, Paolo Bonzini wrote: > On 10/12/20 18:09, Tom Lendacky wrote: >> @@ -2797,7 +2838,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, >> struct msr_data *msr) >> static int wrmsr_interception(struct vcpu_svm *svm) >> { >> - return kvm_emulate_wrmsr(&svm->vcpu); >> + u32 ecx; >> + u64 data; >> + >> + if (!sev_es_guest(svm->vcpu.kvm)) >> + return kvm_emulate_wrmsr(&svm->vcpu); >> + >> + ecx = kvm_rcx_read(&svm->vcpu); >> + data = kvm_read_edx_eax(&svm->vcpu); >> + if (kvm_set_msr(&svm->vcpu, ecx, data)) { >> + trace_kvm_msr_write_ex(ecx, data); >> + ghcb_set_sw_exit_info_1(svm->ghcb, 1); >> + ghcb_set_sw_exit_info_2(svm->ghcb, >> + X86_TRAP_GP | >> + SVM_EVTINJ_TYPE_EXEPT | >> + SVM_EVTINJ_VALID); >> + return 1; >> + } >> + >> + trace_kvm_msr_write(ecx, data); >> + >> + return kvm_skip_emulated_instruction(&svm->vcpu); >> } >> static int msr_interception(struct vcpu_svm *svm) > > This code duplication is ugly, and does not work with userspace MSR > filters too. Agree and I missed that the userspace MSR support went in. > > But we can instead trap the completion of the MSR read/write to use > ghcb_set_sw_exit_info_1 instead of kvm_inject_gp, with a callback like > > static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) > { > if (!sev_es_guest(svm->vcpu.kvm) || !err) > return kvm_complete_insn_gp(&svm->vcpu, err); > > ghcb_set_sw_exit_info_1(svm->ghcb, 1); > ghcb_set_sw_exit_info_2(svm->ghcb, > X86_TRAP_GP | > SVM_EVTINJ_TYPE_EXEPT | > SVM_EVTINJ_VALID); > return 1; > } If we use the kvm_complete_insn_gp() we lose the tracing and it needs to be able to deal with read completion setting the registers. It also needs to deal with both kvm_emulate_rdmsr/wrmsr() when not bouncing to userspace. Let me take a shot at covering all the cases and see what I can come up with. I noticed that the userspace completion path doesn't have tracing invocations, trace_kvm_msr_read/write_ex() or trace_kvm_msr_read/write(), is that by design? > > > ... > .complete_emulated_msr = svm_complete_emulated_msr, > >> @@ -2827,7 +2888,14 @@ static int interrupt_window_interception(struct >> vcpu_svm *svm) >> static int pause_interception(struct vcpu_svm *svm) >> { >> struct kvm_vcpu *vcpu = &svm->vcpu; >> - bool in_kernel = (svm_get_cpl(vcpu) == 0); >> + bool in_kernel; >> + >> + /* >> + * CPL is not made available for an SEV-ES guest, so just set >> in_kernel >> + * to true. >> + */ >> + in_kernel = (sev_es_guest(svm->vcpu.kvm)) ? true >> + : (svm_get_cpl(vcpu) == 0); >> if (!kvm_pause_in_guest(vcpu->kvm)) >> grow_ple_window(vcpu); > > See below. > >> @@ -3273,6 +3351,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) >> struct vcpu_svm *svm = to_svm(vcpu); >> struct vmcb *vmcb = svm->vmcb; >> + /* >> + * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask >> + * bit to determine the state of the IF flag. >> + */ >> + if (sev_es_guest(svm->vcpu.kvm)) >> + return !(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK); > > This seems wrong, you have to take into account SVM_INTERRUPT_SHADOW_MASK > as well. Also, even though GIF is not really used by SEV-ES guests, I > think it's nicer to put this check afterwards. > > That is: > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c > index 4372e45c8f06..2dd9c9698480 100644 > --- a/arch/x86/kvm/svm/svm.c > +++ b/arch/x86/kvm/svm/svm.c > @@ -3247,7 +3247,14 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) > if (!gif_set(svm)) > return true; > > - if (is_guest_mode(vcpu)) { > + if (sev_es_guest(svm->vcpu.kvm)) { > + /* > + * SEV-ES guests to not expose RFLAGS. Use the VMCB interrupt mask > + * bit to determine the state of the IF flag. > + */ > + if (!(vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK)) > + return true; > + } else if (is_guest_mode(vcpu)) { > /* As long as interrupts are being delivered... */ > if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) > ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) > > Yup, I'll make that change. > >> if (!gif_set(svm)) >> return true; >> @@ -3458,6 +3543,12 @@ static void svm_complete_interrupts(struct >> vcpu_svm *svm) >> svm->vcpu.arch.nmi_injected = true; >> break; >> case SVM_EXITINTINFO_TYPE_EXEPT: >> + /* >> + * Never re-inject a #VC exception. >> + */ >> + if (vector == X86_TRAP_VC) >> + break; >> + >> /* >> * In case of software exceptions, do not reinject the vector, >> * but re-execute the instruction instead. Rewind RIP first >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index a3fdc16cfd6f..b6809a2851d2 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -4018,7 +4018,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) >> { >> int idx; >> - if (vcpu->preempted) >> + if (vcpu->preempted && !vcpu->arch.guest_state_protected) >> vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); > > This has to be true, otherwise no directed yield will be done at all: > > if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && > !kvm_arch_vcpu_in_kernel(vcpu)) > continue; > > Or more easily, just use in_kernel == false in pause_interception, like > > + /* > + * CPL is not made available for an SEV-ES guest, therefore > + * vcpu->arch.preempted_in_kernel can never be true. Just > + * set in_kernel to false as well. > + */ > + in_kernel = !sev_es_guest(svm->vcpu.kvm) && svm_get_cpl(vcpu) == 0; Sounds good, I'll make that change. > >> /* >> @@ -8161,7 +8161,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) >> { >> struct kvm_run *kvm_run = vcpu->run; >> - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; >> + kvm_run->if_flag = (vcpu->arch.guest_state_protected) >> + ? kvm_arch_interrupt_allowed(vcpu) >> + : (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; > > Here indeed you only want the interrupt allowed bit, not the interrupt > window. But we can just be bold and always set it to true. > > - for userspace irqchip, kvm_run->ready_for_interrupt_injection is set > just below and it will always be false if kvm_arch_interrupt_allowed is false > > - for in-kernel APIC, if_flag is documented to be invalid (though it > actually is valid). For split irqchip, they can just use > kvm_run->ready_for_interrupt_injection; for entirely in-kernel interrupt > handling, userspace does not need if_flag at all. Ok, I'll make that change. Thanks, Tom > > Paolo > >> kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; >> kvm_run->cr8 = kvm_get_cr8(vcpu); >> kvm_run->apic_base = kvm_get_apic_base(vcpu); >> > >