On Thu, 2024-10-31 at 07:54 -0700, Sean Christopherson wrote:
> On Thu, Oct 31, 2024, Kai Huang wrote:
> > > @@ -10111,12 +10111,21 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> > >  	cpl = kvm_x86_call(get_cpl)(vcpu);
> > >  
> > >  	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > > -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > > -		/* MAP_GPA tosses the request to the user space. */
> > > +	if (!ret)
> > >  		return 0;
> > >  
> > > -	if (!op_64_bit)
> > > +	/*
> > > +	 * KVM's ABI with the guest is that '0' is success, and any other value
> > > +	 * is an error code.  Internally, '0' == exit to userspace (see above)
> > > +	 * and '1' == success, as KVM's de facto standard return codes are that
> > > +	 * plus -errno == failure.  Explicitly check for '1' as some PV error
> > > +	 * codes are positive values.
> > > +	 */
> > > +	if (ret == 1)
> > > +		ret = 0;
> > > +	else if (!op_64_bit)
> > >  		ret = (u32)ret;
> > > +
> > >  	kvm_rax_write(vcpu, ret);
> > >  
> > >  	return kvm_skip_emulated_instruction(vcpu);
> > 
> > It appears the two cases below are not covered correctly?
> > 
> > #ifdef CONFIG_X86_64
> > 	case KVM_HC_CLOCK_PAIRING:
> > 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
> > 		break;
> > #endif
> > 	case KVM_HC_SEND_IPI:
> > 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
> > 			break;
> > 
> > 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
> > 		break;
> > 
> > Looking at the code, at least for KVM_HC_CLOCK_PAIRING,
> > kvm_pv_clock_pairing() returns 0 on success, and the upstream behaviour
> > is not to route to userspace but to write 0 to the vcpu's RAX and return
> > to the guest.  With the change above, it immediately returns to userspace
> > without writing 0 to RAX.
> > 
> > For KVM_HC_SEND_IPI, the upstream behaviour is that the return value of
> > kvm_pv_send_ipi() is always written to the vcpu's RAX, and we always go
> > back to the guest.  With your change, the behaviour becomes:
> > 
> > 1) when ret == 0, exit to userspace without writing 0 to the vcpu's RAX,
> > 2) when ret == 1, it is converted to 0 and then written to RAX.
> > 
> > This doesn't look safe.
> 
> Drat, I managed to space on the cases that didn't explicitly set '0'.  Hrm.
> 
> My other idea was to have an out-param to separate the return code intended
> for KVM from the return code intended for the guest.  I generally dislike
> out-params, but trying to juggle a return value that multiplexes guest and
> host values seems like an even worse idea.

Yeah, this looks fine to me.  One comment below ...

[...]

> -	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> -		/* MAP_GPA tosses the request to the user space. */
> -		return 0;
> +	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
> +	if (r <= r)
> +		return r;

... should be:

	if (r <= 0)
		return r;

?

Another option might be to move the "set hypercall return value" code inside
__kvm_emulate_hypercall().

IIUC, the reason to split out __kvm_emulate_hypercall() is TDX: while non-TDX
uses RAX to carry the hypercall return value, TDX uses R10.  We could
additionally pass a "kvm_hypercall_set_ret_func" function pointer to
__kvm_emulate_hypercall() and invoke it inside.  Then we can change
__kvm_emulate_hypercall() to return:

	 < 0: error
	== 0: return to userspace
	 > 0: go back to guest
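The TDX side could then plug in its own setter.  A hypothetical sketch, purely
to illustrate the idea (the function name is made up; kvm_r10_write() is the
existing GPR accessor from kvm_cache_regs.h):

	/*
	 * Hypothetical TDX setter: the TDX hypercall ABI carries the
	 * return value in R10 instead of RAX.
	 */
	static void kvm_hypercall_set_ret_tdx(struct kvm_vcpu *vcpu,
					      unsigned long val)
	{
		kvm_r10_write(vcpu, val);
	}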
I made an attempt below, build-tested only:

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9..c48feb62a5d7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2179,10 +2179,14 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
 	kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl);
+typedef void (*kvm_hypercall_set_ret_func_t)(struct kvm_vcpu *vcpu,
+					     unsigned long val);
+
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			    unsigned long a0, unsigned long a1,
+			    unsigned long a2, unsigned long a3,
+			    int op_64_bit, int cpl,
+			    kvm_hypercall_set_ret_func_t set_ret_func);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146f..01bdf01cfc79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9998,10 +9998,11 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
 	return kvm_skip_emulated_instruction(vcpu);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl)
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			    unsigned long a0, unsigned long a1,
+			    unsigned long a2, unsigned long a3,
+			    int op_64_bit, int cpl,
+			    kvm_hypercall_set_ret_func_t set_ret_func)
 {
 	unsigned long ret;
 
@@ -10076,6 +10077,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 		WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
 		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+		/* MAP_GPA tosses the request to the user space. */
 		/* stat is incremented on completion. */
 		return 0;
 	}
@@ -10085,16 +10087,26 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 out:
+	if (!op_64_bit)
+		ret = (u32)ret;
+	(*set_ret_func)(vcpu, ret);
+
 	++vcpu->stat.hypercalls;
-	return ret;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
 
+static void kvm_hypercall_set_ret_default(struct kvm_vcpu *vcpu,
+					  unsigned long val)
+{
+	kvm_rax_write(vcpu, val);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
-	unsigned long nr, a0, a1, a2, a3, ret;
+	unsigned long nr, a0, a1, a2, a3;
 	int op_64_bit;
-	int cpl;
+	int cpl, ret;
 
 	if (kvm_xen_hypercall_enabled(vcpu->kvm))
 		return kvm_xen_hypercall(vcpu);
@@ -10110,14 +10122,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	op_64_bit = is_64_bit_hypercall(vcpu);
 	cpl = kvm_x86_call(get_cpl)(vcpu);
 
-	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
-	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
-		/* MAP_GPA tosses the request to the user space. */
-		return 0;
-
-	if (!op_64_bit)
-		ret = (u32)ret;
-	kvm_rax_write(vcpu, ret);
+	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl,
+				      kvm_hypercall_set_ret_default);
+	if (ret <= 0)
+		return ret;
 
 	return kvm_skip_emulated_instruction(vcpu);
 }
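With that in place, the TDX hypercall path (not part of this patch; sketched
here only as a hypothetical usage example, building on the made-up setter
above) would pass its own setter and follow the same return convention:

	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl,
				    kvm_hypercall_set_ret_tdx);
	if (r <= 0)
		return r;	/* < 0: error, == 0: exit to userspace */

	/* r > 0: the hypercall was handled, complete it and resume the guest */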