Re: [PATCH v3 1/2] KVM: x86: Check hypercall's exit to userspace generically

Sean Christopherson <seanjc@xxxxxxxxxx> · Thu, 31 Oct 2024 07:54:45 -0700

On Thu, Oct 31, 2024, Kai Huang wrote:
> > @@ -10111,12 +10111,21 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >  	cpl = kvm_x86_call(get_cpl)(vcpu);
> >  
> >  	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > -		/* MAP_GPA tosses the request to the user space. */
> > +	if (!ret)
> >  		return 0;
> >  
> > -	if (!op_64_bit)
> > +	/*
> > +	 * KVM's ABI with the guest is that '0' is success, and any other value
> > +	 * is an error code.  Internally, '0' == exit to userspace (see above)
> > +	 * and '1' == success, as KVM's de facto standard return codes are that
> > +	 * plus -errno == failure.  Explicitly check for '1' as some PV error
> > +	 * codes are positive values.
> > +	 */
> > +	if (ret == 1)
> > +		ret = 0;
> > +	else if (!op_64_bit)
> >  		ret = (u32)ret;
> > +
> >  	kvm_rax_write(vcpu, ret);
> >  
> >  	return kvm_skip_emulated_instruction(vcpu);
> > 
> 
> It appears below two cases are not covered correctly?
> 
> #ifdef CONFIG_X86_64    
>         case KVM_HC_CLOCK_PAIRING:
>                 ret = kvm_pv_clock_pairing(vcpu, a0, a1);
>                 break;
> #endif
>         case KVM_HC_SEND_IPI:
>                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
>                         break;
> 
>                 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
>                 break; 
> 
> Looking at the code, seems at least for KVM_HC_CLOCK_PAIRING,
> kvm_pv_clock_pairing() returns 0 on success, and the upstream behaviour is not
> routing to userspace but writing 0 to vcpu's RAX and returning to guest.  With
> the change above, it immediately returns to userspace w/o writing 0 to RAX.
> 
> For KVM_HC_SEND_IPI, seems the upstream behaviour is the return value of
> kvm_pv_send_ipi() is always written to vcpu's RAX reg, and we always just go
> back to guest.  With your change, the behaviour will be changed to:
> 
>   1) when ret == 0, exit to userspace w/o writing  0 to vcpu's RAX,
>   2) when ret == 1, it is converted to 0 and then written to RAX.
> 
> This doesn't look like safe.

Drat, I managed to space on the cases that didn't explicitly set '0'.  Hrm.

My other idea was to have an out-param to separate the return code intended for KVM
from the return code intended for the guest.  I generally dislike out-params, but
trying to juggle a return value that multiplexes guest and host values seems like
an even worse idea.

Also completely untested...

---
 arch/x86/include/asm/kvm_host.h |  8 +++----
 arch/x86/kvm/x86.c              | 41 +++++++++++++++------------------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9..226df5c56811 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2179,10 +2179,10 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
 	kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl);
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			    unsigned long a0, unsigned long a1,
+			    unsigned long a2, unsigned long a3,
+			    int op_64_bit, int cpl, unsigned long *ret);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e09daa3b157c..e9ae09f1b45b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9998,13 +9998,11 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
 	return kvm_skip_emulated_instruction(vcpu);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl)
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			    unsigned long a0, unsigned long a1,
+			    unsigned long a2, unsigned long a3,
+			    int op_64_bit, int cpl, unsigned long *ret)
 {
-	unsigned long ret;
-
 	trace_kvm_hypercall(nr, a0, a1, a2, a3);
 
 	if (!op_64_bit) {
@@ -10016,15 +10014,15 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 	}
 
 	if (cpl) {
-		ret = -KVM_EPERM;
+		*ret = -KVM_EPERM;
 		goto out;
 	}
 
-	ret = -KVM_ENOSYS;
+	*ret = -KVM_ENOSYS;
 
 	switch (nr) {
 	case KVM_HC_VAPIC_POLL_IRQ:
-		ret = 0;
+		*ret = 0;
 		break;
 	case KVM_HC_KICK_CPU:
 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
@@ -10032,36 +10030,36 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 
 		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
 		kvm_sched_yield(vcpu, a1);
-		ret = 0;
+		*ret = 0;
 		break;
 #ifdef CONFIG_X86_64
 	case KVM_HC_CLOCK_PAIRING:
-		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+		*ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
 #endif
 	case KVM_HC_SEND_IPI:
 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
 			break;
 
-		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+		*ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
 		break;
 	case KVM_HC_SCHED_YIELD:
 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
 			break;
 
 		kvm_sched_yield(vcpu, a0);
-		ret = 0;
+		*ret = 0;
 		break;
 	case KVM_HC_MAP_GPA_RANGE: {
 		u64 gpa = a0, npages = a1, attrs = a2;
 
-		ret = -KVM_ENOSYS;
+		*ret = -KVM_ENOSYS;
 		if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE))
 			break;
 
 		if (!PAGE_ALIGNED(gpa) || !npages ||
 		    gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
-			ret = -KVM_EINVAL;
+			*ret = -KVM_EINVAL;
 			break;
 		}
 
@@ -10080,13 +10078,13 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 		return 0;
 	}
 	default:
-		ret = -KVM_ENOSYS;
+		*ret = -KVM_ENOSYS;
 		break;
 	}
 
 out:
 	++vcpu->stat.hypercalls;
-	return ret;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
 
@@ -10094,7 +10092,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
 	int op_64_bit;
-	int cpl;
+	int cpl, r;
 
 	if (kvm_xen_hypercall_enabled(vcpu->kvm))
 		return kvm_xen_hypercall(vcpu);
@@ -10110,10 +10108,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	op_64_bit = is_64_bit_hypercall(vcpu);
 	cpl = kvm_x86_call(get_cpl)(vcpu);
 
-	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
-	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
-		/* MAP_GPA tosses the request to the user space. */
-		return 0;
+	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
+	if (r <= 0)
+		return r;
 
 	if (!op_64_bit)
 		ret = (u32)ret;

base-commit: 675248928970d33f7fc8ca9851a170c98f4f1c4f
--