Re: [PATCH v3 1/2] KVM: x86: Check hypercall's exit to userspace generically

"Huang, Kai" <kai.huang@xxxxxxxxx> · Fri, 1 Nov 2024 11:05:45 +0000

On Thu, 2024-10-31 at 07:54 -0700, Sean Christopherson wrote:
> On Thu, Oct 31, 2024, Kai Huang wrote:
> > > @@ -10111,12 +10111,21 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> > >  	cpl = kvm_x86_call(get_cpl)(vcpu);
> > >  
> > >  	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > > -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > > -		/* MAP_GPA tosses the request to the user space. */
> > > +	if (!ret)
> > >  		return 0;
> > >  
> > > -	if (!op_64_bit)
> > > +	/*
> > > +	 * KVM's ABI with the guest is that '0' is success, and any other value
> > > +	 * is an error code.  Internally, '0' == exit to userspace (see above)
> > > +	 * and '1' == success, as KVM's de facto standard return codes are that
> > > +	 * plus -errno == failure.  Explicitly check for '1' as some PV error
> > > +	 * codes are positive values.
> > > +	 */
> > > +	if (ret == 1)
> > > +		ret = 0;
> > > +	else if (!op_64_bit)
> > >  		ret = (u32)ret;
> > > +
> > >  	kvm_rax_write(vcpu, ret);
> > >  
> > >  	return kvm_skip_emulated_instruction(vcpu);
> > > 
> > 
> > It appears below two cases are not covered correctly?
> > 
> > #ifdef CONFIG_X86_64    
> >         case KVM_HC_CLOCK_PAIRING:
> >                 ret = kvm_pv_clock_pairing(vcpu, a0, a1);
> >                 break;
> > #endif
> >         case KVM_HC_SEND_IPI:
> >                 if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
> >                         break;
> > 
> >                 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
> >                 break; 
> > 
> > Looking at the code, seems at least for KVM_HC_CLOCK_PAIRING,
> > kvm_pv_clock_pairing() returns 0 on success, and the upstream behaviour is not
> > routing to userspace but writing 0 to vcpu's RAX and returning to guest.  With
> > the change above, it immediately returns to userspace w/o writing 0 to RAX.
> > 
> > For KVM_HC_SEND_IPI, seems the upstream behaviour is the return value of
> > kvm_pv_send_ipi() is always written to vcpu's RAX reg, and we always just go
> > back to guest.  With your change, the behaviour will be changed to:
> > 
> >   1) when ret == 0, exit to userspace w/o writing  0 to vcpu's RAX,
> >   2) when ret == 1, it is converted to 0 and then written to RAX.
> > 
> > This doesn't look like safe.
> 
> Drat, I managed to space on the cases that didn't explicit set '0'.  Hrm.
> 
> My other idea was have an out-param to separate the return code intended for KVM
> from the return code intended for the guest.  I generally dislike out-params, but
> trying to juggle a return value that multiplexes guest and host values seems like
> an even worse idea.

Yeah this looks fine to me, one comment below ...


[...]

> -	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> -		/* MAP_GPA tosses the request to the user space. */
> -		return 0;
> +	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
> +	if (r <= r)
> +		return r;

... should be:

	if (r <= 0)
		return r;

?

Another option might be we move "set hypercall return value" code inside
__kvm_emulate_hypercall().  So IIUC the reason to split
__kvm_emulate_hypercall() out is for TDX, and while non-TDX uses RAX to carry
the hypercall return value, TDX uses R10.

We can additionally pass a "kvm_hypercall_set_ret_func" function pointer to
__kvm_emulate_hypercall(), and invoke it inside.  Then we can change
__kvm_emulate_hypercall() to return: 
    < 0 error, 
    ==0 return to userspace, 
    > 0 go back to guest.

I made some attempt below, build test only:

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9..c48feb62a5d7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2179,10 +2179,14 @@ static inline void kvm_clear_apicv_inhibit(struct kvm
*kvm,
        kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-                                     unsigned long a0, unsigned long a1,
-                                     unsigned long a2, unsigned long a3,
-                                     int op_64_bit, int cpl);
+typedef void (*kvm_hypercall_set_ret_func_t)(struct kvm_vcpu *vcpu,
+               unsigned long val);
+
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+                           unsigned long a0, unsigned long a1,
+                           unsigned long a2, unsigned long a3,
+                           int op_64_bit, int cpl,
+                           kvm_hypercall_set_ret_func_t set_ret_func);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83fe0a78146f..01bdf01cfc79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9998,10 +9998,11 @@ static int complete_hypercall_exit(struct kvm_vcpu
*vcpu)
        return kvm_skip_emulated_instruction(vcpu);
 }
 
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-                                     unsigned long a0, unsigned long a1,
-                                     unsigned long a2, unsigned long a3,
-                                     int op_64_bit, int cpl)
+int __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+                           unsigned long a0, unsigned long a1,
+                           unsigned long a2, unsigned long a3,
+                           int op_64_bit, int cpl,
+                           kvm_hypercall_set_ret_func_t set_ret_func)
 {
        unsigned long ret;
 
@@ -10076,6 +10077,7 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu
*vcpu, unsigned long nr,
 
                WARN_ON_ONCE(vcpu->run->hypercall.flags &
KVM_EXIT_HYPERCALL_MBZ);
                vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+               /* MAP_GPA tosses the request to the user space. */
                /* stat is incremented on completion. */
                return 0;
        }
@@ -10085,16 +10087,26 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu
*vcpu, unsigned long nr,
        }
 
 out:
+       if (!op_64_bit)
+               ret = (u32)ret;
+       (*set_ret_func)(vcpu, ret);
+
        ++vcpu->stat.hypercalls;
-       return ret;
+       return 1;
 }
 EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
 
+static void kvm_hypercall_set_ret_default(struct kvm_vcpu *vcpu,
+                                         unsigned long val)
+{
+       kvm_rax_write(vcpu, val);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
-       unsigned long nr, a0, a1, a2, a3, ret;
+       unsigned long nr, a0, a1, a2, a3;
        int op_64_bit;
-       int cpl;
+       int cpl, ret;
 
        if (kvm_xen_hypercall_enabled(vcpu->kvm))
                return kvm_xen_hypercall(vcpu);
@@ -10110,14 +10122,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
        op_64_bit = is_64_bit_hypercall(vcpu);
        cpl = kvm_x86_call(get_cpl)(vcpu);
 
-       ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
-       if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
-               /* MAP_GPA tosses the request to the user space. */
-               return 0;
-
-       if (!op_64_bit)
-               ret = (u32)ret;
-       kvm_rax_write(vcpu, ret);
+       ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl,
+                       kvm_hypercall_set_ret_default);
+       if (ret <= 0)
+               return ret;
 
        return kvm_skip_emulated_instruction(vcpu);
 }