On Mon, 2024-11-04 at 16:49 +0800, Binbin Wu wrote:
> 
> 
> On 11/2/2024 12:39 AM, Sean Christopherson wrote:
> > On Fri, Nov 01, 2024, Kai Huang wrote:
> > > On Thu, 2024-10-31 at 07:54 -0700, Sean Christopherson wrote:
> > > > On Thu, Oct 31, 2024, Kai Huang wrote:
> > > > -	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > > > -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > > > -		/* MAP_GPA tosses the request to the user space. */
> > > > -		return 0;
> > > > +	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
> > > > +	if (r <= r)
> > > > +		return r;
> > > ... should be:
> > > 
> > > 	if (r <= 0)
> > > 		return r;
> > > 
> > > ?
> > > 
> > > Another option might be we move "set hypercall return value" code inside
> > > __kvm_emulate_hypercall().  So IIUC the reason to split
> > > __kvm_emulate_hypercall() out is for TDX, and while non-TDX uses RAX to carry
> > > the hypercall return value, TDX uses R10.
> > > 
> > > We can additionally pass a "kvm_hypercall_set_ret_func" function pointer to
> > > __kvm_emulate_hypercall(), and invoke it inside.  Then we can change
> > > __kvm_emulate_hypercall() to return:
> > > 	< 0 error,
> > > 	==0 return to userspace,
> > > 	> 0 go back to guest.
> > Hmm, and the caller can still handle kvm_skip_emulated_instruction(), because the
> > return value is KVM's normal pattern.
> > 
> > I like it!
> > 
> > But, there's no need to pass a function pointer, KVM can write (and read) arbitrary
> > GPRs, it's just avoided in most cases so that the sanity checks and available/dirty
> > updates are elided.  For this code though, it's easy enough to keep kvm_rxx_read()
> > for getting values, and eating the overhead of a single GPR write is a perfectly
> > fine tradeoff for eliminating the return multiplexing.
> > 
> > Lightly tested.  Assuming this works for TDX and passes testing, I'll post a
> > mini-series next week.
> > 
> > --
> > From: Sean Christopherson <seanjc@xxxxxxxxxx>
> > Date: Fri, 1 Nov 2024 09:04:00 -0700
> > Subject: [PATCH] KVM: x86: Refactor __kvm_emulate_hypercall() to accept reg
> >  names, not values
> > 
> > Rework __kvm_emulate_hypercall() to take the names of input and output
> > (guest return value) registers, as opposed to taking the input values and
> > returning the output value.  As part of the refactor, change the actual
> > return value from __kvm_emulate_hypercall() to be KVM's de facto standard
> > of '0' == exit to userspace, '1' == resume guest, and -errno == failure.
> > 
> > Using the return value for KVM's control flow eliminates the multiplexed
> > return value, where '0' for KVM_HC_MAP_GPA_RANGE (and only that hypercall)
> > means "exit to userspace".
> > 
> > Use the direct GPR accessors to read values to avoid the pointless marking
> > of the registers as available, but use kvm_register_write_raw() for the
> > guest return value so that the innermost helper doesn't need to multiplex
> > its return value.  Using the generic kvm_register_write_raw() adds very
> > minimal overhead, so as a one-off in a relatively slow path it's well
> > worth the code simplification.
> > 
> > Suggested-by: Kai Huang <kai.huang@xxxxxxxxx>
> > Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> > ---
> >  arch/x86/include/asm/kvm_host.h | 15 +++++++++----
> >  arch/x86/kvm/x86.c              | 40 +++++++++++++--------------------
> >  2 files changed, 27 insertions(+), 28 deletions(-)
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 6d9f763a7bb9..9e66fde1c4e4 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -2179,10 +2179,17 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
> >  	kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
> >  }
> >  
> > -unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
> > -				      unsigned long a0, unsigned long a1,
> > -				      unsigned long a2, unsigned long a3,
> > -				      int op_64_bit, int cpl);
> > +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
> > +			      unsigned long a0, unsigned long a1,
> > +			      unsigned long a2, unsigned long a3,
> > +			      int op_64_bit, int cpl, int ret_reg);
> > +
> > +#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, ret)	\
> > +	____kvm_emulate_hypercall(_vcpu,						\
> > +				  kvm_##nr##_read(_vcpu), kvm_##a0##_read(_vcpu),	\
> > +				  kvm_##a1##_read(_vcpu), kvm_##a2##_read(_vcpu),	\
> > +				  kvm_##a3##_read(_vcpu), op_64_bit, cpl, VCPU_REGS_##ret)
> > +
> >  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
> >  
> >  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index e09daa3b157c..425a301911a6 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -9998,10 +9998,10 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
> >  	return kvm_skip_emulated_instruction(vcpu);
> >  }
> >  
> > -unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
> > -				      unsigned long a0, unsigned long a1,
> > -				      unsigned long a2, unsigned long a3,
> > -				      int op_64_bit, int cpl)
> > +int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
> > +			      unsigned long a0, unsigned long a1,
> > +			      unsigned long a2, unsigned long a3,
> > +			      int op_64_bit, int cpl, int ret_reg)
> >  {
> >  	unsigned long ret;
> >  
> > @@ -10086,15 +10086,18 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
> >  
> >  out:
> >  	++vcpu->stat.hypercalls;
> > -	return ret;
> > +
> > +	if (!op_64_bit)
> > +		ret = (u32)ret;
> > +
> > +	kvm_register_write_raw(vcpu, ret_reg, ret);
> > +	return 1;
> >  }
> > -EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
> > +EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall);
> >  
> >  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >  {
> > -	unsigned long nr, a0, a1, a2, a3, ret;
> > -	int op_64_bit;
> > -	int cpl;
> > +	int r;
> >  
> >  	if (kvm_xen_hypercall_enabled(vcpu->kvm))
> >  		return kvm_xen_hypercall(vcpu);
> > @@ -10102,23 +10105,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
> >  	if (kvm_hv_hypercall_enabled(vcpu))
> >  		return kvm_hv_hypercall(vcpu);
> >  
> > -	nr = kvm_rax_read(vcpu);
> > -	a0 = kvm_rbx_read(vcpu);
> > -	a1 = kvm_rcx_read(vcpu);
> > -	a2 = kvm_rdx_read(vcpu);
> > -	a3 = kvm_rsi_read(vcpu);
> > -	op_64_bit = is_64_bit_hypercall(vcpu);
> > -	cpl = kvm_x86_call(get_cpl)(vcpu);
> > -
> > -	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
> > -	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
> > -		/* MAP_GPA tosses the request to the user space. */
> > -		return 0;
> > +	r = __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
> > +				    is_64_bit_hypercall(vcpu),
> > +				    kvm_x86_call(get_cpl)(vcpu), RAX);
> 
> Now, the register for the return code of the hypercall can be specified.
> But in ____kvm_emulate_hypercall(), the complete_userspace_io callback
> is hardcoded to complete_hypercall_exit(), which always sets the return
> code to RAX.
> 
> We can allow the caller to pass in the cui callback, or assign a
> different version according to the input 'ret_reg'.
> So that different callers can use different cui callbacks.  E.g., TDX
> needs to set the return code to R10 in the cui callback.
> 
> How about:
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index dba78f22ab27..0fba98685f42 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -2226,13 +2226,15 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
>  int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
>  			      unsigned long a0, unsigned long a1,
>  			      unsigned long a2, unsigned long a3,
> -			      int op_64_bit, int cpl, int ret_reg);
> +			      int op_64_bit, int cpl, int ret_reg,
> +			      int (*cui)(struct kvm_vcpu *vcpu));
> 

Does the below (incremental diff based on Sean's) work?

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 734dac079453..5131af97968d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10075,7 +10075,6 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
 			vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;
 
 		WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
-		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 		/* stat is incremented on completion. */
 		return 0;
 	}
@@ -10108,8 +10107,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	r = __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
 				    is_64_bit_hypercall(vcpu),
 				    kvm_x86_call(get_cpl)(vcpu), RAX);
-	if (r <= 0)
+	if (r <= 0) {
+		if (!r)
+			vcpu->arch.complete_userspace_io = complete_hypercall_exit;
 		return 0;
+	}
 
 	return kvm_skip_emulated_instruction(vcpu);
 }