Re: [PATCH v3 1/2] KVM: x86: Check hypercall's exit to userspace generically

Binbin Wu <binbin.wu@xxxxxxxxxxxxxxx> · Mon, 4 Nov 2024 16:49:52 +0800

On 11/2/2024 12:39 AM, Sean Christopherson wrote:
On Fri, Nov 01, 2024, Kai Huang wrote:
On Thu, 2024-10-31 at 07:54 -0700, Sean Christopherson wrote:
On Thu, Oct 31, 2024, Kai Huang wrote:
-	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
-	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
-		/* MAP_GPA tosses the request to the user space. */
-		return 0;
+	r = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, &ret);
+	if (r <= r)
+		return r;
... should be:

	if (r <= 0)
		return r;

?

Another option might be we move "set hypercall return value" code inside
__kvm_emulate_hypercall().  So IIUC the reason to split
__kvm_emulate_hypercall() out is for TDX, and while non-TDX uses RAX to carry
the hypercall return value, TDX uses R10.

We can additionally pass a "kvm_hypercall_set_ret_func" function pointer to
__kvm_emulate_hypercall(), and invoke it inside.  Then we can change
__kvm_emulate_hypercall() to return:
     < 0 error,
     ==0 return to userspace,
     > 0 go back to guest.
Hmm, and the caller can still handle kvm_skip_emulated_instruction(), because the
return value is KVM's normal pattern.

I like it!

But, there's no need to pass a function pointer, KVM can write (and read) arbitrary
GPRs, it's just avoided in most cases so that the sanity checks and available/dirty
updates are elided.  For this code though, it's easy enough to keep kvm_rxx_read()
for getting values, and eating the overhead of a single GPR write is a perfectly
fine tradeoff for eliminating the return multiplexing.

Lightly tested.  Assuming this works for TDX and passes testing, I'll post a
mini-series next week.

--
From: Sean Christopherson <seanjc@xxxxxxxxxx>
Date: Fri, 1 Nov 2024 09:04:00 -0700
Subject: [PATCH] KVM: x86: Refactor __kvm_emulate_hypercall() to accept reg
  names, not values

Rework __kvm_emulate_hypercall() to take the names of input and output
(guest return value) registers, as opposed to taking the input values and
returning the output value.  As part of the refactor, change the actual
return value from __kvm_emulate_hypercall() to be KVM's de facto standard
of '0' == exit to userspace, '1' == resume guest, and -errno == failure.

Using the return value for KVM's control flow eliminates the multiplexed
return value, where '0' for KVM_HC_MAP_GPA_RANGE (and only that hypercall)
means "exit to userspace".

Use the direct GPR accessors to read values to avoid the pointless marking
of the registers as available, but use kvm_register_write_raw() for the
guest return value so that the innermost helper doesn't need to multiplex
its return value.  Using the generic kvm_register_write_raw() adds very
minimal overhead, so as a one-off in a relatively slow path it's well
worth the code simplification.

Suggested-by: Kai Huang <kai.huang@xxxxxxxxx>
Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
  arch/x86/include/asm/kvm_host.h | 15 +++++++++----
  arch/x86/kvm/x86.c              | 40 +++++++++++++--------------------
  2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6d9f763a7bb9..9e66fde1c4e4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2179,10 +2179,17 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
  	kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
  }
  
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl);
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			      unsigned long a0, unsigned long a1,
+			      unsigned long a2, unsigned long a3,
+			      int op_64_bit, int cpl, int ret_reg);
+
+#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, ret)	\
+	____kvm_emulate_hypercall(vcpu,						\
+				  kvm_##nr##_read(vcpu), kvm_##a0##_read(vcpu),	\
+				  kvm_##a1##_read(vcpu), kvm_##a2##_read(vcpu),	\
+				  kvm_##a3##_read(vcpu), op_64_bit, cpl, VCPU_REGS_##ret)
+
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
  
  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e09daa3b157c..425a301911a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9998,10 +9998,10 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
  	return kvm_skip_emulated_instruction(vcpu);
  }
  
-unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
-				      unsigned long a0, unsigned long a1,
-				      unsigned long a2, unsigned long a3,
-				      int op_64_bit, int cpl)
+int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
+			      unsigned long a0, unsigned long a1,
+			      unsigned long a2, unsigned long a3,
+			      int op_64_bit, int cpl, int ret_reg)
  {
  	unsigned long ret;
  
@@ -10086,15 +10086,18 @@ unsigned long __kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
  
  out:
  	++vcpu->stat.hypercalls;
-	return ret;
+
+	if (!op_64_bit)
+		ret = (u32)ret;
+
+	kvm_register_write_raw(vcpu, ret_reg, ret);
+	return 1;
  }
-EXPORT_SYMBOL_GPL(__kvm_emulate_hypercall);
+EXPORT_SYMBOL_GPL(____kvm_emulate_hypercall);
  
  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
  {
-	unsigned long nr, a0, a1, a2, a3, ret;
-	int op_64_bit;
-	int cpl;
+	int r;
  
  	if (kvm_xen_hypercall_enabled(vcpu->kvm))
  		return kvm_xen_hypercall(vcpu);
@@ -10102,23 +10105,12 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
  	if (kvm_hv_hypercall_enabled(vcpu))
  		return kvm_hv_hypercall(vcpu);
  
-	nr = kvm_rax_read(vcpu);
-	a0 = kvm_rbx_read(vcpu);
-	a1 = kvm_rcx_read(vcpu);
-	a2 = kvm_rdx_read(vcpu);
-	a3 = kvm_rsi_read(vcpu);
-	op_64_bit = is_64_bit_hypercall(vcpu);
-	cpl = kvm_x86_call(get_cpl)(vcpu);
-
-	ret = __kvm_emulate_hypercall(vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl);
-	if (nr == KVM_HC_MAP_GPA_RANGE && !ret)
-		/* MAP_GPA tosses the request to the user space. */
+	r = __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
+				    is_64_bit_hypercall(vcpu),
+				    kvm_x86_call(get_cpl)(vcpu), RAX);
Now, the register for return code of the hypercall can be specified.
But in  ____kvm_emulate_hypercall(), the complete_userspace_io callback
is hardcoded to complete_hypercall_exit(), which always set return code
to RAX.

We can allow the caller to pass in the cui callback, or assign different
version according to the input 'ret_reg'.  So that different callers can use
different cui callbacks.  E.g., TDX needs to set return code to R10 in cui
callback.

How about:

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dba78f22ab27..0fba98685f42 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2226,13 +2226,15 @@ static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
 int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
                              unsigned long a0, unsigned long a1,
                              unsigned long a2, unsigned long a3,
-                             int op_64_bit, int cpl, int ret_reg);
+                             int op_64_bit, int cpl, int ret_reg,
+                             int (*cui)(struct kvm_vcpu *vcpu));

-#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, ret)        \
+#define __kvm_emulate_hypercall(_vcpu, nr, a0, a1, a2, a3, op_64_bit, cpl, ret, cui)   \
____kvm_emulate_hypercall(vcpu, \
                                  kvm_##nr##_read(vcpu), kvm_##a0##_read(vcpu), \
                                  kvm_##a1##_read(vcpu), kvm_##a2##_read(vcpu), \
-                                 kvm_##a3##_read(vcpu), op_64_bit, cpl, VCPU_REGS_##ret)
+                                 kvm_##a3##_read(vcpu), op_64_bit, cpl, VCPU_REGS_##ret, \
+                                 cui)

 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6e0a518aec4a..b68690c4a4c0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10019,7 +10019,8 @@ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
 int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
                              unsigned long a0, unsigned long a1,
                              unsigned long a2, unsigned long a3,
-                             int op_64_bit, int cpl, int ret_reg)
+                             int op_64_bit, int cpl, int ret_reg,
+                             int (*cui)(struct kvm_vcpu *vcpu))
 {
        unsigned long ret;

@@ -10093,7 +10094,7 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, unsigned long nr,
                        vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE;

                WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ);
-               vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+               vcpu->arch.complete_userspace_io = cui;
                /* stat is incremented on completion. */
                return 0;
        }
@@ -10125,7 +10126,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)

        r = __kvm_emulate_hypercall(vcpu, rax, rbx, rcx, rdx, rsi,
                                    is_64_bit_hypercall(vcpu),
-                                   kvm_x86_call(get_cpl)(vcpu), RAX);
+                                   kvm_x86_call(get_cpl)(vcpu), RAX, complete_hypercall_exit);
        if (r <= 0)
                return 0;


+	if (r <= 0)
  		return 0;
  
-	if (!op_64_bit)
-		ret = (u32)ret;
-	kvm_rax_write(vcpu, ret);
-
  	return kvm_skip_emulated_instruction(vcpu);
  }
  EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);

base-commit: 911785b796e325dec83b32050f294e278a306211