Re: [RFC] Para-virtualized TLB flush for PV-waiting vCPUs

On Tue, Jan 07, 2025, Kenta Ishiguro wrote:
> Signed-off-by: Kenta Ishiguro <kentaishiguro@xxxxxxxxxxxxxxxxxxxx>
> ---
>  arch/x86/include/uapi/asm/kvm_para.h |  1 +
>  arch/x86/kernel/kvm.c                | 13 +++++++++++--
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index a1efa7907a0b..db26e167a707 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -70,6 +70,7 @@ struct kvm_steal_time {
>  
>  #define KVM_VCPU_PREEMPTED          (1 << 0)
>  #define KVM_VCPU_FLUSH_TLB          (1 << 1)
> +#define KVM_VCPU_IN_PVWAIT          (1 << 2)
>  
>  #define KVM_CLOCK_PAIRING_WALLCLOCK 0
>  struct kvm_clock_pairing {
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 21e9e4845354..f17057b7d263 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -668,7 +668,8 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
>  		 */
>  		src = &per_cpu(steal_time, cpu);
>  		state = READ_ONCE(src->preempted);
> -		if ((state & KVM_VCPU_PREEMPTED)) {
> +		if ((state & KVM_VCPU_PREEMPTED) ||
> +		    (state & KVM_VCPU_IN_PVWAIT)) {
>  			if (try_cmpxchg(&src->preempted, &state,
>  					state | KVM_VCPU_FLUSH_TLB))

This is unsafe.  KVM_VCPU_PREEMPTED relies on being set and cleared only by the
host, which guarantees that either the host observes KVM_VCPU_FLUSH_TLB before the
vCPU enters the guest, i.e. before the vCPU can possibly consume stale TLB entries,
or the try_cmpxchg() fails and the flushing vCPU falls back to sending an IPI.

If the host is already in the process of re-entering the guest on the waiting
vCPU, e.g. because it received a kick, then there will be no pending
KVM_REQ_STEAL_UPDATE in the host, the host won't process KVM_VCPU_FLUSH_TLB, and
the vCPU will resume with stale TLB entries.
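
Concretely, the interleaving I'm worried about is something like this (rough
sketch, the exact exit/entry points may differ):

  vCPU0 (flusher)                        vCPU1 (waiter)
  ---------------                        --------------
                                         kvm_wait(): sets KVM_VCPU_IN_PVWAIT
                                         halt() => HLT, exit to host
                                         host gets the kick and starts
                                         re-entering the guest; no
                                         KVM_REQ_STEAL_UPDATE is pending, so
                                         record_steal_time() isn't called
  kvm_flush_tlb_multi():
    sees KVM_VCPU_IN_PVWAIT
    sets KVM_VCPU_FLUSH_TLB
    clears vCPU1 from flushmask (no IPI)
                                         VM-Enter; KVM_VCPU_FLUSH_TLB is never
                                         consumed => stale TLB entries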

I also see no reason to limit this to kvm_wait(); the logic you are relying on is
really just "let KVM do the flushes if the vCPU is in the host".  There's a balance
to strike, e.g. toggling a flag on every entry+exit would get expensive, but
toggling at kvm_arch_vcpu_put() and then letting kvm_arch_vcpu_load() request
a steal_time update seems pretty straightforward.

E.g. this will also elide IPIs if the vCPU happens to be in host userspace doing
emulation of some kind, or if the vCPU is blocking.

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..e3a6e6ecf70b 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -70,6 +70,7 @@ struct kvm_steal_time {
 
 #define KVM_VCPU_PREEMPTED          (1 << 0)
 #define KVM_VCPU_FLUSH_TLB          (1 << 1)
+#define KVM_VCPU_IN_HOST           (1 << 2)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index acdd72e89bb0..5e3dc209e86c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5018,8 +5018,11 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
        struct kvm_steal_time __user *st;
        struct kvm_memslots *slots;
-       static const u8 preempted = KVM_VCPU_PREEMPTED;
        gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+       u8 preempted = KVM_VCPU_IN_HOST;
+
+       if (vcpu->preempted)
+               preempted |= KVM_VCPU_PREEMPTED;
 
        /*
         * The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5037,7 +5040,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                return;
 
-       if (vcpu->arch.st.preempted)
+       if (vcpu->arch.st.preempted == preempted)
                return;
 
        /* This happens on process exit */
@@ -5055,7 +5058,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
        BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
        if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
-               vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+               vcpu->arch.st.preempted = preempted;
 
        mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
@@ -5064,7 +5067,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        int idx;
 
-       if (vcpu->preempted) {
+       if (vcpu->preempted || !kvm_xen_msr_enabled(vcpu->kvm)) {
                /*
                 * Assume protected guests are in-kernel.  Inefficient yielding
                 * due to false positives is preferable to never yielding due

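The guest side of your patch then just checks the new flag, e.g. (untested,
spelled against your kvm_flush_tlb_multi() hunk above):

		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if (state & (KVM_VCPU_PREEMPTED | KVM_VCPU_IN_HOST)) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}

and the kvm_wait() change goes away entirely, since the flag is now set by
kvm_arch_vcpu_put() and cleared by the steal_time update triggered from
kvm_arch_vcpu_load().
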
>  				__cpumask_clear_cpu(cpu, flushmask);
> @@ -1045,6 +1046,9 @@ static void kvm_kick_cpu(int cpu)
>  
>  static void kvm_wait(u8 *ptr, u8 val)
>  {
> +	u8 state;
> +	struct kvm_steal_time *src;
> +
>  	if (in_nmi())
>  		return;
>  
> @@ -1054,8 +1058,13 @@ static void kvm_wait(u8 *ptr, u8 val)
>  	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
>  	 */
>  	if (irqs_disabled()) {
> -		if (READ_ONCE(*ptr) == val)
> +		if (READ_ONCE(*ptr) == val) {
> +			src = this_cpu_ptr(&steal_time);
> +			state = READ_ONCE(src->preempted);
> +			try_cmpxchg(&src->preempted, &state,
> +				    state | KVM_VCPU_IN_PVWAIT);
>  			halt();
> +		}
>  	} else {
>  		local_irq_disable();
>  
> -- 
> 2.25.1
> 



