Thank you for your comments. I now understand the scenario in which my
previous change was unsafe. I also like the idea of KVM_VCPU_IN_HOST for
tracking whether a vCPU is scheduled out, since it lets the guest see
what state the vCPU is in.

I have tested the attached changes, but the performance improvement was
somewhat limited. One likely reason is that the instruction-boundary
check prevents KVM from setting KVM_VCPU_IN_HOST and KVM_VCPU_PREEMPTED
in many cases. I understand this is a conservative approach to avoid
consuming stale TLB entries; I referred to this patch:
https://lore.kernel.org/lkml/20220614021116.1101331-1-sashal@xxxxxxxxxx/

Since PV waiting causes the most significant overhead, would it be
possible to let guests perform a PV flush when the target vCPUs are in
PV wait and scheduled out?
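To illustrate, here is a rough, untested sketch of the guest-side test I
have in mind. KVM_VCPU_PV_WAIT and pv_flush_allowed() are hypothetical
names I made up for this mail; the bit would have to be set by the host
while the vCPU is blocked after a HLT exit from PV wait and cleared
before the vCPU re-enters guest mode:

/* First three bits as defined in arch/x86/include/uapi/asm/kvm_para.h. */
#define KVM_VCPU_PREEMPTED	(1 << 0)
#define KVM_VCPU_FLUSH_TLB	(1 << 1)
#define KVM_VCPU_IN_HOST	(1 << 2)
#define KVM_VCPU_PV_WAIT	(1 << 3)	/* hypothetical, not in the ABI */

/*
 * Whether a remote vCPU can take a deferred (PV) flush instead of a
 * flush IPI.  A vCPU that blocked in kvm_wait() executed HLT, so it is
 * parked on an instruction boundary and cannot be consuming stale TLB
 * entries; like the preempted case, it would pick up KVM_VCPU_FLUSH_TLB
 * before running guest code again.
 */
static inline bool pv_flush_allowed(u8 state)
{
	return state & (KVM_VCPU_PREEMPTED | KVM_VCPU_IN_HOST |
			KVM_VCPU_PV_WAIT);
}

kvm_flush_tlb_multi() would then use pv_flush_allowed(state) in place of
the explicit bit tests in the kernel/kvm.c hunk of the attached patch.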
> diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
> index a1efa7907a0b..e3a6e6ecf70b 100644
> --- a/arch/x86/include/uapi/asm/kvm_para.h
> +++ b/arch/x86/include/uapi/asm/kvm_para.h
> @@ -70,6 +70,7 @@ struct kvm_steal_time {
>
>  #define KVM_VCPU_PREEMPTED	(1 << 0)
>  #define KVM_VCPU_FLUSH_TLB	(1 << 1)
> +#define KVM_VCPU_IN_HOST	(1 << 2)
>
>  #define KVM_CLOCK_PAIRING_WALLCLOCK	0
>  struct kvm_clock_pairing {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index acdd72e89bb0..5e3dc209e86c 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5018,8 +5018,11 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
>  	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
>  	struct kvm_steal_time __user *st;
>  	struct kvm_memslots *slots;
> -	static const u8 preempted = KVM_VCPU_PREEMPTED;
>  	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
> +	u8 preempted = KVM_VCPU_IN_HOST;
> +
> +	if (vcpu->preempted)
> +		preempted |= KVM_VCPU_PREEMPTED;
>
>  	/*
>  	 * The vCPU can be marked preempted if and only if the VM-Exit was on
> @@ -5037,7 +5040,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
>  	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
>  		return;
>
> -	if (vcpu->arch.st.preempted)
> +	if (vcpu->arch.st.preempted == preempted)

This code may clear KVM_VCPU_FLUSH_TLB: if the cached value no longer
matches, the copy_to_user_nofault() below rewrites st->preempted and can
drop a flush request that another vCPU has set in the meantime.
https://lore.kernel.org/lkml/20200803121849.869521189@xxxxxxxxxxxxxxxxxxx/

>  		return;
>
>  	/* This happens on process exit */
> @@ -5055,7 +5058,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
>  	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
>
>  	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
> -		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
> +		vcpu->arch.st.preempted = preempted;
>
>  	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
>  }
> @@ -5064,7 +5067,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  {
>  	int idx;
>
> -	if (vcpu->preempted) {
> +	if (vcpu->preempted || !kvm_xen_msr_enabled(vcpu->kvm)) {
>  		/*
>  		 * Assume protected guests are in-kernel.  Inefficient yielding
>  		 * due to false positives is preferable to never yielding due

---
 arch/x86/include/uapi/asm/kvm_para.h |  1 +
 arch/x86/kernel/kvm.c                |  2 +-
 arch/x86/kvm/x86.c                   | 11 +++++++----
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index a1efa7907a0b..6ef06c3f234b 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -70,6 +70,7 @@ struct kvm_steal_time {
 
 #define KVM_VCPU_PREEMPTED	(1 << 0)
 #define KVM_VCPU_FLUSH_TLB	(1 << 1)
+#define KVM_VCPU_IN_HOST	(1 << 2)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK	0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 21e9e4845354..17ea1111a158 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -668,7 +668,7 @@ static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
 		 */
 		src = &per_cpu(steal_time, cpu);
 		state = READ_ONCE(src->preempted);
-		if ((state & KVM_VCPU_PREEMPTED)) {
+		if ((state & KVM_VCPU_PREEMPTED) || (state & KVM_VCPU_IN_HOST)) {
 			if (try_cmpxchg(&src->preempted, &state,
 					state | KVM_VCPU_FLUSH_TLB))
 				__cpumask_clear_cpu(cpu, flushmask);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c79a8cc57ba4..27b558ae1ad2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5019,8 +5019,11 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
 	struct kvm_steal_time __user *st;
 	struct kvm_memslots *slots;
-	static const u8 preempted = KVM_VCPU_PREEMPTED;
 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+	u8 preempted = KVM_VCPU_IN_HOST;
+
+	if (vcpu->preempted)
+		preempted |= KVM_VCPU_PREEMPTED;
 
 	/*
 	 * The vCPU can be marked preempted if and only if the VM-Exit was on
@@ -5038,7 +5041,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	if (vcpu->arch.st.preempted)
+	if (vcpu->arch.st.preempted == preempted)
 		return;
 
 	/* This happens on process exit */
@@ -5056,7 +5059,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
 
 	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
-		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+		vcpu->arch.st.preempted = preempted;
 
 	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
@@ -5065,7 +5068,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
-	if (vcpu->preempted) {
+	if (vcpu->preempted || !kvm_xen_msr_enabled(vcpu->kvm)) {
 		/*
 		 * Assume protected guests are in-kernel.  Inefficient yielding
 		 * due to false positives is preferable to never yielding due
-- 
2.25.1