On Fri, Jun 16, 2023 at 10:39:45AM +0800, Yan Zhao wrote:
> Serialize concurrent and repeated calls of kvm_zap_gfn_range() from every
> vCPU for CR0.CD toggles and MTRR updates when guest MTRRs are honored.
> 
> During guest boot-up, if guest MTRRs are honored by TDP, TDP zaps are
> triggered several times by each vCPU for CR0.CD toggles and MTRRs updates.
> This will take unexpected longer CPU cycles because of the contention of
> kvm->mmu_lock.
> 
> Therefore, introduce a mtrr_zap_list to remove duplicated zap and an atomic
> mtrr_zapping to allow only one vCPU to do the real zap work at one time.
> 
> Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Co-developed-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   4 +
>  arch/x86/kvm/mtrr.c             | 141 +++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c              |   2 +-
>  arch/x86/kvm/x86.h              |   1 +
>  4 files changed, 146 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 28bd38303d70..8da1517a1513 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1444,6 +1444,10 @@ struct kvm_arch {
>  	 */
>  #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
>  	struct kvm_mmu_memory_cache split_desc_cache;
> +
> +	struct list_head mtrr_zap_list;
> +	spinlock_t mtrr_zap_list_lock;
> +	atomic_t mtrr_zapping;
>  };
> 
>  struct kvm_vm_stat {
> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
> index b35dd0bc9cad..688748e3a4d2 100644
> --- a/arch/x86/kvm/mtrr.c
> +++ b/arch/x86/kvm/mtrr.c
> @@ -25,6 +25,8 @@
>  #define IA32_MTRR_DEF_TYPE_FE		(1ULL << 10)
>  #define IA32_MTRR_DEF_TYPE_TYPE_MASK	(0xff)
> 
> +static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
> +				   gfn_t gfn_start, gfn_t gfn_end);
>  static bool is_mtrr_base_msr(unsigned int msr)
>  {
>  	/* MTRR base MSRs use even numbers, masks use odd numbers. */
> @@ -341,7 +343,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
>  		var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end);
>  	}
> 
> -	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
> +	kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(start), gpa_to_gfn(end));
>  }
> 
>  static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
> @@ -437,6 +439,11 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
>  void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
>  {
>  	INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
> +
> +	if (vcpu->vcpu_id == 0) {
> +		spin_lock_init(&vcpu->kvm->arch.mtrr_zap_list_lock);
> +		INIT_LIST_HEAD(&vcpu->kvm->arch.mtrr_zap_list);
> +	}
>  }
> 
>  struct mtrr_iter {
> @@ -740,3 +747,135 @@ void kvm_mtrr_get_cd_memory_type(struct kvm_vcpu *vcpu, u8 *type, bool *ipat)
>  	}
>  }
>  EXPORT_SYMBOL_GPL(kvm_mtrr_get_cd_memory_type);
> +
> +struct mtrr_zap_range {
> +	gfn_t start;
> +	/* end is exclusive */
> +	gfn_t end;
> +	struct list_head node;
> +};
> +
> +static void kvm_clear_mtrr_zap_list(struct kvm *kvm)
> +{
> +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> +	struct mtrr_zap_range *tmp, *n;
> +
> +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +	list_for_each_entry_safe(tmp, n, head, node) {
> +		list_del(&tmp->node);
> +		kfree(tmp);
> +	}
> +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +}
> +
> +/*
> + * Add @range into kvm->arch.mtrr_zap_list and sort the list in
> + * "length" ascending + "start" descending order, so that
> + * ranges consuming more zap cycles can be dequeued later and their
> + * chances of being found duplicated are increased.
> + */
> +static void kvm_add_mtrr_zap_list(struct kvm *kvm, struct mtrr_zap_range *range)
> +{
> +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> +	u64 len = range->end - range->start;
> +	struct mtrr_zap_range *cur, *n;
> +	bool added = false;
> +
> +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +
> +	if (list_empty(head)) {
> +		list_add(&range->node, head);
> +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +		return;
> +	}
> +
> +	list_for_each_entry_safe(cur, n, head, node) {
> +		u64 cur_len = cur->end - cur->start;
> +
> +		if (len < cur_len)
> +			break;
> +
> +		if (len > cur_len)
> +			continue;
> +
> +		if (range->start > cur->start)
> +			break;
> +
> +		if (range->start < cur->start)
> +			continue;
> +
> +		/* equal len & start, no need to add */
> +		added = true;

Is it possible/worthwhile to also ignore a new range that is already fully
covered by a range queued on the list?
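Something like this completely untested sketch is what I have in mind
(kvm_mtrr_zap_range_covered() is just a made-up helper name; it only relies
on the struct mtrr_zap_range and mtrr_zap_list_lock introduced by this
patch):

        /* Must be called with mtrr_zap_list_lock held. */
        static bool kvm_mtrr_zap_range_covered(struct kvm *kvm,
                                               struct mtrr_zap_range *range)
        {
                struct mtrr_zap_range *cur;

                lockdep_assert_held(&kvm->arch.mtrr_zap_list_lock);

                /* Is the new range fully contained in a queued range? */
                list_for_each_entry(cur, &kvm->arch.mtrr_zap_list, node) {
                        if (cur->start <= range->start && cur->end >= range->end)
                                return true;
                }

                return false;
        }

Then kvm_add_mtrr_zap_list() could drop the new range right after taking
the lock:

        if (kvm_mtrr_zap_range_covered(kvm, range)) {
                kfree(range);
                spin_unlock(&kvm->arch.mtrr_zap_list_lock);
                return;
        }

Not sure the extra list walk is worth it for the list lengths expected
here, though.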
> +		kfree(range);
> +		break;
> +	}
> +
> +	if (!added)
> +		list_add_tail(&range->node, &cur->node);
> +
> +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +}
> +
> +static void kvm_zap_mtrr_zap_list(struct kvm *kvm)
> +{
> +	struct list_head *head = &kvm->arch.mtrr_zap_list;
> +	struct mtrr_zap_range *cur = NULL;
> +
> +	spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +
> +	while (!list_empty(head)) {
> +		u64 start, end;
> +
> +		cur = list_first_entry(head, typeof(*cur), node);
> +		start = cur->start;
> +		end = cur->end;
> +		list_del(&cur->node);
> +		kfree(cur);
> +		spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +
> +		kvm_zap_gfn_range(kvm, start, end);
> +
> +		spin_lock(&kvm->arch.mtrr_zap_list_lock);
> +	}
> +
> +	spin_unlock(&kvm->arch.mtrr_zap_list_lock);
> +}
> +
> +static void kvm_zap_or_wait_mtrr_zap_list(struct kvm *kvm)
> +{
> +	if (atomic_cmpxchg_acquire(&kvm->arch.mtrr_zapping, 0, 1) == 0) {
> +		kvm_zap_mtrr_zap_list(kvm);
> +		atomic_set_release(&kvm->arch.mtrr_zapping, 0);
> +		return;
> +	}
> +
> +	while (atomic_read(&kvm->arch.mtrr_zapping))
> +		cpu_relax();
> +}
> +
> +static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
> +				   gfn_t gfn_start, gfn_t gfn_end)
> +{
> +	struct mtrr_zap_range *range;
> +
> +	range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
> +	if (!range)
> +		goto fail;
> +
> +	range->start = gfn_start;
> +	range->end = gfn_end;
> +
> +	kvm_add_mtrr_zap_list(vcpu->kvm, range);
> +
> +	kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
> +	return;
> +
> +fail:
> +	kvm_clear_mtrr_zap_list(vcpu->kvm);

There's a very small race window here: kvm_clear_mtrr_zap_list() can
incorrectly clear queued ranges that have not been zapped by another thread
yet, e.g.:

  Thread A                        | Thread B
  kvm_add_mtrr_zap_list()         |
                                  | kvm_clear_mtrr_zap_list()
  kvm_zap_or_wait_mtrr_zap_list() |

Besides, calling kvm_clear_mtrr_zap_list() here looks unnecessary: the other
threads (A here) that put ranges on the queue will take care of them just
fine; see the untested sketch at the bottom of this mail.

> +	kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
> +}
> +
> +void kvm_zap_gfn_range_on_cd_toggle(struct kvm_vcpu *vcpu)
> +{
> +	return kvm_mtrr_zap_gfn_range(vcpu, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
> +}
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 32cc8bfaa5f1..74aac14a3c0b 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -943,7 +943,7 @@ void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned lon
> 
>  	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
>  	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm))
> -		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
> +		kvm_zap_gfn_range_on_cd_toggle(vcpu);
>  }
>  EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
> 
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 9781b4b32d68..be946aba2bf0 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -314,6 +314,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
>  bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
>  					  int page_num);
>  void kvm_mtrr_get_cd_memory_type(struct kvm_vcpu *vcpu, u8 *type, bool *ipat);
> +void kvm_zap_gfn_range_on_cd_toggle(struct kvm_vcpu *vcpu);
>  bool kvm_vector_hashing_enabled(void);
>  void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code);
>  int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
> -- 
> 2.17.1
> 
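To illustrate the suggestion above: kvm_mtrr_zap_gfn_range() could simply
fall back to zapping its own range directly on allocation failure and leave
whatever other vCPUs have queued alone. Untested sketch, only the failure
path differs from the patch:

        static void kvm_mtrr_zap_gfn_range(struct kvm_vcpu *vcpu,
                                           gfn_t gfn_start, gfn_t gfn_end)
        {
                struct mtrr_zap_range *range;

                range = kmalloc(sizeof(*range), GFP_KERNEL_ACCOUNT);
                if (!range) {
                        /*
                         * No memory to queue the range: zap only this vCPU's
                         * range directly.  Ranges queued by other vCPUs stay
                         * on the list and are zapped by whoever queued them.
                         */
                        kvm_zap_gfn_range(vcpu->kvm, gfn_start, gfn_end);
                        return;
                }

                range->start = gfn_start;
                range->end = gfn_end;

                kvm_add_mtrr_zap_list(vcpu->kvm, range);
                kvm_zap_or_wait_mtrr_zap_list(vcpu->kvm);
        }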