On Fri, Sep 06, 2024, Vipin Sharma wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 455caaaa04f5..fc597f66aa11 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7317,8 +7317,8 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
>  	return err;
>  }
>  
> -void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
> -			       unsigned long nr_pages)
> +void kvm_recover_nx_huge_pages(struct kvm *kvm, bool shared,
> +			       struct list_head *pages, unsigned long nr_pages)
>  {
>  	struct kvm_memory_slot *slot;
>  	int rcu_idx;
> @@ -7329,7 +7329,10 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
>  	ulong to_zap;
>  
>  	rcu_idx = srcu_read_lock(&kvm->srcu);
> -	write_lock(&kvm->mmu_lock);
> +	if (shared)

Hmm, what if we do this?

enum kvm_mmu_types {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

#ifndef CONFIG_X86_64
#define KVM_TDP_MMU -1
#endif

And then this becomes:

	if (mmu_type == KVM_TDP_MMU)

> +		read_lock(&kvm->mmu_lock);
> +	else
> +		write_lock(&kvm->mmu_lock);
>  
>  	/*
>  	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
> @@ -7341,8 +7344,13 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
>  	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
>  	to_zap = ratio ? DIV_ROUND_UP(nr_pages, ratio) : 0;
>  	for ( ; to_zap; --to_zap) {
> -		if (list_empty(pages))
> +		if (tdp_mmu_enabled)

Shouldn't this be?

		if (shared)

Or if we do the above

		if (mmu_type == KVM_TDP_MMU)

Actually, better idea (sans comments):

	if (mmu_type == KVM_TDP_MMU) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_pages_lock(kvm);
	} else {
		write_lock(&kvm->mmu_lock);
	}

	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(possible_nx->nr_pages, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(possible_nx->pages))
			break;

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_unlock(kvm);

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_lock(kvm);
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	rcu_read_unlock();

	if (mmu_type == KVM_TDP_MMU) {
		kvm_tdp_mmu_pages_unlock(kvm);
		read_unlock(&kvm->mmu_lock);
	} else {
		write_unlock(&kvm->mmu_lock);
	}
	srcu_read_unlock(&kvm->srcu, rcu_idx);

> @@ -825,23 +835,51 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  	rcu_read_unlock();
>  }
>  
> -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,

This rename, and any refactoring that is associated with said rename, e.g.
comments, belongs in a separate patch.

> +					   struct kvm_mmu_page *sp)
>  {
> -	u64 old_spte;
> +	struct tdp_iter iter = {
> +		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
> +		.sptep = sp->ptep,
> +		.level = sp->role.level + 1,
> +		.gfn = sp->gfn,
> +		.as_id = kvm_mmu_page_as_id(sp),
> +	};
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);

Newline here, to isolate the lockdep assertion from the functional code.

> +	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
> +		return false;
>  
>  	/*
> -	 * This helper intentionally doesn't allow zapping a root shadow page,
> -	 * which doesn't have a parent page table and thus no associated entry.
> +	 * Root shadow pages don't a parent page table and thus no associated

Missed a word or three.

> +	 * entry, but they can never be possible NX huge pages.
>  	 */
>  	if (WARN_ON_ONCE(!sp->ptep))
>  		return false;
>  
> -	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> -	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> +	/*
> +	 * Since mmu_lock is held in read mode, it's possible another task has
> +	 * already modified the SPTE. Zap the SPTE if and only if the SPTE
> +	 * points at the SP's page table, as checking shadow-present isn't
> +	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
> +	 * another SP. Note, spte_to_child_pt() also checks that the SPTE is
> +	 * shadow-present, i.e. guards against zapping a frozen SPTE.
> +	 */
> +	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
>  		return false;
>  
> -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> +	/*
> +	 * If a different task modified the SPTE, then it should be impossible
> +	 * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
> +	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
> +	 * creating non-leaf SPTEs, and all other bits are immutable for non-
> +	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
> +	 * zapping and replacement.
> +	 */
> +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
> +		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
> +		return false;
> +	}
>  
>  	return true;
>  }
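
For what it's worth, here is a minimal standalone sketch of the mmu_type
dispatch I'm suggesting above.  It is NOT kernel code: the mock_*() helpers
are made-up stand-ins for kvm->mmu_lock and the TDP MMU pages lock so the
thing compiles on its own (with or without -DCONFIG_X86_64); only the enum
and the "mmu_type == KVM_TDP_MMU" branching mirror the actual proposal.

/* Standalone sketch, not kernel code.  mock_*() are invented stand-ins. */
#include <stdio.h>

enum kvm_mmu_types {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

#ifndef CONFIG_X86_64
#define KVM_TDP_MMU -1
#endif

/* Stand-ins for operations on kvm->mmu_lock and the TDP MMU pages lock. */
static void mock_read_lock(void)		{ puts("read_lock(&kvm->mmu_lock)"); }
static void mock_read_unlock(void)		{ puts("read_unlock(&kvm->mmu_lock)"); }
static void mock_write_lock(void)		{ puts("write_lock(&kvm->mmu_lock)"); }
static void mock_write_unlock(void)		{ puts("write_unlock(&kvm->mmu_lock)"); }
static void mock_tdp_mmu_pages_lock(void)	{ puts("kvm_tdp_mmu_pages_lock(kvm)"); }
static void mock_tdp_mmu_pages_unlock(void)	{ puts("kvm_tdp_mmu_pages_unlock(kvm)"); }

static void recover_nx_huge_pages(int mmu_type)
{
	/*
	 * TDP MMU recovery runs with mmu_lock held for read plus the pages
	 * lock to protect the possible NX huge page list; the shadow MMU
	 * needs mmu_lock held for write.
	 */
	if (mmu_type == KVM_TDP_MMU) {
		mock_read_lock();
		mock_tdp_mmu_pages_lock();
	} else {
		mock_write_lock();
	}

	/*
	 * ... zap loop elided; per the snippet above, it would drop and
	 * re-take the pages lock around the actual zap ...
	 */

	if (mmu_type == KVM_TDP_MMU) {
		mock_tdp_mmu_pages_unlock();
		mock_read_unlock();
	} else {
		mock_write_unlock();
	}
}

int main(void)
{
	recover_nx_huge_pages(KVM_SHADOW_MMU);
#ifdef CONFIG_X86_64
	recover_nx_huge_pages(KVM_TDP_MMU);
#endif
	return 0;
}

Compile with or without -DCONFIG_X86_64 to see which locks each path would
take; without it, KVM_TDP_MMU falls back to -1 and the TDP MMU branches
compile away to dead code, which is the whole point of the #define trick.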