This patch adds kvm_arch_test_clear_young() for the vast majority of VMs that are not nested and run on hardware with Radix MMU enabled. It relies on two techniques, RCU and cmpxchg, to safely test and clear the accessed bit without taking the MMU lock. The former protects KVM page tables from being freed while the latter clears the accessed bit atomically against both the hardware and other software page table walkers. Signed-off-by: Yu Zhao <yuzhao@xxxxxxxxxx> --- arch/powerpc/include/asm/kvm_host.h | 18 ++++++ arch/powerpc/include/asm/kvm_ppc.h | 14 +---- arch/powerpc/kvm/book3s.c | 7 +++ arch/powerpc/kvm/book3s.h | 2 + arch/powerpc/kvm/book3s_64_mmu_radix.c | 78 +++++++++++++++++++++++++- arch/powerpc/kvm/book3s_hv.c | 10 ++-- 6 files changed, 110 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index caea15dcb91d..996850029ce0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -886,4 +886,22 @@ static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + +static inline bool kvmhv_on_pseries(void) +{ + return IS_ENABLED(CONFIG_PPC_PSERIES) && !cpu_has_feature(CPU_FTR_HVMODE); +} + +/* see the comments on the generic kvm_arch_has_test_clear_young() */ +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young +static inline bool kvm_arch_has_test_clear_young(void) +{ + return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && + kvmppc_radix_possible() && !kvmhv_on_pseries(); +} + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index eae9619b6190..0bb772fc12b1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -277,6 +277,8 @@ struct kvmppc_ops { bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); + bool (*test_clear_young)(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); @@ -580,18 +582,6 @@ static inline bool kvm_hv_mode_active(void) { return false; } #endif -#ifdef CONFIG_PPC_PSERIES -static inline bool kvmhv_on_pseries(void) -{ - return !cpu_has_feature(CPU_FTR_HVMODE); -} -#else -static inline bool kvmhv_on_pseries(void) -{ - return false; -} -#endif - #ifdef CONFIG_KVM_XICS static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6d525285dbe8..f4cf330e3e81 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -877,6 +877,13 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return kvm->arch.kvm_ops->test_age_gfn(kvm, range); } +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + return kvm->arch.kvm_ops->test_clear_young && + kvm->arch.kvm_ops->test_clear_young(kvm, range, lsb_gfn, bitmap); +} + bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { return kvm->arch.kvm_ops->set_spte_gfn(kvm, range); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 58391b4b32ed..fe9cac423817 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -12,6 +12,8 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); +extern bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap); extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range); extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 9d3743ca16d5..8476646c554c 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -1083,6 +1083,78 @@ bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, return ref; } +bool kvmhv_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range, + gfn_t lsb_gfn, unsigned long *bitmap) +{ + bool success; + gfn_t gfn = range->start; + + if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young())) + return false; + + /* + * This function relies on two techniques, RCU and cmpxchg, to safely + * test and clear the accessed bit without taking the MMU lock. The + * former protects KVM page tables from being freed while the latter + * clears the accessed bit atomically against both the hardware and + * other software page table walkers. + */ + rcu_read_lock(); + + success = kvm_is_radix(kvm); + if (!success) + goto unlock; + + /* + * case 1: this function kvmppc_switch_mmu_to_hpt() + * + * rcu_read_lock() + * test kvm_is_radix() kvm->arch.radix = 0 + * use kvm->arch.pgtable + * rcu_read_unlock() + * synchronize_rcu() + * kvmppc_free_radix() + * + * + * case 2: this function kvmppc_switch_mmu_to_radix() + * + * kvmppc_init_vm_radix() + * smp_wmb() + * test kvm_is_radix() kvm->arch.radix = 1 + * smp_rmb() + * use kvm->arch.pgtable + */ + smp_rmb(); + + while (gfn < range->end) { + pte_t *ptep; + pte_t old, new; + unsigned int shift; + + ptep = find_kvm_secondary_pte_unlocked(kvm, gfn * PAGE_SIZE, &shift); + if (!ptep) + goto next; + + VM_WARN_ON_ONCE(!page_count(virt_to_page(ptep))); + + old = READ_ONCE(*ptep); + if (!pte_present(old) || !pte_young(old)) + goto next; + + new = pte_mkold(old); + + /* see the comments on the generic kvm_arch_has_test_clear_young() */ + if (__test_and_change_bit(lsb_gfn - gfn, bitmap)) + pte_xchg(ptep, old, new); +next: + gfn += shift ? BIT(shift - PAGE_SHIFT) : 1; + } +unlock: + rcu_read_unlock(); + + return success; +} + /* Returns the number of PAGE_SIZE pages that are dirty */ static int kvm_radix_test_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, int pagenum) @@ -1464,13 +1536,15 @@ int kvmppc_radix_init(void) { unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; - kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, + SLAB_TYPESAFE_BY_RCU, pte_ctor); if (!kvm_pte_cache) return -ENOMEM; size = sizeof(void *) << RADIX_PMD_INDEX_SIZE; - kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor); + kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, + SLAB_TYPESAFE_BY_RCU, pmd_ctor); if (!kvm_pmd_cache) { kmem_cache_destroy(kvm_pte_cache); return -ENOMEM; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6ba68dd6190b..17b415661282 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5242,6 +5242,8 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm) spin_lock(&kvm->mmu_lock); kvm->arch.radix = 0; spin_unlock(&kvm->mmu_lock); + /* see the comments in kvmhv_test_clear_young() */ + synchronize_rcu(); kvmppc_free_radix(kvm); lpcr = LPCR_VPM1; @@ -5266,6 +5268,8 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm) if (err) return err; kvmppc_rmap_reset(kvm); + /* see the comments in kvmhv_test_clear_young() */ + smp_wmb(); /* Mutual exclusion with kvm_unmap_gfn_range etc. */ spin_lock(&kvm->mmu_lock); kvm->arch.radix = 1; @@ -6165,6 +6169,7 @@ static struct kvmppc_ops kvm_ops_hv = { .unmap_gfn_range = kvm_unmap_gfn_range_hv, .age_gfn = kvm_age_gfn_hv, .test_age_gfn = kvm_test_age_gfn_hv, + .test_clear_young = kvmhv_test_clear_young, .set_spte_gfn = kvm_set_spte_gfn_hv, .free_memslot = kvmppc_core_free_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, @@ -6225,11 +6230,6 @@ static int kvm_init_subcore_bitmap(void) return 0; } -static int kvmppc_radix_possible(void) -{ - return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); -} - static int kvmppc_book3s_init_hv(void) { int r; -- 2.39.2.637.g21b0678d19-goog