This patch implements MMU notifiers for KVM RISC-V so that Guest physical address space is in-sync with Host physical address space. This will allow swapping, page migration, etc to work transparently with KVM RISC-V. Signed-off-by: Anup Patel <anup.patel@xxxxxxx> Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> Reviewed-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> --- arch/riscv/include/asm/kvm_host.h | 7 ++ arch/riscv/kvm/Kconfig | 1 + arch/riscv/kvm/mmu.c | 200 +++++++++++++++++++++++++++++- arch/riscv/kvm/vm.c | 1 + 4 files changed, 208 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index a37775c92586..ab33e59a3d88 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -192,6 +192,13 @@ static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} int kvm_riscv_setup_vsip(void); void kvm_riscv_cleanup_vsip(void); +#define KVM_ARCH_WANT_MMU_NOTIFIER +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end); +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); + extern void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long vmid, unsigned long gpa); extern void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 35fd30d0e432..002e14ee37f6 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -20,6 +20,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on OF + select MMU_NOTIFIER select PREEMPT_NOTIFIERS select ANON_INODES select KVM_MMIO diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 9e95ab6769f6..0b8e46aebb02 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -67,6 +67,66 @@ static void *stage2_cache_alloc(struct kvm_mmu_page_cache *pcache) return p; } +static int stage2_pgdp_test_and_clear_young(pgd_t *pgd) +{ + return ptep_test_and_clear_young(NULL, 0, (pte_t *)pgd); +} + +static int stage2_pmdp_test_and_clear_young(pmd_t *pmd) +{ + return ptep_test_and_clear_young(NULL, 0, (pte_t *)pmd); +} + +static int stage2_ptep_test_and_clear_young(pte_t *pte) +{ + return ptep_test_and_clear_young(NULL, 0, pte); +} + +static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, + pgd_t **pgdpp, pmd_t **pmdpp, pte_t **ptepp) +{ + pgd_t *pgdp; + pmd_t *pmdp; + pte_t *ptep; + + *pgdpp = NULL; + *pmdpp = NULL; + *ptepp = NULL; + + pgdp = &kvm->arch.pgd[pgd_index(addr)]; + if (!pgd_val(*pgdp)) + return false; + if (pgd_val(*pgdp) & _PAGE_LEAF) { + *pgdpp = pgdp; + return true; + } + + if (stage2_have_pmd) { + pmdp = (void *)pgd_page_vaddr(*pgdp); + pmdp = &pmdp[pmd_index(addr)]; + if (!pmd_present(*pmdp)) + return false; + if (pmd_val(*pmdp) & _PAGE_LEAF) { + *pmdpp = pmdp; + return true; + } + + ptep = (void *)pmd_page_vaddr(*pmdp); + } else { + ptep = (void *)pgd_page_vaddr(*pgdp); + } + + ptep = &ptep[pte_index(addr)]; + if (!pte_present(*ptep)) + return false; + if (pte_val(*ptep) & _PAGE_LEAF) { + *ptepp = ptep; + return true; + } + + return false; +} + struct local_guest_tlb_info { struct kvm_vmid *vmid; gpa_t addr; @@ -444,6 +504,38 @@ int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, } +static int handle_hva_to_gpa(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + gpa_t gpa, u64 size, + void *data), + void *data) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = 0; + + slots = kvm_memslots(kvm); + + /* we only care about the pages that the guest sees */ + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gpa; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); + } + + return ret; +} + void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { @@ -576,6 +668,106 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, return ret; } +static int kvm_unmap_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + stage2_unmap_range(kvm, gpa, size); + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + handle_hva_to_gpa(kvm, start, end, + &kvm_unmap_hva_handler, NULL); + return 0; +} + +static int kvm_set_spte_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pte_t *pte = (pte_t *)data; + + WARN_ON(size != PAGE_SIZE); + stage2_set_pte(kvm, NULL, gpa, pte); + + return 0; +} + +int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + unsigned long end = hva + PAGE_SIZE; + kvm_pfn_t pfn = pte_pfn(pte); + pte_t stage2_pte; + + if (!kvm->arch.pgd) + return 0; + + stage2_pte = pfn_pte(pfn, PAGE_WRITE_EXEC); + handle_hva_to_gpa(kvm, hva, end, + &kvm_set_spte_handler, &stage2_pte); + + return 0; +} + +static int kvm_age_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pgd, &pmd, &pte)) + return 0; + + if (pgd) + return stage2_pgdp_test_and_clear_young(pgd); + else if (pmd) + return stage2_pmdp_test_and_clear_young(pmd); + else + return stage2_ptep_test_and_clear_young(pte); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + if (!kvm->arch.pgd) + return 0; + + return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); +} + +static int kvm_test_age_hva_handler(struct kvm *kvm, + gpa_t gpa, u64 size, void *data) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); + if (!stage2_get_leaf_entry(kvm, gpa, &pgd, &pmd, &pte)) + return 0; + + if (pgd) + return pte_young(*((pte_t *)pgd)); + else if (pmd) + return pte_young(*((pte_t *)pmd)); + else + return pte_young(*pte); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.pgd) + return 0; + + return handle_hva_to_gpa(kvm, hva, hva, + kvm_test_age_hva_handler, NULL); +} + int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva, bool is_write) { @@ -587,7 +779,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva, struct vm_area_struct *vma; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_page_cache *pcache = &vcpu->arch.mmu_page_cache; - unsigned long vma_pagesize; + unsigned long vma_pagesize, mmu_seq; down_read(¤t->mm->mmap_sem); @@ -617,6 +809,8 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva, return ret; } + mmu_seq = kvm->mmu_notifier_seq; + hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writeable); if (hfn == KVM_PFN_ERR_HWPOISON) { if (is_vm_hugetlb_page(vma)) @@ -635,6 +829,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva, spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + if (writeable) { kvm_set_pfn_dirty(hfn); ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, @@ -647,6 +844,7 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long hva, if (ret) kvm_err("Failed to map in stage2\n"); +out_unlock: spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(hfn); kvm_release_pfn_clean(hfn); diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index c5aab5478c38..fd84b4d914dc 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -54,6 +54,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) switch (ext) { case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: + case KVM_CAP_SYNC_MMU: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_ONE_REG: case KVM_CAP_READONLY_MEM: -- 2.17.1