Convert unmap_stage2_range() to use kvm_pgtable_stage2_unmap() instead of walking the page-table directly. Cc: Marc Zyngier <maz@xxxxxxxxxx> Cc: Quentin Perret <qperret@xxxxxxxxxx> Signed-off-by: Will Deacon <will@xxxxxxxxxx> --- arch/arm64/kvm/mmu.c | 58 +++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f6b7220412af..d2ce44183b98 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -39,6 +39,32 @@ static bool is_iomap(unsigned long flags) return flags & KVM_S2PTE_FLAG_IS_IOMAP; } +/* + * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, + * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, + * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too + * long will also starve other vCPUs. We have to also make sure that the page + * tables are not freed while we released the lock. + */ +#define stage2_apply_range_resched(kvm, addr, end, fn) \ +({ \ + int ret; \ + struct kvm *__kvm = (kvm); \ + u64 next, __addr = (addr), __end = (end); \ + do { \ + void *cookie = __kvm->arch.mmu.pgt_cookie; \ + if (!cookie) \ + break; \ + next = stage2_pgd_addr_end(__kvm, __addr, __end); \ + ret = fn(cookie, __addr, next - __addr); \ + if (ret) \ + break; \ + if (next != __end) \ + cond_resched_lock(&__kvm->mmu_lock); \ + } while (__addr = next, __addr != __end); \ + ret; \ +}) + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -256,8 +282,8 @@ static inline void kvm_pgd_populate(pgd_t *pgdp, p4d_t *p4dp) * end up writing old data to disk. * * This is why right after unmapping a page/section and invalidating - * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure - * the IO subsystem will never hit in the cache. + * the corresponding TLBs, we flush to make sure the IO subsystem will + * never hit in the cache. * * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as * we then fully enforce cacheability of RAM, no matter what the guest @@ -375,36 +401,18 @@ static void unmap_stage2_p4ds(struct kvm_s2_mmu *mmu, pgd_t *pgd, * be called while holding mmu_lock (unless for freeing the stage2 pgd before * destroying the VM), otherwise another faulting VCPU may come in and mess * with things behind our backs. + * + * Return: 0 on success, negative error code on failure. */ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) { struct kvm *kvm = mmu->kvm; - pgd_t *pgd; - phys_addr_t addr = start, end = start + size; - phys_addr_t next; + phys_addr_t end = start + size; assert_spin_locked(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - - pgd = mmu->pgd + stage2_pgd_index(kvm, addr); - do { - /* - * Make sure the page table is still active, as another thread - * could have possibly freed the page table, while we released - * the lock. - */ - if (!READ_ONCE(mmu->pgd)) - break; - next = stage2_pgd_addr_end(kvm, addr, end); - if (!stage2_pgd_none(kvm, *pgd)) - unmap_stage2_p4ds(mmu, pgd, addr, next); - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (next != end) - cond_resched_lock(&kvm->mmu_lock); - } while (pgd++, addr = next, addr != end); + WARN_ON(stage2_apply_range_resched(kvm, start, end, + kvm_pgtable_stage2_unmap)); } static void stage2_flush_ptes(struct kvm_s2_mmu *mmu, pmd_t *pmd, -- 2.28.0.rc0.142.g3c755180ce-goog _______________________________________________ kvmarm mailing list kvmarm@xxxxxxxxxxxxxxxxxxxxx https://lists.cs.columbia.edu/mailman/listinfo/kvmarm