On Fri, 06 Oct 2023 14:33:04 +0100,
Marc Zyngier <maz@xxxxxxxxxx> wrote:
> 
> Still talking to myself! :D
> 
> [...]
> 
> > However, this isn't enough.
> > 
> > [ 63.450113] Oh crap 400000435c001 vs 3000004430001
> 
> So there are situations where we end-up with the wrong VTTBR, rather
> than the wrong VMID, which is even worse. Haven't worked out the
> scenario yet, but it apparently involves being preempted by a vcpu
> from a different VM and not doing the right thing.

Actually, no. It is the MMU notifiers kicking in and performing TLB
invalidation for a guest while we're in the context of another. The
joy of running 4 large VMs on a box with 2GB of RAM, basically
running from swap.

The trace looks like this:

[ 66.147484] Call trace:
[ 66.149899]  dump_backtrace+0xa0/0x128
[ 66.153607]  show_stack+0x20/0x38
[ 66.156884]  dump_stack_lvl+0x78/0xc8
[ 66.160507]  dump_stack+0x18/0x28
[ 66.163784]  __tlb_switch_to_guest+0x50/0x148
[ 66.168097]  __kvm_tlb_flush_vmid_ipa+0x3c/0xc8
[ 66.172582]  stage2_unmap_put_pte+0xd0/0xe8
[ 66.176722]  stage2_unmap_walker+0x160/0x1c0
[ 66.180948]  __kvm_pgtable_visit+0x170/0x1f8
[ 66.185174]  __kvm_pgtable_walk+0x94/0xc8
[ 66.189142]  __kvm_pgtable_visit+0xd8/0x1f8
[ 66.193282]  __kvm_pgtable_walk+0x94/0xc8
[ 66.197249]  __kvm_pgtable_visit+0xd8/0x1f8
[ 66.201389]  __kvm_pgtable_walk+0x94/0xc8
[ 66.205357]  kvm_pgtable_walk+0xd4/0x170
[ 66.209238]  kvm_pgtable_stage2_unmap+0x54/0xd0
[ 66.213723]  stage2_apply_range+0x9c/0x108
[ 66.217777]  __unmap_stage2_range+0x34/0x70
[ 66.221917]  kvm_unmap_gfn_range+0x38/0x58
[ 66.225970]  kvm_mmu_notifier_invalidate_range_start+0xe8/0x310
[ 66.231835]  mn_hlist_invalidate_range_start+0x80/0x158
[ 66.237010]  __mmu_notifier_invalidate_range_start+0x40/0x78
[ 66.242617]  try_to_migrate_one+0x8b0/0xa10
[ 66.246757]  rmap_walk_anon+0xec/0x268
[ 66.250465]  try_to_migrate+0xc8/0x120
[ 66.254174]  migrate_folio_unmap+0x180/0x438
[ 66.258401]  migrate_pages_batch+0x14c/0x798
[ 66.262627]  migrate_pages_sync+0x8c/0x258
[ 66.266680]  migrate_pages+0x4f0/0x690
[ 66.270389]  compact_zone+0x1d8/0x6b8
[ 66.274012]  compact_zone_order+0xa0/0xf0
[ 66.277979]  try_to_compact_pages+0xfc/0x378
[ 66.282205]  __alloc_pages_direct_compact+0x80/0x398
[ 66.287122]  __alloc_pages_slowpath.constprop.0+0x328/0x868
[ 66.292642]  __alloc_pages+0x2cc/0x358
[ 66.296350]  __folio_alloc+0x24/0x68
[ 66.299887]  vma_alloc_folio+0x2ac/0x340
[ 66.303768]  do_huge_pmd_anonymous_page+0xb0/0x3b8
[ 66.308512]  __handle_mm_fault+0x31c/0x358
[ 66.312566]  handle_mm_fault+0x64/0x270
[ 66.316360]  faultin_page+0x74/0x130
[ 66.319897]  __get_user_pages+0xc8/0x340
[ 66.323778]  get_user_pages_unlocked+0xc8/0x3b8
[ 66.328263]  hva_to_pfn+0xfc/0x338
[ 66.331627]  __gfn_to_pfn_memslot+0xa8/0x100
[ 66.335853]  user_mem_abort+0x17c/0x7c0
[ 66.339648]  kvm_handle_guest_abort+0x2f4/0x3d8
[ 66.344133]  handle_trap_exceptions+0x44/0xb8
[ 66.348445]  handle_exit+0x4c/0x118
[ 66.351895]  kvm_arch_vcpu_ioctl_run+0x24c/0x5e0
[ 66.356467]  kvm_vcpu_ioctl+0x28c/0x9e0
[ 66.360262]  __arm64_sys_ioctl+0xc0/0xe8
[ 66.364143]  invoke_syscall+0x50/0x128
[ 66.367852]  el0_svc_common.constprop.0+0x48/0xf0
[ 66.372509]  do_el0_svc+0x24/0x38
[ 66.375787]  el0_svc+0x3c/0xe0
[ 66.378805]  el0t_64_sync_handler+0xc0/0xc8
[ 66.382945]  el0t_64_sync+0x1a4/0x1a8

Which says it all: we're reclaiming pages from a guest while faulting
on another, resulting in the wrong MMU setup being left behind. Damn!

Here's the sum of my hacks, which keeps the box alive.

Thanks,

	M.
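To spell out the sequence the trace implies, here is a tiny standalone
model of the failure (plain C, nothing to do with the actual KVM code:
struct mmu, load_stage2() and tlb_flush_for() are made-up stand-ins,
and the two VTTBR values are simply the ones from the splat above):

/*
 * Standalone model of the failure mode, not KVM code: struct mmu,
 * load_stage2() and tlb_flush_for() are invented for illustration.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mmu { uint64_t vttbr; };

static uint64_t cpu_vttbr;	/* what the CPU currently translates with */

static void load_stage2(struct mmu *mmu)
{
	cpu_vttbr = mmu->vttbr;	/* stand-in for __load_stage2() */
}

/* A TLBI for @mmu needs its VTTBR live, so switch to it first... */
static void tlb_flush_for(struct mmu *mmu, struct mmu *running, int fixed)
{
	load_stage2(mmu);
	/* ...TLB invalidation for @mmu happens here... */
	if (fixed)
		load_stage2(running);	/* what the restore hack adds */
}

int main(void)
{
	struct mmu guest_a = { .vttbr = 0x3000004430001ULL };
	struct mmu guest_b = { .vttbr = 0x400000435c001ULL };

	/* A vcpu of guest A is running, so its stage-2 is loaded */
	load_stage2(&guest_a);

	/*
	 * Guest A faults, reclaim kicks in, and the MMU notifier unmaps
	 * pages of guest B, flushing B's TLBs from within A's context.
	 */
	tlb_flush_for(&guest_b, &guest_a, 0);

	/* Back in A's fault path, B's VTTBR is still loaded. Oh crap. */
	printf("running A with VTTBR %llx (expected %llx)\n",
	       (unsigned long long)cpu_vttbr,
	       (unsigned long long)guest_a.vttbr);

	/* With the restore on the way out, A's context survives */
	load_stage2(&guest_a);
	tlb_flush_for(&guest_b, &guest_a, 1);
	assert(cpu_vttbr == guest_a.vttbr);

	return 0;
}

The hack below is roughly the "fixed" variant of that model:
__tlb_switch_to_guest() stashes the running vcpu's hw_mmu when it
differs from the MMU being invalidated, __tlb_switch_to_host() reloads
it, and the kvm_arm_vmid_update() change handles the VMID roll-over
case on VHE.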
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e2d38c7d6555..759adee42018 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1025,7 +1025,7 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
 extern unsigned int __ro_after_init kvm_arm_vmid_bits;
 int __init kvm_arm_vmid_alloc_init(void);
 void __init kvm_arm_vmid_alloc_free(void);
-void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
+bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
 void kvm_arm_vmid_clear_active(void);
 
 static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 39c969c05990..584be562b1d4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -950,7 +950,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		 * making a thread's VMID inactive. So we need to call
 		 * kvm_arm_vmid_update() in non-premptible context.
 		 */
-		kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid);
+		if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
+		    has_vhe())
+			__load_stage2(vcpu->arch.hw_mmu,
+				      vcpu->arch.hw_mmu->arch);
 
 		kvm_pmu_flush_hwstate(vcpu);
 
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index f3f2e142e4f4..bc510c7e53f1 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -12,6 +12,7 @@
 
 struct tlb_inv_context {
 	unsigned long	flags;
+	struct kvm_s2_mmu *mmu;
 	u64		tcr;
 	u64		sctlr;
 };
@@ -19,10 +20,16 @@ struct tlb_inv_context {
 static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 				  struct tlb_inv_context *cxt)
 {
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 	u64 val;
 
 	local_irq_save(cxt->flags);
 
+	if (vcpu && mmu != vcpu->arch.hw_mmu)
+		cxt->mmu = vcpu->arch.hw_mmu;
+	else
+		cxt->mmu = NULL;
+
 	if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
 		/*
 		 * For CPUs that are affected by ARM errata 1165522 or 1530923,
@@ -64,8 +71,10 @@ static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
 	/*
 	 * We're done with the TLB operation, let's restore the host's
-	 * view of HCR_EL2.
+	 * view of HCR_EL2 and current S2 MMU context.
 	 */
+	if (cxt->mmu)
+		__load_stage2(cxt->mmu, cxt->mmu->arch);
 	write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
 	isb();
 
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 7fe8ba1a2851..806223b7022a 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -135,10 +135,11 @@ void kvm_arm_vmid_clear_active(void)
 	atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID);
 }
 
-void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
+bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
 {
 	unsigned long flags;
 	u64 vmid, old_active_vmid;
+	bool updated = false;
 
 	vmid = atomic64_read(&kvm_vmid->id);
 
@@ -156,17 +157,21 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
 	if (old_active_vmid != 0 && vmid_gen_match(vmid) &&
 	    0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids),
 					  old_active_vmid, vmid))
-		return;
+		return false;
 
 	raw_spin_lock_irqsave(&cpu_vmid_lock, flags);
 
 	/* Check that our VMID belongs to the current generation. */
 	vmid = atomic64_read(&kvm_vmid->id);
-	if (!vmid_gen_match(vmid))
+	if (!vmid_gen_match(vmid)) {
 		vmid = new_vmid(kvm_vmid);
+		updated = true;
+	}
 
 	atomic64_set(this_cpu_ptr(&active_vmids), vmid);
 	raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags);
+
+	return updated;
 }
 
 /*

-- 
Without deviation from the norm, progress is not possible.