Re: [PATCH 0/3] KVM: arm64: Load the stage-2 MMU from vcpu_load() for VHE

On Fri, 06 Oct 2023 14:33:04 +0100,
Marc Zyngier <maz@xxxxxxxxxx> wrote:

Still talking to myself! :D

[...]

>
> However, this isn't enough.
> 
> [   63.450113] Oh crap 400000435c001 vs 3000004430001
> 
> So there are situations where we end up with the wrong VTTBR, rather
> than the wrong VMID, which is even worse. Haven't worked out the
> scenario yet, but it apparently involves being preempted by a vcpu
> from a different VM and not doing the right thing.
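
As a sanity check, those two values can be picked apart with the usual
VTTBR_EL2 layout (VMID in bits [63:48] with 16-bit VMIDs, table base in
bits [47:1], CnP in bit 0 -- which is how kvm_get_vttbr() builds the
register). A rough decode, not from the original report:

#include <stdint.h>
#include <stdio.h>

/* Decode a VTTBR_EL2 value, assuming 16-bit VMIDs (VTCR_EL2.VS == 1). */
static void decode_vttbr(uint64_t vttbr)
{
	printf("VMID %#llx, baddr %#llx, CnP %llu\n",
	       (unsigned long long)(vttbr >> 48),		/* VMID[63:48] */
	       (unsigned long long)(vttbr & 0x0000fffffffffffeULL), /* BADDR[47:1] */
	       (unsigned long long)(vttbr & 1));		/* CnP[0] */
}

int main(void)
{
	decode_vttbr(0x400000435c001ULL);  /* VMID 0x4, baddr 0x435c000 */
	decode_vttbr(0x3000004430001ULL);  /* VMID 0x3, baddr 0x4430000 */
	return 0;
}

Both the VMID and the table base differ, so a different VM's stage-2
was left installed outright, not just a stale VMID.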

And the scenario isn't preemption by a vcpu from another VM after all.
It is the MMU notifiers kicking in and performing TLB invalidation for
one guest while we're in the context of another. The joy of running 4
large VMs on a box with only 2GB of RAM, basically running from swap.

The trace looks like this:

[   66.147484] Call trace:
[   66.149899]  dump_backtrace+0xa0/0x128
[   66.153607]  show_stack+0x20/0x38
[   66.156884]  dump_stack_lvl+0x78/0xc8
[   66.160507]  dump_stack+0x18/0x28
[   66.163784]  __tlb_switch_to_guest+0x50/0x148
[   66.168097]  __kvm_tlb_flush_vmid_ipa+0x3c/0xc8
[   66.172582]  stage2_unmap_put_pte+0xd0/0xe8
[   66.176722]  stage2_unmap_walker+0x160/0x1c0
[   66.180948]  __kvm_pgtable_visit+0x170/0x1f8
[   66.185174]  __kvm_pgtable_walk+0x94/0xc8
[   66.189142]  __kvm_pgtable_visit+0xd8/0x1f8
[   66.193282]  __kvm_pgtable_walk+0x94/0xc8
[   66.197249]  __kvm_pgtable_visit+0xd8/0x1f8
[   66.201389]  __kvm_pgtable_walk+0x94/0xc8
[   66.205357]  kvm_pgtable_walk+0xd4/0x170
[   66.209238]  kvm_pgtable_stage2_unmap+0x54/0xd0
[   66.213723]  stage2_apply_range+0x9c/0x108
[   66.217777]  __unmap_stage2_range+0x34/0x70
[   66.221917]  kvm_unmap_gfn_range+0x38/0x58
[   66.225970]  kvm_mmu_notifier_invalidate_range_start+0xe8/0x310
[   66.231835]  mn_hlist_invalidate_range_start+0x80/0x158
[   66.237010]  __mmu_notifier_invalidate_range_start+0x40/0x78
[   66.242617]  try_to_migrate_one+0x8b0/0xa10
[   66.246757]  rmap_walk_anon+0xec/0x268
[   66.250465]  try_to_migrate+0xc8/0x120
[   66.254174]  migrate_folio_unmap+0x180/0x438
[   66.258401]  migrate_pages_batch+0x14c/0x798
[   66.262627]  migrate_pages_sync+0x8c/0x258
[   66.266680]  migrate_pages+0x4f0/0x690
[   66.270389]  compact_zone+0x1d8/0x6b8
[   66.274012]  compact_zone_order+0xa0/0xf0
[   66.277979]  try_to_compact_pages+0xfc/0x378
[   66.282205]  __alloc_pages_direct_compact+0x80/0x398
[   66.287122]  __alloc_pages_slowpath.constprop.0+0x328/0x868
[   66.292642]  __alloc_pages+0x2cc/0x358
[   66.296350]  __folio_alloc+0x24/0x68
[   66.299887]  vma_alloc_folio+0x2ac/0x340
[   66.303768]  do_huge_pmd_anonymous_page+0xb0/0x3b8
[   66.308512]  __handle_mm_fault+0x31c/0x358
[   66.312566]  handle_mm_fault+0x64/0x270
[   66.316360]  faultin_page+0x74/0x130
[   66.319897]  __get_user_pages+0xc8/0x340
[   66.323778]  get_user_pages_unlocked+0xc8/0x3b8
[   66.328263]  hva_to_pfn+0xfc/0x338
[   66.331627]  __gfn_to_pfn_memslot+0xa8/0x100
[   66.335853]  user_mem_abort+0x17c/0x7c0
[   66.339648]  kvm_handle_guest_abort+0x2f4/0x3d8
[   66.344133]  handle_trap_exceptions+0x44/0xb8
[   66.348445]  handle_exit+0x4c/0x118
[   66.351895]  kvm_arch_vcpu_ioctl_run+0x24c/0x5e0
[   66.356467]  kvm_vcpu_ioctl+0x28c/0x9e0
[   66.360262]  __arm64_sys_ioctl+0xc0/0xe8
[   66.364143]  invoke_syscall+0x50/0x128
[   66.367852]  el0_svc_common.constprop.0+0x48/0xf0
[   66.372509]  do_el0_svc+0x24/0x38
[   66.375787]  el0_svc+0x3c/0xe0
[   66.378805]  el0t_64_sync_handler+0xc0/0xc8
[   66.382945]  el0t_64_sync+0x1a4/0x1a8

Which says it all: we're reclaiming pages from one guest while faulting
on another, resulting in the wrong MMU setup being left behind. Damn!
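
Spelled out, the interleaving looks roughly like this (my reading of
the trace above, idealised):

	kvm_arch_vcpu_ioctl_run()        /* VM A; VHE loads A's stage-2 at vcpu_load() */
	  user_mem_abort()               /* stage-2 fault on VM A */
	    __gfn_to_pfn_memslot()
	      /* memory pressure -> compaction -> page migration */
	      mmu notifier fires for VM B's memory
	        __kvm_tlb_flush_vmid_ipa(mmu_B, ...)
	          __tlb_switch_to_guest()  /* installs VM B's VTTBR_EL2 */
	          ... TLBI ...
	          __tlb_switch_to_host()   /* restores the host's HCR_EL2,
	                                      but leaves B's stage-2 live */
	  /* back to guest A, still running on VM B's translation regime */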

Here's the sum of my hacks, which keeps the box alive.

Thanks,

	M.

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index e2d38c7d6555..759adee42018 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1025,7 +1025,7 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
 extern unsigned int __ro_after_init kvm_arm_vmid_bits;
 int __init kvm_arm_vmid_alloc_init(void);
 void __init kvm_arm_vmid_alloc_free(void);
-void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
+bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
 void kvm_arm_vmid_clear_active(void);
 
 static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 39c969c05990..584be562b1d4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -950,7 +950,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		 * making a thread's VMID inactive. So we need to call
 		 * kvm_arm_vmid_update() in non-premptible context.
 		 */
-		kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid);
+		if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) &&
+		    has_vhe())
+			__load_stage2(vcpu->arch.hw_mmu,
+				      vcpu->arch.hw_mmu->arch);
 
 		kvm_pmu_flush_hwstate(vcpu);
 
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index f3f2e142e4f4..bc510c7e53f1 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -12,6 +12,7 @@
 
 struct tlb_inv_context {
 	unsigned long	flags;
+	struct kvm_s2_mmu *mmu;
 	u64		tcr;
 	u64		sctlr;
 };
@@ -19,10 +20,16 @@ struct tlb_inv_context {
 static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu,
 				  struct tlb_inv_context *cxt)
 {
+	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
 	u64 val;
 
 	local_irq_save(cxt->flags);
 
+	if (vcpu && mmu != vcpu->arch.hw_mmu)
+		cxt->mmu = vcpu->arch.hw_mmu;
+	else
+		cxt->mmu = NULL;
+
 	if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
 		/*
 		 * For CPUs that are affected by ARM errata 1165522 or 1530923,
@@ -64,8 +71,10 @@ static void __tlb_switch_to_host(struct tlb_inv_context *cxt)
 {
 	/*
 	 * We're done with the TLB operation, let's restore the host's
-	 * view of HCR_EL2.
+	 * view of HCR_EL2 and current S2 MMU context.
 	 */
+	if (cxt->mmu)
+		__load_stage2(cxt->mmu, cxt->mmu->arch);
 	write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
 	isb();
 
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index 7fe8ba1a2851..806223b7022a 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -135,10 +135,11 @@ void kvm_arm_vmid_clear_active(void)
 	atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID);
 }
 
-void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
+bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
 {
 	unsigned long flags;
 	u64 vmid, old_active_vmid;
+	bool updated = false;
 
 	vmid = atomic64_read(&kvm_vmid->id);
 
@@ -156,17 +157,21 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
 	if (old_active_vmid != 0 && vmid_gen_match(vmid) &&
 	    0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids),
 					  old_active_vmid, vmid))
-		return;
+		return false;
 
 	raw_spin_lock_irqsave(&cpu_vmid_lock, flags);
 
 	/* Check that our VMID belongs to the current generation. */
 	vmid = atomic64_read(&kvm_vmid->id);
-	if (!vmid_gen_match(vmid))
+	if (!vmid_gen_match(vmid)) {
 		vmid = new_vmid(kvm_vmid);
+		updated = true;
+	}
 
 	atomic64_set(this_cpu_ptr(&active_vmids), vmid);
 	raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags);
+
+	return updated;
 }
 
 /*
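
For reference, and only as a sketch of what the reload amounts to
(modelled on kvm_get_vttbr() and __load_stage2(), not the literal
kernel code): re-installing a stage-2 context boils down to
reprogramming VTCR_EL2 and VTTBR_EL2, so doing it again after a VMID
rollover or a cross-VM TLB flush is cheap:

/*
 * Sketch only; names follow the kernel, assuming 16-bit VMIDs with the
 * allocator keeping the generation count in the upper bits of vmid.id.
 */
static inline void reload_stage2(struct kvm_s2_mmu *mmu,
				 struct kvm_arch *arch)
{
	u64 vmid  = atomic64_read(&mmu->vmid.id) & 0xffff; /* strip generation */
	u64 vttbr = kvm_phys_to_vttbr(mmu->pgd_phys) |
		    (vmid << VTTBR_VMID_SHIFT);

	write_sysreg(arch->vtcr, vtcr_el2);	/* stage-2 translation control */
	write_sysreg(vttbr, vttbr_el2);		/* table base + VMID */
	isb();					/* synchronise before guest access */
}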

-- 
Without deviation from the norm, progress is not possible.


