On 08/03/2023 01:00, Paolo Bonzini wrote:
> On Tue, Mar 7, 2023 at 6:36 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>> Thinking about this more, I would rather revert commit 1e0c7d40758b ("KVM: SVM:
>> hyper-v: Remote TLB flush for SVM") or fix the thing properly straightaway. KVM
>> doesn't magically handle the flushes correctly for the shadow/legacy MMU, KVM just
>> happens to get lucky and not run afoul of the underlying bugs.
>
> I don't think it's about luck---the legacy MMU's zapping/invalidation
> seems to invoke the flush hypercall correctly:
>
> Jeremi, did you ever track the call stack where
> hyperv_nested_flush_guest_mapping is triggered?
>
> Paolo
>

Yes, here are all the stack traces I get for hyperv_nested_flush_guest_mapping with the legacy MMU:

   1  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      svm_handle_exit
      svm_invoke_exit_handler
      npf_interception
      kvm_mmu_page_fault
      kvm_mmu_do_page_fault
      kvm_tdp_page_fault
      direct_page_fault
      kvm_faultin_pfn
      __gfn_to_pfn_memslot
      hva_to_pfn
      get_user_pages_unlocked
      __gup_longterm_locked
      __get_user_pages
      handle_mm_fault
      __handle_mm_fault
      do_huge_pmd_wp_page
      __split_huge_pmd
      __mmu_notifier_invalidate_range_start
      kvm_mmu_notifier_invalidate_range_start
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

   1  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      syscall_exit_to_user_mode
      exit_to_user_mode_prepare
      arch_do_signal_or_restart
      get_signal
      do_group_exit
      do_exit
      mmput
      __mmput
      exit_mmap
      __mmu_notifier_release
      kvm_mmu_notifier_release
      kvm_arch_flush_shadow_all
      kvm_mmu_zap_all
      kvm_mmu_commit_zap_page.part.0
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

   1  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      syscall_exit_to_user_mode
      exit_to_user_mode_prepare
      arch_do_signal_or_restart
      get_signal
      do_group_exit
      do_exit
      task_work_run
      ____fput
      __fput
      kvm_vm_release
      kvm_put_kvm
      kvm_destroy_vm
      kvm_arch_destroy_vm
      kvm_mmu_unload
      kvm_mmu_free_roots
      kvm_mmu_commit_zap_page.part.0
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

   6  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      svm_handle_exit
      svm_invoke_exit_handler
      npf_interception
      kvm_mmu_page_fault
      kvm_mmu_do_page_fault
      kvm_tdp_page_fault
      direct_page_fault
      kvm_faultin_pfn
      __gfn_to_pfn_memslot
      hva_to_pfn
      get_user_pages_unlocked
      __gup_longterm_locked
      __get_user_pages
      handle_mm_fault
      __handle_mm_fault
      do_wp_page
      __mmu_notifier_invalidate_range_start
      kvm_mmu_notifier_invalidate_range_start
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

   8  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      kvm_apic_accept_events
      kvm_vcpu_reset
      kvm_mmu_reset_context
      kvm_mmu_free_roots
      kvm_mmu_commit_zap_page.part.0
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

  20  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      svm_handle_exit
      svm_invoke_exit_handler
      npf_interception
      kvm_mmu_page_fault
      kvm_mmu_do_page_fault
      kvm_tdp_page_fault
      direct_page_fault
      mmu_set_spte
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping_range

 406  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vm_ioctl
      __kvm_set_memory_region
      kvm_set_memslot
      kvm_arch_flush_shadow_memslot
      kvm_page_track_flush_slot
      kvm_mmu_invalidate_zap_pages_in_memslot
      kvm_mmu_zap_all_fast
      kvm_mmu_commit_zap_page.part.0
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping
 406  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      kvm_mmu_free_obsolete_roots
      __kvm_mmu_free_obsolete_roots
      kvm_mmu_free_roots
      kvm_mmu_commit_zap_page.part.0
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

and here are the stacks for the TDP MMU:

   1  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      svm_handle_exit
      svm_invoke_exit_handler
      npf_interception
      kvm_mmu_page_fault
      kvm_mmu_do_page_fault
      kvm_tdp_page_fault
      kvm_faultin_pfn
      __gfn_to_pfn_memslot
      hva_to_pfn
      get_user_pages_unlocked
      __gup_longterm_locked
      __get_user_pages
      handle_mm_fault
      __handle_mm_fault
      do_huge_pmd_wp_page
      __split_huge_pmd
      __mmu_notifier_invalidate_range_start
      kvm_mmu_notifier_invalidate_range_start
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping

   6  entry_SYSCALL_64_after_hwframe
      do_syscall_64
      __x64_sys_ioctl
      kvm_vcpu_ioctl
      kvm_arch_vcpu_ioctl_run
      svm_handle_exit
      svm_invoke_exit_handler
      npf_interception
      kvm_mmu_page_fault
      kvm_mmu_do_page_fault
      kvm_tdp_page_fault
      kvm_faultin_pfn
      __gfn_to_pfn_memslot
      hva_to_pfn
      get_user_pages_unlocked
      __gup_longterm_locked
      __get_user_pages
      handle_mm_fault
      __handle_mm_fault
      do_wp_page
      __mmu_notifier_invalidate_range_start
      kvm_mmu_notifier_invalidate_range_start
      kvm_flush_remote_tlbs
      hv_remote_flush_tlb
      hv_remote_flush_tlb_with_range
      hyperv_flush_guest_mapping
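(For reference: the numbers are hit counts per unique kernel stack. The exact capture command isn't quoted in this thread, so the following is only a sketch of how this kind of count-per-stack output can be gathered; the probe points are the flush helpers that terminate the traces above, everything else is an assumption.)

  # Sketch (assumed tooling): count every unique kernel stack that reaches the
  # Hyper-V flush helpers; bpftrace prints each stack with its count on exit.
  bpftrace -e '
  kprobe:hyperv_flush_guest_mapping,
  kprobe:hyperv_flush_guest_mapping_range
  {
          @[kstack] = count();
  }'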
>> The revert appears
>> to be reasonably straightforward (see bottom).
>>
>> And _if_ we want to hack-a-fix it, then I would strongly prefer a very isolated,
>> obviously hacky fix, e.g.
>>
>> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
>> index 36e4561554ca..a9ba4ae14fda 100644
>> --- a/arch/x86/kvm/mmu/mmu.c
>> +++ b/arch/x86/kvm/mmu/mmu.c
>> @@ -5779,8 +5779,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
>>          tdp_root_level = tdp_forced_root_level;
>>          max_tdp_level = tdp_max_root_level;
>>
>> +        /*
>> +         * FIXME: Remove the enlightened TLB restriction when KVM properly
>> +         * handles TLB flushes for said enlightenment.
>> +         */
>>  #ifdef CONFIG_X86_64
>> -        tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
>> +        tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled &&
>> +                          !(ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB);
>>  #endif
>>          /*
>>           * max_huge_page_level reflects KVM's MMU capabilities irrespective
>>
>>
>>
>>
>> The revert...
>>
>> ---
>>  arch/x86/kvm/svm/svm.c          |  3 ---
>>  arch/x86/kvm/svm/svm_onhyperv.h | 27 ---------------------------
>>  2 files changed, 30 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 11068e8eb969..292650dc85a0 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -1320,7 +1320,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>>          if (sev_guest(vcpu->kvm))
>>                  sev_init_vmcb(svm);
>>
>> -        svm_hv_init_vmcb(vmcb);
>>          init_vmcb_after_set_cpuid(vcpu);
>>
>>          vmcb_mark_all_dirty(vmcb);
>> @@ -4075,8 +4074,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
>>                  svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
>>                  vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
>>
>> -                hv_track_root_tdp(vcpu, root_hpa);
>> -
>>                  cr3 = vcpu->arch.cr3;
>>          } else if (root_level >= PT64_ROOT_4LEVEL) {
>>                  cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
>> diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
>> index 6981c1e9a809..5118fd273e73 100644
>> --- a/arch/x86/kvm/svm/svm_onhyperv.h
>> +++ b/arch/x86/kvm/svm/svm_onhyperv.h
>> @@ -15,31 +15,8 @@ static struct kvm_x86_ops svm_x86_ops;
>>
>>  int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
>>
>> -static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
>> -{
>> -        struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
>> -
>> -        BUILD_BUG_ON(sizeof(vmcb->control.hv_enlightenments) !=
>> -                     sizeof(vmcb->control.reserved_sw));
>> -
>> -        if (npt_enabled &&
>> -            ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
>> -                hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
>> -
>> -        if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
>> -                hve->hv_enlightenments_control.msr_bitmap = 1;
>> -}
>> -
>>  static inline void svm_hv_hardware_setup(void)
>>  {
>> -        if (npt_enabled &&
>> -            ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
>> -                pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
>> -                svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
>> -                svm_x86_ops.tlb_remote_flush_with_range =
>> -                        hv_remote_flush_tlb_with_range;
>> -        }
>> -
>>          if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
>>                  int cpu;
>>
>> @@ -80,10 +57,6 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
>>  }
>>  #else
>>
>> -static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
>> -{
>> -}
>> -
>>  static inline void svm_hv_hardware_setup(void)
>>  {
>>  }
>>
>> base-commit: cb8748a781fe983e451f616ce4861a1c49ce79dd
>> --
>>