L1 guest memory as a whole cannot be considered non-sensitive when an L2 is running. Even if L1 is using its own mitigations, L2 VM Exits could, in theory, bring into the cache some sensitive L1 memory without L1 getting a chance to flush it. For simplicity, we just unmap the entire L1 memory from the ASI restricted address space when nested virtualization is turned on, though this is overridden if the treat_all_userspace_as_nonsensitive flag is enabled. In the future, we could potentially map some portions of L1 memory which are known to contain non-sensitive memory, which would reduce ASI overhead during nested virtualization. Note that unmapping the guest memory still leaves a slight hole because L2 could also potentially access copies of L1 VCPU registers stored in L0 kernel structures. In the future, this could be mitigated by having a separate ASI address space for each VCPU and treating the associated structures as locally non-sensitive only within that VCPU's ASI address space. Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/mmu/mmu.c | 10 ++++++++++ arch/x86/kvm/vmx/nested.c | 22 ++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e63a2f244d7b..8ba88bbcf895 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1200,6 +1200,12 @@ struct kvm_arch { */ struct list_head tdp_mmu_pages; + /* + * Number of VCPUs that have enabled nested virtualization. + * Currently only maintained when ASI is enabled. 
+ */ + int nested_virt_enabled_count; + /* * Protects accesses to the following fields when the MMU lock * is held in read mode: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 485c0ba3ce8b..5785a0d02558 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -94,6 +94,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); #ifdef CONFIG_ADDRESS_SPACE_ISOLATION bool __ro_after_init treat_all_userspace_as_nonsensitive; module_param(treat_all_userspace_as_nonsensitive, bool, 0444); +EXPORT_SYMBOL_GPL(treat_all_userspace_as_nonsensitive); #endif /* @@ -2769,6 +2770,15 @@ static void asi_map_gfn_range(struct kvm_vcpu *vcpu, int err; size_t hva = __gfn_to_hva_memslot(slot, gfn); + /* + * For now, we just don't map any guest memory when using nested + * virtualization. In the future, we could potentially map some + * portions of guest memory which are known to contain only memory + * which would be considered non-sensitive. + */ + if (vcpu->kvm->arch.nested_virt_enabled_count) + return; + err = asi_map_user(vcpu->kvm->asi, (void *)hva, PAGE_SIZE * npages, &vcpu->arch.asi_pgtbl_pool, slot->userspace_addr, slot->userspace_addr + slot->npages * PAGE_SIZE); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9c941535f78c..0a0092e4102d 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -318,6 +318,14 @@ static void free_nested(struct kvm_vcpu *vcpu) nested_release_evmcs(vcpu); free_loaded_vmcs(&vmx->nested.vmcs02); + + if (cpu_feature_enabled(X86_FEATURE_ASI) && + !treat_all_userspace_as_nonsensitive) { + write_lock(&vcpu->kvm->mmu_lock); + WARN_ON(vcpu->kvm->arch.nested_virt_enabled_count <= 0); + vcpu->kvm->arch.nested_virt_enabled_count--; + write_unlock(&vcpu->kvm->mmu_lock); + } } /* @@ -4876,6 +4884,20 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } + if (cpu_feature_enabled(X86_FEATURE_ASI) && + 
!treat_all_userspace_as_nonsensitive) { + /* + * We do the increment under the MMU lock in order to prevent + * it from happening concurrently with asi_map_gfn_range(). + */ + write_lock(&vcpu->kvm->mmu_lock); + WARN_ON(vcpu->kvm->arch.nested_virt_enabled_count < 0); + vcpu->kvm->arch.nested_virt_enabled_count++; + write_unlock(&vcpu->kvm->mmu_lock); + + asi_unmap_user(vcpu->kvm->asi, 0, TASK_SIZE_MAX); + } + return 0; out_shadow_vmcs: -- 2.35.1.473.g83b2b277ed-goog