When a page fault from the secondary page table while the guest is running happens in a memslot with KVM_MEM_PRIVATE, we need go different paths for private access and shared access. - For private access, KVM checks if the page is already allocated in the memory backend, if yes KVM establishes the mapping, otherwise exits to userspace to convert a shared page to private one. - For shared access, KVM also checks if the page is already allocated in the memory backend, if yes then exit to userspace to convert a private page to shared one, otherwise it's treated as a traditional hva-based shared memory, KVM lets existing code to obtain a pfn with get_user_pages() and establish the mapping. The above code assume private memory is persistent and pre-allocated in the memory backend so KVM can use this information as an indicator for a page is private or shared. The above check is then performed by calling kvm_memfd_get_pfn() which currently is implemented as a pagecache search but in theory that can be implemented differently (i.e. when the page is even not mapped into host pagecache there should be some different implementation). Signed-off-by: Yu Zhang <yu.c.zhang@xxxxxxxxxxxxxxx> Signed-off-by: Chao Peng <chao.p.peng@xxxxxxxxxxxxxxx> --- arch/x86/kvm/mmu/mmu.c | 64 +++++++++++++++++++++++++++++++--- arch/x86/kvm/mmu/paging_tmpl.h | 11 ++++-- 2 files changed, 68 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a7006e1ac2d2..7fc29f85313d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3156,6 +3156,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; + if (kvm_slot_is_private(slot)) + return max_level; + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); return min(host_level, max_level); } @@ -4359,7 +4362,50 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, + bool *is_private_pfn, int *r) +{ + int order; + int mem_convert_type; + struct kvm_memory_slot *slot = fault->slot; + long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order); + + if (kvm_vcpu_is_private_gfn(vcpu, fault->addr >> PAGE_SHIFT)) { + if (pfn < 0) + mem_convert_type = KVM_EXIT_MEM_MAP_PRIVATE; + else { + fault->pfn = pfn; + if (slot->flags & KVM_MEM_READONLY) + fault->map_writable = false; + else + fault->map_writable = true; + + if (order == 0) + fault->max_level = PG_LEVEL_4K; + *is_private_pfn = true; + *r = RET_PF_FIXED; + return true; + } + } else { + if (pfn < 0) + return false; + + kvm_memfd_put_pfn(pfn); + mem_convert_type = KVM_EXIT_MEM_MAP_SHARED; + } + + vcpu->run->exit_reason = KVM_EXIT_MEMORY_ERROR; + vcpu->run->mem.type = mem_convert_type; + vcpu->run->mem.u.map.gpa = fault->gfn << PAGE_SHIFT; + vcpu->run->mem.u.map.size = PAGE_SIZE; + fault->pfn = -1; + *r = -1; + return true; +} + +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, + bool *is_private_pfn, int *r) { struct kvm_memory_slot *slot = fault->slot; bool async; @@ -4400,6 +4446,10 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, } } + if (kvm_slot_is_private(slot) && + kvm_faultin_pfn_private(vcpu, fault, is_private_pfn, r)) + return *r == RET_PF_FIXED ? false : true; + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, fault->write, &fault->map_writable, @@ -4446,6 +4496,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); unsigned long mmu_seq; + bool is_private_pfn = false; int r; fault->gfn = kvm_gfn_unalias(vcpu->kvm, fault->addr); @@ -4465,7 +4516,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, fault, &r)) + if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r)) return r; if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) @@ -4504,7 +4555,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault else write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; r = make_mmu_pages_available(vcpu); @@ -4522,7 +4573,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + + if (is_private_pfn) + kvm_memfd_put_pfn(fault->pfn); + else + kvm_release_pfn_clean(fault->pfn); + return r; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 6d343a399913..ebd5c923f844 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -842,6 +842,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault int r; unsigned long mmu_seq; bool is_self_change_mapping; + bool is_private_pfn = false; + pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code); WARN_ON_ONCE(fault->is_tdp); @@ -890,7 +892,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (kvm_faultin_pfn(vcpu, fault, &r)) + if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r)) return r; if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r)) @@ -918,7 +920,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); - if (is_page_fault_stale(vcpu, fault, mmu_seq)) + if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); @@ -930,7 +932,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + if (is_private_pfn) + kvm_memfd_put_pfn(fault->pfn); + else + kvm_release_pfn_clean(fault->pfn); return r; } -- 2.17.1