From: Yulei Zhang <yuleixzhang@xxxxxxxxxxx>

The page table population function pins the memory and pre-constructs the
EPT based on the input memory slot configuration, so that KVM does not
rely on page faults to set up the page table.

Signed-off-by: Yulei Zhang <yuleixzhang@xxxxxxxxxxx>
---
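[ Note: kvm_direct_tdp_populate_page_table() has no caller in this patch;
  a later patch in the series is expected to invoke it from the memslot
  update path. The snippet below is an illustrative sketch only, not part
  of the diff: the helper name direct_build_commit_memslot() is made up
  and error handling is elided. ]

static void direct_build_commit_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *new,
					enum kvm_mr_change change)
{
	/* Pre-build the EPT for a new slot instead of faulting it in. */
	if (change == KVM_MR_CREATE)
		kvm_direct_tdp_populate_page_table(kvm,
					(struct kvm_memory_slot *)new);
}
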
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/mmu/mmu.c          | 212 +++++++++++++++++++++++++++++++-
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/vmx.c          |  17 ++-
 include/linux/kvm_host.h        |   4 +-
 virt/kvm/kvm_main.c             |  30 ++++-
 6 files changed, 250 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2407b872f493..69c946831ca7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1152,7 +1152,7 @@ struct kvm_x86_ops {
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
 	int (*get_tdp_level)(struct kvm_vcpu *vcpu);
-	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
+	u64 (*get_mt_mask)(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 
 	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 70cf2c1a1423..1609012be67d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -235,6 +235,11 @@ struct kvm_shadow_walk_iterator {
 	     ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
 	     __shadow_walk_next(&(_walker), spte))
 
+#define for_each_direct_build_shadow_entry(_walker, shadow_addr, _addr, level) \
+	for (__shadow_walk_init(&(_walker), shadow_addr, _addr, level);	\
+	     shadow_walk_okay(&(_walker));					\
+	     shadow_walk_next(&(_walker)))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
@@ -2564,13 +2569,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static void __shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+			       hpa_t shadow_addr, u64 addr, int level)
+{
+	iterator->addr = addr;
+	iterator->shadow_addr = shadow_addr;
+	iterator->level = level;
+	iterator->sptep = NULL;
+}
+
 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
 					struct kvm_vcpu *vcpu, hpa_t root,
 					u64 addr)
 {
-	iterator->addr = addr;
-	iterator->shadow_addr = root;
-	iterator->level = vcpu->arch.mmu->shadow_root_level;
+	__shadow_walk_init(iterator, root, addr, vcpu->arch.mmu->shadow_root_level);
 
 	if (iterator->level == PT64_ROOT_4LEVEL &&
 	    vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
@@ -3037,7 +3049,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
-		spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn,
+		spte |= kvm_x86_ops.get_mt_mask(vcpu->kvm, vcpu, gfn,
 			kvm_is_mmio_pfn(pfn));
 
 	if (host_writable)
@@ -6250,6 +6262,198 @@ int kvm_mmu_module_init(void)
 	return ret;
 }
 
+static int direct_build_tdp_set_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
+				     u64 *sptep, unsigned pte_access, int level,
+				     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+				     bool dirty, bool host_writable)
+{
+	u64 spte = 0;
+	int ret = 0;
+	/*
+	 * For the EPT case, shadow_present_mask is 0 if hardware
+	 * supports exec-only page table entries.  In that case,
+	 * ACC_USER_MASK and shadow_user_mask are used to represent
+	 * read access.  See FNAME(gpte_access) in paging_tmpl.h.
+	 */
+	spte |= shadow_present_mask;
+	if (!speculative)
+		spte |= shadow_accessed_mask;
+
+	if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
+	    is_nx_huge_page_enabled()) {
+		pte_access &= ~ACC_EXEC_MASK;
+	}
+
+	if (pte_access & ACC_EXEC_MASK)
+		spte |= shadow_x_mask;
+	else
+		spte |= shadow_nx_mask;
+
+	if (pte_access & ACC_USER_MASK)
+		spte |= shadow_user_mask;
+
+	if (level > PT_PAGE_TABLE_LEVEL)
+		spte |= PT_PAGE_SIZE_MASK;
+
+	if (tdp_enabled)
+		spte |= kvm_x86_ops.get_mt_mask(kvm, NULL, gfn, kvm_is_mmio_pfn(pfn));
+
+	if (host_writable)
+		spte |= SPTE_HOST_WRITEABLE;
+	else
+		pte_access &= ~ACC_WRITE_MASK;
+
+	spte |= (u64)pfn << PAGE_SHIFT;
+
+	if (pte_access & ACC_WRITE_MASK) {
+
+		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
+
+		if (dirty) {
+			mark_page_dirty_in_slot(slot, gfn);
+			spte |= shadow_dirty_mask;
+		}
+	}
+
+	if (mmu_spte_update(sptep, spte))
+		kvm_flush_remote_tlbs(kvm);
+
+	return ret;
+}
+
+static void __kvm_walk_global_page(struct kvm *kvm, u64 addr, int level)
+{
+	int i;
+	kvm_pfn_t pfn;
+	u64 *sptep = (u64 *)__va(addr);
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (is_shadow_present_pte(sptep[i])) {
+			if (!is_last_spte(sptep[i], level)) {
+				__kvm_walk_global_page(kvm,
+					sptep[i] & PT64_BASE_ADDR_MASK, level - 1);
+			} else {
+				pfn = spte_to_pfn(sptep[i]);
+				mmu_spte_clear_track_bits(&sptep[i]);
+				kvm_release_pfn_clean(pfn);
+			}
+		}
+	}
+	put_page(pfn_to_page(addr >> PAGE_SHIFT));
+}
+
+static int direct_build_tdp_map(struct kvm *kvm, struct kvm_memory_slot *slot,
+				gfn_t gfn, kvm_pfn_t pfn, int level)
+{
+	int ret = 0;
+
+	struct kvm_shadow_walk_iterator iterator;
+	kvm_pfn_t old_pfn;
+	u64 spte;
+
+	for_each_direct_build_shadow_entry(iterator, kvm->arch.global_root_hpa,
+			gfn << PAGE_SHIFT, kvm_x86_ops.get_tdp_level(NULL)) {
+		if (iterator.level == level) {
+			break;
+		}
+
+		if (!is_shadow_present_pte(*iterator.sptep)) {
+			struct page *page;
+			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+			if (!page)
+				return 0;
+
+			spte = page_to_phys(page) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+			       shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+			mmu_spte_set(iterator.sptep, spte);
+		}
+	}
+
+	/* If the pte is already present, release the original pfn. */
+	if (is_shadow_present_pte(*iterator.sptep)) {
+		if (level > PT_PAGE_TABLE_LEVEL)
+			__kvm_walk_global_page(kvm,
+				(*iterator.sptep) & PT64_BASE_ADDR_MASK, level - 1);
+		else {
+			old_pfn = spte_to_pfn(*iterator.sptep);
+			mmu_spte_clear_track_bits(iterator.sptep);
+			kvm_release_pfn_clean(old_pfn);
+		}
+	}
+
+	direct_build_tdp_set_spte(kvm, slot, iterator.sptep, ACC_ALL, level,
+				  gfn, pfn, false, true, true);
+
+	return ret;
+}
+
+static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+{
+	unsigned long page_size;
+	int i, ret = 0;
+
+	page_size = kvm_host_page_size(kvm, NULL, gfn);
+
+	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+		if (page_size >= KVM_HPAGE_SIZE(i))
+			ret = i;
+		else
+			break;
+	}
+
+	return ret;
+}
+
+int direct_build_mapping_level(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	int host_level, max_level, level;
+	struct kvm_lpage_info *linfo;
+
+	host_level = host_mapping_level(kvm, gfn);
+	if (host_level != PT_PAGE_TABLE_LEVEL) {
+		max_level = min(max_page_level, host_level);
+		for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) {
+			linfo = lpage_info_slot(gfn, slot, level);
+			if (linfo->disallow_lpage)
+				break;
+		}
+		host_level = level - 1;
+	}
+	return host_level;
+}
+
+int kvm_direct_tdp_populate_page_table(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+	gfn_t gfn;
+	kvm_pfn_t pfn;
+	int host_level;
+
+	if (!kvm->arch.global_root_hpa) {
+		struct page *page;
+
+		WARN_ON(!tdp_enabled);
+		WARN_ON(kvm_x86_ops.get_tdp_level(NULL) != PT64_ROOT_4LEVEL);
+
+		/* init global root hpa */
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!page)
+			return -ENOMEM;
+
+		kvm->arch.global_root_hpa = page_to_phys(page);
+	}
+
+	/* setup page table for the slot */
+	for (gfn = slot->base_gfn;
+	     gfn < slot->base_gfn + slot->npages;
+	     gfn += KVM_PAGES_PER_HPAGE(host_level)) {
+		pfn = gfn_to_pfn_try_write(slot, gfn);
+		if ((pfn & KVM_PFN_ERR_FAULT) || is_noslot_pfn(pfn))
+			return -ENOMEM;
+
+		host_level = direct_build_mapping_level(kvm, slot, gfn);
+
+		if (host_level > PT_PAGE_TABLE_LEVEL)
+			MMU_WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(host_level) - 1));
+		direct_build_tdp_map(kvm, slot, gfn, pfn, host_level);
+	}
+
+	return 0;
+}
+
 /*
  * Calculate mmu pages needed for kvm.
  */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index a862c768fd54..40819ed00bf2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3488,7 +3488,7 @@ static bool svm_has_emulated_msr(int index)
 	return true;
 }
 
-static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u64 svm_get_mt_mask(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8fafcb2cd103..4b8728c713ff 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2972,10 +2972,12 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static int get_ept_level(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
-		return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
-	if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
-		return 5;
+	if (vcpu) {
+		if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+			return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu));
+		if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+			return 5;
+	}
 	return 4;
 }
 
@@ -6861,7 +6863,7 @@ static int __init vmx_check_processor_compat(void)
 	return 0;
 }
 
-static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
+static u64 vmx_get_mt_mask(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	u8 cache;
 	u64 ipat = 0;
@@ -6889,12 +6891,15 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 		goto exit;
 	}
 
-	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+	if (!kvm_arch_has_noncoherent_dma(kvm)) {
 		ipat = VMX_EPT_IPAT_BIT;
 		cache = MTRR_TYPE_WRBACK;
 		goto exit;
 	}
 
+	if (!vcpu)
+		vcpu = kvm->vcpus[0];
+
 	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
 		ipat = VMX_EPT_IPAT_BIT;
 		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 92efa39ea3d7..d1f75ad5038b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -693,6 +693,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *old,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change);
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 /* flush all memory translations */
 void kvm_arch_flush_shadow_all(struct kvm *kvm);
 /* flush memory translations pointing to 'slot' */
@@ -720,6 +721,7 @@ kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn);
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
 			       bool *writable);
+kvm_pfn_t gfn_to_pfn_try_write(struct kvm_memory_slot *slot, gfn_t gfn);
 
 void kvm_release_pfn_clean(kvm_pfn_t pfn);
 void kvm_release_pfn_dirty(kvm_pfn_t pfn);
@@ -770,7 +772,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
-unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
+unsigned long kvm_host_page_size(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn);
 void mark_page_dirty(struct kvm *kvm, gfn_t gfn);
 
 struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 77aa91fb08d2..46217b1c8353 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -144,7 +144,7 @@ static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
 
 __visible bool kvm_rebooting;
 EXPORT_SYMBOL_GPL(kvm_rebooting);
@@ -1629,14 +1629,17 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm *kvm, struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr, size;
 
 	size = PAGE_SIZE;
 
-	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
+	if (vcpu)
+		addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
+	else
+		addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
 		return PAGE_SIZE;
 
@@ -1931,6 +1934,25 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	return pfn;
 }
 
+/* Map the pfn for direct EPT mode. If the mapping fails and the memslot is
+ * read-only, retry the mapping with the read-only flag.
+ */
+kvm_pfn_t gfn_to_pfn_try_write(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	kvm_pfn_t pfn;
+	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, !memslot_is_readonly(slot));
+
+	if (kvm_is_error_hva(addr))
+		return KVM_PFN_NOSLOT;
+
+	pfn = hva_to_pfn(addr, false, NULL, true, NULL);
+	if (pfn & KVM_PFN_ERR_FAULT) {
+		if (memslot_is_readonly(slot))
+			pfn = hva_to_pfn(addr, false, NULL, false, NULL);
+	}
+	return pfn;
+}
+
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 			       bool atomic, bool *async, bool write_fault,
 			       bool *writable)
@@ -2571,7 +2593,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
 }
 EXPORT_SYMBOL_GPL(kvm_clear_guest);
 
-static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
+void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
 				    gfn_t gfn)
 {
 	if (memslot && memslot->dirty_bitmap) {
-- 
2.17.1