Allow vcpus to pin spte translations by:

1) Creating a per-vcpu list of pinned ranges.
2) On mmu reload request:
	- Fault ranges.
	- Mark sptes with a pinned bit.
	- Mark shadow pages as pinned.

3) Then modify the following actions:
	- Page age => skip spte flush.
	- MMU notifiers => force mmu reload request (which kicks cpu out of
	  guest mode).
	- GET_DIRTY_LOG => force mmu reload request.
	- SLAB shrinker => skip shadow page deletion.

TDP-only.

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

---
 arch/x86/include/asm/kvm_host.h |   14 ++
 arch/x86/kvm/mmu.c              |  202 ++++++++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu.h              |    5
 arch/x86/kvm/mmutrace.h         |   23 ++++
 arch/x86/kvm/paging_tmpl.h      |    2
 arch/x86/kvm/x86.c              |    4
 6 files changed, 241 insertions(+), 9 deletions(-)

Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:17.549456614 -0300
+++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:24.338435658 -0300
@@ -221,6 +221,8 @@
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
 	bool unsync;
+	bool pinned;
+
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
@@ -337,6 +339,14 @@
 	KVM_DEBUGREG_WONT_EXIT = 2,
 };
 
+struct kvm_pinned_page_range {
+	gfn_t base_gfn;
+	unsigned long npages;
+	struct list_head link;
+};
+
+#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -392,6 +402,10 @@
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
+	struct list_head pinned_mmu_pages;
+	struct mutex pinned_mmu_mutex;
+	unsigned int nr_pinned_ranges;
+
 	struct fpu guest_fpu;
 	u64 xcr0;
 	u64 guest_supported_xcr0;
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-06-18 17:28:17.550456611 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-06-18 17:28:24.339435654 -0300
@@ -148,6 +148,9 @@
 #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
+#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
+
+#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -327,6 +330,11 @@
 	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
+static int is_pinned_spte(u64 spte)
+{
+	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
+}
+
 static int is_large_pte(u64 pte)
 {
 	return pte & PT_PAGE_SIZE_MASK;
@@ -2818,7 +2826,7 @@
  * - false: let the real page fault path to fix it.
  */
 static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
-			    u32 error_code)
+			    u32 error_code, bool pin)
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
@@ -2828,6 +2836,9 @@
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return false;
 
+	if (pin)
+		return false;
+
 	if (!page_fault_can_be_fast(error_code))
 		return false;
@@ -2895,9 +2906,55 @@
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin);
 static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
+
+static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
+{
+	struct kvm_shadow_walk_iterator iterator;
+	int nr_sptes = 0;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return nr_sptes;
+
+	for_each_shadow_entry(vcpu, addr, iterator) {
+		sptes[iterator.level-1] = iterator.sptep;
+		nr_sptes++;
+		if (!is_shadow_present_pte(*iterator.sptep))
+			break;
+	}
+
+	return nr_sptes;
+}
+
+static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	u64 *sptes[4];
+	int r, i, level;
+
+	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
+	if (!r)
+		return false;
+
+	level = 5 - r;
+	if (!is_last_spte(*sptes[r-1], level))
+		return false;
+	if (!is_shadow_present_pte(*sptes[r-1]))
+		return false;
+
+	for (i = 0; i < r; i++) {
+		u64 *sptep = sptes[i];
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+		sp->pinned = true;
+		set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
+	}
+
+	return true;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 			 gfn_t gfn, bool prefault, bool pin, bool *pinned)
 {
@@ -2923,13 +2980,14 @@
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
-	if (fast_page_fault(vcpu, v, level, error_code))
+	if (fast_page_fault(vcpu, v, level, error_code, pin))
 		return 0;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
+			 pin))
 		return 0;
 
 	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
@@ -2943,6 +3001,8 @@
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
 			 prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);
 
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3349,7 +3409,8 @@
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
+			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
+			 bool pin)
 {
 	bool async;
@@ -3358,7 +3419,7 @@
 	if (!async)
 		return false; /* *pfn has correct page already */
 
-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && !pin && can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -3406,13 +3467,14 @@
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
-	if (fast_page_fault(vcpu, gpa, level, error_code))
+	if (fast_page_fault(vcpu, gpa, level, error_code, pin))
 		return 0;
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
+			 pin))
 		return 0;
 
 	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
@@ -3426,6 +3488,8 @@
 		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
 	r = __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn,
 			 prefault);
+	if (pin)
+		*pinned = direct_pin_sptes(vcpu, gfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -3903,6 +3967,127 @@
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				  gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+			return -EEXIST;
+		}
+	}
+	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+
+	if (vcpu->arch.nr_pinned_ranges >=
+	    KVM_MAX_PER_VCPU_PINNED_RANGE)
+		return -ENOSPC;
+
+	p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	vcpu->arch.nr_pinned_ranges++;
+
+	trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
+
+	INIT_LIST_HEAD(&p->link);
+	p->base_gfn = base_gfn;
+	p->npages = npages;
+	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+	list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
+	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+
+	return 0;
+}
+
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				    gfn_t base_gfn, unsigned long npages)
+{
+	struct kvm_pinned_page_range *p;
+
+	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		if (p->base_gfn == base_gfn && p->npages == npages) {
+			list_del(&p->link);
+			vcpu->arch.nr_pinned_ranges--;
+			mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+			kfree(p);
+			return 0;
+		}
+	}
+
+	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+	return -ENOENT;
+}
+
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p, *p2;
+
+	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+	list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
+		list_del(&p->link);
+		kfree(p);
+	}
+	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+}
+
+/*
+ * Pin KVM MMU page translations. This guarantees that, for addresses
+ * registered by kvm_mmu_register_pinned_range (valid address meaning an
+ * address which possesses sufficient information for the fault to be
+ * resolved), valid translations exist while in guest mode and therefore
+ * no VM-exits due to faults will occur.
+ *
+ * Failure to instantiate pages will abort guest entry.
+ *
+ * Page frames should be pinned with get_page in advance.
+ *
+ * Pinning is not guaranteed while executing as L2 guest.
+ *
+ */
+
+static void kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pinned_page_range *p;
+
+	if (is_guest_mode(vcpu))
+		return;
+
+	if (!vcpu->arch.mmu.direct_map)
+		return;
+
+	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
+	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
+		gfn_t gfn_offset;
+
+		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
+			gfn_t gfn = p->base_gfn + gfn_offset;
+			int r;
+			bool pinned = false;
+
+			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
+						      PFERR_WRITE_MASK, false,
+						      true, &pinned);
+			/* MMU notifier sequence window: retry */
+			if (!r && !pinned)
+				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+			if (r) {
+				kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+				break;
+			}
+
+		}
+	}
+	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
+}
+
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -3916,6 +4101,7 @@
 		goto out;
 	/* set_cr3() should ensure TLB has been flushed */
 	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	kvm_mmu_pin_pages(vcpu);
 out:
 	return r;
 }
Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h	2014-06-18 17:27:47.582549238 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmu.h	2014-06-18 17:28:24.339435654 -0300
@@ -178,4 +178,9 @@
 }
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
+				  gfn_t base_gfn, unsigned long npages);
+int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
+				    gfn_t base_gfn, unsigned long npages);
+void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
 #endif
Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-06-18 17:28:17.552456605 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/x86.c	2014-06-18 17:28:24.340435651 -0300
@@ -7049,6 +7049,8 @@
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
+	INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
+	mutex_init(&vcpu->arch.pinned_mmu_mutex);
 	return 0;
 fail_free_wbinvd_dirty_mask:
@@ -7069,6 +7071,7 @@
 {
 	int idx;
 
+	kvm_mmu_free_pinned_ranges(vcpu);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
@@ -7113,6 +7116,7 @@
 	int r;
 	r = vcpu_load(vcpu);
 	BUG_ON(r);
+	kvm_mmu_free_pinned_ranges(vcpu);
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 }
Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:17.550456611 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:24.340435651 -0300
@@ -747,7 +747,7 @@
 	smp_rmb();
 
 	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
-			 &map_writable))
+			 &map_writable, false))
 		return 0;
 
 	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
===================================================================
--- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h	2014-06-18 17:27:47.583549234 -0300
+++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h	2014-06-18 17:28:24.340435651 -0300
@@ -322,6 +322,29 @@
 		__entry->kvm_gen == __entry->spte_gen
 	)
 );
+
+TRACE_EVENT(
+	kvm_mmu_register_pinned_range,
+	TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
+	TP_ARGS(vcpu_id, gfn, npages),
+
+	TP_STRUCT__entry(
+		__field( unsigned int, vcpu_id )
+		__field( gfn_t, gfn )
+		__field( unsigned long, npages )
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->gfn = gfn;
+		__entry->npages = npages;
+	),
+
+	TP_printk("vcpu_id %u gfn %llx npages %lx",
+		  __entry->vcpu_id,
+		  __entry->gfn,
+		  __entry->npages)
+);
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
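
As a usage illustration (not part of this patch), a minimal sketch of how an
in-kernel caller might register a pinned range on one vcpu. The helper name
pin_guest_buffer() and its gpa/len parameters are assumptions made for the
example; the only interface provided above is kvm_mmu_register_pinned_range(),
and the actual faulting and pinning happens later, in kvm_mmu_load(), via the
KVM_REQ_MMU_RELOAD request the registration raises.

/*
 * Hypothetical caller sketch: register a guest buffer as a pinned range.
 * pin_guest_buffer(), gpa and len are illustrative names only.
 */
static int pin_guest_buffer(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned long len)
{
	gfn_t base_gfn = gpa >> PAGE_SHIFT;
	unsigned long npages = DIV_ROUND_UP(len, PAGE_SIZE);
	int r;

	r = kvm_mmu_register_pinned_range(vcpu, base_gfn, npages);
	if (r == -EEXIST)
		return 0;	/* range already registered: nothing to do */

	/* Translations are instantiated on the next kvm_mmu_load(). */
	return r;
}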
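The changelog also says page aging will skip the spte flush for pinned
translations; that change is not part of this hunk. A rough, assumed sketch of
how an aging path could consult the new pinned bit follows; the helper name
spte_can_be_aged() is invented for illustration and is not introduced by the
patch.

/*
 * Hypothetical aging check: pinned sptes are left untouched when pages
 * are aged, so the translation stays valid while in guest mode.
 */
static bool spte_can_be_aged(u64 spte)
{
	if (is_pinned_spte(spte))
		return false;	/* keep pinned translations resident */

	return shadow_accessed_mask && (spte & shadow_accessed_mask);
}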