On Wed, Jun 18, 2014 at 08:12:05PM -0300, mtosatti@xxxxxxxxxx wrote:
> Allow vcpus to pin spte translations by:
> 
> 1) Creating a per-vcpu list of pinned ranges.

What if the memory slot containing a pinned range is going away?

> 2) On mmu reload request:
> 	- Fault ranges.
> 	- Mark sptes with a pinned bit.

They should also be marked "dirty", as per the SDM:

    "The three DS save area sections should be allocated from a
     non-paged pool, and marked accessed and dirty."

Some comments below.
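For the accessed/dirty part, something along these lines in
direct_pin_sptes(), right after the loop that sets SPTE_PINNED_BIT, is what
I have in mind (untested sketch; it assumes the existing
shadow_accessed_mask/shadow_dirty_mask globals and mmu_spte_update() in
mmu.c can be reused here, and with EPT it obviously only does something
when A/D bits are supported):

	/*
	 * Pre-set accessed and dirty on the pinned leaf spte so the CPU
	 * never has to fault to set them while it writes the DS save area.
	 */
	mmu_spte_update(sptes[r-1], *sptes[r-1] |
			shadow_accessed_mask | shadow_dirty_mask);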
> 	- Mark shadow pages as pinned.
> 
> 3) Then modify the following actions:
> 	- Page age => skip spte flush.
> 	- MMU notifiers => force mmu reload request (which kicks cpu out of
> 	  guest mode).
> 	- GET_DIRTY_LOG => force mmu reload request.
> 	- SLAB shrinker => skip shadow page deletion.
> 
> TDP-only.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>
> 
> ---
>  arch/x86/include/asm/kvm_host.h |   14 ++
>  arch/x86/kvm/mmu.c              |  202 ++++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu.h              |    5 
>  arch/x86/kvm/mmutrace.h         |   23 ++++
>  arch/x86/kvm/paging_tmpl.h      |    2 
>  arch/x86/kvm/x86.c              |    4 
>  6 files changed, 241 insertions(+), 9 deletions(-)
> 
> Index: kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:17.549456614 -0300
> +++ kvm.pinned-sptes/arch/x86/include/asm/kvm_host.h	2014-06-18 17:28:24.338435658 -0300
> @@ -221,6 +221,8 @@
>  	/* hold the gfn of each spte inside spt */
>  	gfn_t *gfns;
>  	bool unsync;
> +	bool pinned;
> +
>  	int root_count;          /* Currently serving as active root */
>  	unsigned int unsync_children;
>  	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
> @@ -337,6 +339,14 @@
>  	KVM_DEBUGREG_WONT_EXIT = 2,
>  };
>  
> +struct kvm_pinned_page_range {
> +	gfn_t base_gfn;
> +	unsigned long npages;
> +	struct list_head link;
> +};
> +
> +#define KVM_MAX_PER_VCPU_PINNED_RANGE 10
> +
>  struct kvm_vcpu_arch {
>  	/*
>  	 * rip and regs accesses must go through
> @@ -392,6 +402,10 @@
>  	struct kvm_mmu_memory_cache mmu_page_cache;
>  	struct kvm_mmu_memory_cache mmu_page_header_cache;
>  
> +	struct list_head pinned_mmu_pages;
> +	struct mutex pinned_mmu_mutex;
> +	unsigned int nr_pinned_ranges;
> +
>  	struct fpu guest_fpu;
>  	u64 xcr0;
>  	u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-06-18 17:28:24.339435654 -0300
> @@ -148,6 +148,9 @@
>  
>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED		(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT PT64_SECOND_AVAIL_BITS_SHIFT
>  
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>  
> @@ -327,6 +330,11 @@
>  	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
>  }
>  
> +static int is_pinned_spte(u64 spte)
> +{
> +	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
>  static int is_large_pte(u64 pte)
>  {
>  	return pte & PT_PAGE_SIZE_MASK;
> @@ -2818,7 +2826,7 @@
>   * - false: let the real page fault path to fix it.
>   */
>  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> -			    u32 error_code)
> +			    u32 error_code, bool pin)
>  {
>  	struct kvm_shadow_walk_iterator iterator;
>  	struct kvm_mmu_page *sp;
> @@ -2828,6 +2836,9 @@
>  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>  		return false;
>  
> +	if (pin)
> +		return false;
> +
>  	if (!page_fault_can_be_fast(error_code))
>  		return false;
>  
> @@ -2895,9 +2906,55 @@
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +			 bool pin);
>  static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
>  
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	int nr_sptes = 0;
> +
> +	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> +		return nr_sptes;
> +
> +	for_each_shadow_entry(vcpu, addr, iterator) {
> +		sptes[iterator.level-1] = iterator.sptep;
> +		nr_sptes++;
> +		if (!is_shadow_present_pte(*iterator.sptep))
> +			break;
> +	}
> +
> +	return nr_sptes;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	u64 *sptes[4];
> +	int r, i, level;
> +
> +	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> +	if (!r)
> +		return false;
> +
> +	level = 5 - r;
> +	if (!is_last_spte(*sptes[r-1], level))
> +		return false;
> +	if (!is_shadow_present_pte(*sptes[r-1]))
> +		return false;
> +
> +	for (i = 0; i < r; i++) {
> +		u64 *sptep = sptes[i];
> +		struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> +		sp->pinned = true;
> +		set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +	}
> +
> +	return true;
> +}
> +
>  static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
>  			 gfn_t gfn, bool prefault, bool pin, bool *pinned)
>  {
> @@ -2923,13 +2980,14 @@
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
>  
> -	if (fast_page_fault(vcpu, v, level, error_code))
> +	if (fast_page_fault(vcpu, v, level, error_code, pin))
>  		return 0;
>  
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
> +	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable,
> +			 pin))
>  		return 0;
>  
>  	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
> @@ -2943,6 +3001,8 @@
>  		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
>  	r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
>  			 prefault);
> +	if (pin)
> +		*pinned = direct_pin_sptes(vcpu, gfn);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  
> @@ -3349,7 +3409,8 @@
>  }
>  
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
> +			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +			 bool pin)
>  {
>  	bool async;
>  
> @@ -3358,7 +3419,7 @@
>  	if (!async)
>  		return false; /* *pfn has correct page already */
>  
> -	if (!prefault && can_do_async_pf(vcpu)) {
> +	if (!prefault && !pin && can_do_async_pf(vcpu)) {
>  		trace_kvm_try_async_get_page(gva, gfn);
>  		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
>  			trace_kvm_async_pf_doublefault(gva, gfn);
> @@ -3406,13 +3467,14 @@
>  	} else
>  		level = PT_PAGE_TABLE_LEVEL;
>  
> -	if (fast_page_fault(vcpu, gpa, level, error_code))
> +	if (fast_page_fault(vcpu, gpa, level, error_code, pin))
>  		return 0;
>  
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
> +	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable,
> +			 pin))
>  		return 0;
>  
>  	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
>  		return r;
> @@ -3426,6 +3488,8 @@
>  		transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
>  	r = __direct_map(vcpu, gpa, write, map_writable,
>  			 level, gfn, pfn, prefault);
> +	if (pin)
> +		*pinned = direct_pin_sptes(vcpu, gfn);
>  	spin_unlock(&vcpu->kvm->mmu_lock);
>  
>  	return r;
> @@ -3903,6 +3967,127 @@
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
>  
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> +				  gfn_t base_gfn, unsigned long npages)
> +{
> +	struct kvm_pinned_page_range *p;
> +
> +	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +		if (p->base_gfn == base_gfn && p->npages == npages) {
> +			mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +			return -EEXIST;
> +		}
> +	}
> +	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +
> +	if (vcpu->arch.nr_pinned_ranges >=
> +	    KVM_MAX_PER_VCPU_PINNED_RANGE)
> +		return -ENOSPC;

Shouldn't we refuse to register pinned range if !TDP?
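I.e. something like this at the top of kvm_mmu_register_pinned_range()
(untested; tdp_enabled is already visible in mmu.c, and -EINVAL is just a
suggestion):

	/* Pinning is only implemented for the TDP / direct map case. */
	if (!tdp_enabled)
		return -EINVAL;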
> +
> +	p = kzalloc(sizeof(struct kvm_pinned_page_range), GFP_KERNEL);
> +	if (!p)
> +		return -ENOMEM;
> +
> +	vcpu->arch.nr_pinned_ranges++;
> +
> +	trace_kvm_mmu_register_pinned_range(vcpu->vcpu_id, base_gfn, npages);
> +
> +	INIT_LIST_HEAD(&p->link);
> +	p->base_gfn = base_gfn;
> +	p->npages = npages;
> +	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +	list_add(&p->link, &vcpu->arch.pinned_mmu_pages);
> +	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +	kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> +
> +	return 0;
> +}
> +
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> +				    gfn_t base_gfn, unsigned long npages)
> +{
> +	struct kvm_pinned_page_range *p;
> +
> +	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +		if (p->base_gfn == base_gfn && p->npages == npages) {
> +			list_del(&p->link);
> +			vcpu->arch.nr_pinned_ranges--;
> +			mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +			kfree(p);
> +			return 0;
> +		}
> +	}
> +
> +	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +	return -ENOENT;
> +}
> +
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pinned_page_range *p, *p2;
> +
> +	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +	list_for_each_entry_safe(p, p2, &vcpu->arch.pinned_mmu_pages, link) {
> +		list_del(&p->link);
> +		kfree(p);
> +	}
> +	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
> +/*
> + * Pin KVM MMU page translations. This guarantees, for valid
> + * addresses registered by kvm_mmu_register_pinned_range (valid address
> + * meaning address which posses sufficient information for fault to
> + * be resolved), valid translations exist while in guest mode and
> + * therefore no VM-exits due to faults will occur.
> + *
> + * Failure to instantiate pages will abort guest entry.
> + *
> + * Page frames should be pinned with get_page in advance.
> + *
> + * Pinning is not guaranteed while executing as L2 guest.
> + *
> + */
> +
> +static void kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_pinned_page_range *p;
> +
> +	if (is_guest_mode(vcpu))
> +		return;
> +
> +	if (!vcpu->arch.mmu.direct_map)
> +		return;
> +
> +	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
> +
> +	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
> +	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
> +		gfn_t gfn_offset;
> +
> +		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
> +			gfn_t gfn = p->base_gfn + gfn_offset;
> +			int r;
> +			bool pinned = false;
> +
> +			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
> +						      PFERR_WRITE_MASK, false,
> +						      true, &pinned);
> +			/* MMU notifier sequence window: retry */
> +			if (!r && !pinned)
> +				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
> +			if (r) {
> +				kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

I do not think a triple fault is appropriate here. The reasons for a triple
fault are documented in the SDM, and this is not one of them. What about an
error exit to user space instead?
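Something along these lines, perhaps (untested sketch: kvm_mmu_pin_pages()
returns an error that kvm_mmu_load() propagates via
"r = kvm_mmu_pin_pages(vcpu);", and -EFAULT is an arbitrary choice; if I read
vcpu_enter_guest() correctly, a negative return from kvm_mmu_reload() already
makes KVM_RUN fail back to user space):

static int kvm_mmu_pin_pages(struct kvm_vcpu *vcpu)
{
	struct kvm_pinned_page_range *p;
	int ret = 0;

	if (is_guest_mode(vcpu))
		return 0;

	if (!vcpu->arch.mmu.direct_map)
		return 0;

	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

	mutex_lock(&vcpu->arch.pinned_mmu_mutex);
	list_for_each_entry(p, &vcpu->arch.pinned_mmu_pages, link) {
		gfn_t gfn_offset;

		for (gfn_offset = 0; gfn_offset < p->npages; gfn_offset++) {
			gfn_t gfn = p->base_gfn + gfn_offset;
			int r;
			bool pinned = false;

			r = vcpu->arch.mmu.page_fault(vcpu, gfn << PAGE_SHIFT,
						      PFERR_WRITE_MASK, false,
						      true, &pinned);
			/* MMU notifier sequence window: retry */
			if (!r && !pinned)
				kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
			if (r) {
				/* Unrecoverable: report it to user space. */
				ret = -EFAULT;
				goto out;
			}
		}
	}
out:
	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
	return ret;
}

That way user space can decide whether to retry or kill the guest, instead of
the guest dying with a bogus triple fault.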
> +				break;
> +			}
> +
> +		}
> +	}
> +	mutex_unlock(&vcpu->arch.pinned_mmu_mutex);
> +}
> +
>  int kvm_mmu_load(struct kvm_vcpu *vcpu)
>  {
>  	int r;
> @@ -3916,6 +4101,7 @@
>  		goto out;
>  	/* set_cr3() should ensure TLB has been flushed */
>  	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
> +	kvm_mmu_pin_pages(vcpu);
>  out:
>  	return r;
>  }
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.h	2014-06-18 17:27:47.582549238 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.h	2014-06-18 17:28:24.339435654 -0300
> @@ -178,4 +178,9 @@
>  }
>  
>  void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
> +int kvm_mmu_register_pinned_range(struct kvm_vcpu *vcpu,
> +				  gfn_t base_gfn, unsigned long npages);
> +int kvm_mmu_unregister_pinned_range(struct kvm_vcpu *vcpu,
> +				    gfn_t base_gfn, unsigned long npages);
> +void kvm_mmu_free_pinned_ranges(struct kvm_vcpu *vcpu);
>  #endif
> Index: kvm.pinned-sptes/arch/x86/kvm/x86.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/x86.c	2014-06-18 17:28:17.552456605 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/x86.c	2014-06-18 17:28:24.340435651 -0300
> @@ -7049,6 +7049,8 @@
>  
>  	kvm_async_pf_hash_reset(vcpu);
>  	kvm_pmu_init(vcpu);
> +	INIT_LIST_HEAD(&vcpu->arch.pinned_mmu_pages);
> +	mutex_init(&vcpu->arch.pinned_mmu_mutex);
>  
>  	return 0;
>  fail_free_wbinvd_dirty_mask:
> @@ -7069,6 +7071,7 @@
>  {
>  	int idx;
>  
> +	kvm_mmu_free_pinned_ranges(vcpu);
>  	kvm_pmu_destroy(vcpu);
>  	kfree(vcpu->arch.mce_banks);
>  	kvm_free_lapic(vcpu);
> @@ -7113,6 +7116,7 @@
>  	int r;
>  	r = vcpu_load(vcpu);
>  	BUG_ON(r);
> +	kvm_mmu_free_pinned_ranges(vcpu);
>  	kvm_mmu_unload(vcpu);
>  	vcpu_put(vcpu);
>  }
> Index: kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:17.550456611 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/paging_tmpl.h	2014-06-18 17:28:24.340435651 -0300
> @@ -747,7 +747,7 @@
>  	smp_rmb();
>  
>  	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
> -			 &map_writable))
> +			 &map_writable, false))
>  		return 0;
>  
>  	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
> Index: kvm.pinned-sptes/arch/x86/kvm/mmutrace.h
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmutrace.h	2014-06-18 17:27:47.583549234 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmutrace.h	2014-06-18 17:28:24.340435651 -0300
> @@ -322,6 +322,29 @@
>  		__entry->kvm_gen == __entry->spte_gen
>  	)
>  );
> +
> +TRACE_EVENT(
> +	kvm_mmu_register_pinned_range,
> +	TP_PROTO(unsigned int vcpu_id, gfn_t gfn, unsigned long npages),
> +	TP_ARGS(vcpu_id, gfn, npages),
> +
> +	TP_STRUCT__entry(
> +		__field(	unsigned int,	vcpu_id		)
> +		__field(	gfn_t,		gfn		)
> +		__field(	unsigned long,	npages		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->vcpu_id	= vcpu_id;
> +		__entry->gfn		= gfn;
> +		__entry->npages		= npages;
> +	),
> +
> +	TP_printk("vcpu_id %u gfn %llx npages %lx",
> +		  __entry->vcpu_id,
> +		  __entry->gfn,
> +		  __entry->npages)
> +);
>  #endif /* _TRACE_KVMMMU_H */
>  
>  #undef TRACE_INCLUDE_PATH
> 

--
	Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html