Hi Marcelo,

On Jul 10, 2014, at 3:12 AM, mtosatti@xxxxxxxxxx wrote:

>  struct kvm_vcpu_arch {
>          /*
>           * rip and regs accesses must go through
> @@ -392,6 +402,9 @@
>          struct kvm_mmu_memory_cache mmu_page_cache;
>          struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> +        struct list_head pinned_mmu_pages;
> +        atomic_t nr_pinned_ranges;
> +

I'm not sure a per-vCPU pinned list is a good idea: since all vCPUs
currently share the same page table, a per-vCPU list does not reduce
lock contention. Why not make it global to the VM instead? (A rough
sketch of what I mean is further below.)

>          struct fpu guest_fpu;
>          u64 xcr0;
>          u64 guest_supported_xcr0;
> Index: kvm.pinned-sptes/arch/x86/kvm/mmu.c
> ===================================================================
> --- kvm.pinned-sptes.orig/arch/x86/kvm/mmu.c	2014-07-09 12:05:34.837161264 -0300
> +++ kvm.pinned-sptes/arch/x86/kvm/mmu.c	2014-07-09 12:09:21.856684314 -0300
> @@ -148,6 +148,9 @@
>
>  #define SPTE_HOST_WRITEABLE	(1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
>  #define SPTE_MMU_WRITEABLE	(1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
> +#define SPTE_PINNED	(1ULL << (PT64_SECOND_AVAIL_BITS_SHIFT))
> +
> +#define SPTE_PINNED_BIT	PT64_SECOND_AVAIL_BITS_SHIFT
>
>  #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>
> @@ -327,6 +330,11 @@
>  	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
>  }
>
> +static int is_pinned_spte(u64 spte)
> +{
> +	return spte & SPTE_PINNED && is_shadow_present_pte(spte);
> +}
> +
>  static int is_large_pte(u64 pte)
>  {
>  	return pte & PT_PAGE_SIZE_MASK;
> @@ -1176,6 +1184,16 @@
>  	kvm_flush_remote_tlbs(vcpu->kvm);
>  }
>
> +static bool vcpu_has_pinned(struct kvm_vcpu *vcpu)
> +{
> +	return atomic_read(&vcpu->arch.nr_pinned_ranges);
> +}
> +
> +static void mmu_reload_pinned_vcpus(struct kvm *kvm)
> +{
> +	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD, &vcpu_has_pinned);
> +}
> +
>  /*
>   * Write-protect on the specified @sptep, @pt_protect indicates whether
>   * spte write-protection is caused by protecting shadow page table.
> @@ -1268,7 +1286,8 @@
>  }
>
>  static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			   struct kvm_memory_slot *slot, unsigned long data)
> +			   struct kvm_memory_slot *slot, unsigned long data,
> +			   bool age)
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> @@ -1278,6 +1297,14 @@
>  		BUG_ON(!(*sptep & PT_PRESENT_MASK));
>  		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
>
> +		if (is_pinned_spte(*sptep)) {
> +			/* don't nuke pinned sptes if page aging: return
> +			 * young=yes instead.
> +			 */
> +			if (age)
> +				return 1;
> +			mmu_reload_pinned_vcpus(kvm);
> +		}
>  		drop_spte(kvm, sptep);

This leaves a window between zapping the spte and re-pinning it, so the
guest can fault on the pinned range in the meantime.
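Going back to the per-vCPU list point above, here is roughly what I
mean by making it VM-global (a sketch only; the field placement, the
pinned_range struct and the helper below are illustrative, not taken
from the patch):

/* in struct kvm_arch rather than struct kvm_vcpu_arch: */
        struct list_head pinned_mmu_pages;      /* protected by mmu_lock */
        atomic_t nr_pinned_ranges;

/* illustrative pinned-range descriptor and registration helper */
struct pinned_range {
        gfn_t base_gfn;
        unsigned long npages;
        struct list_head link;
};

static void kvm_mmu_pin_range(struct kvm *kvm, struct pinned_range *p)
{
        spin_lock(&kvm->mmu_lock);
        list_add(&p->link, &kvm->arch.pinned_mmu_pages);
        atomic_inc(&kvm->arch.nr_pinned_ranges);
        spin_unlock(&kvm->mmu_lock);
}

With one list per VM, vcpu_has_pinned() above would simply test the
per-VM counter, and every vCPU sharing the page table sees the same set
of pinned ranges.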
>  		need_tlb_flush = 1;
>  	}
> @@ -1286,7 +1313,8 @@
>  }
>
>  static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			     struct kvm_memory_slot *slot, unsigned long data)
> +			     struct kvm_memory_slot *slot, unsigned long data,
> +			     bool age)
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> @@ -1304,6 +1332,9 @@
>
>  		need_flush = 1;
>
> +		if (is_pinned_spte(*sptep))
> +			mmu_reload_pinned_vcpus(kvm);
> +
>  		if (pte_write(*ptep)) {
>  			drop_spte(kvm, sptep);
>  			sptep = rmap_get_first(*rmapp, &iter);
> @@ -1334,7 +1365,8 @@
>  				int (*handler)(struct kvm *kvm,
>  					       unsigned long *rmapp,
>  					       struct kvm_memory_slot *slot,
> -					       unsigned long data))
> +					       unsigned long data,
> +					       bool age))
>  {
>  	int j;
>  	int ret = 0;
> @@ -1374,7 +1406,7 @@
>  			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
>
>  			for (; idx <= idx_end; ++idx)
> -				ret |= handler(kvm, rmapp++, memslot, data);
> +				ret |= handler(kvm, rmapp++, memslot, data, false);
>  		}
>  	}
>
> @@ -1385,7 +1417,8 @@
>  			  unsigned long data,
>  			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
>  					 struct kvm_memory_slot *slot,
> -					 unsigned long data))
> +					 unsigned long data,
> +					 bool age))
>  {
>  	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
>  }
> @@ -1406,7 +1439,8 @@
>  }
>
>  static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			 struct kvm_memory_slot *slot, unsigned long data)
> +			 struct kvm_memory_slot *slot, unsigned long data,
> +			 bool age)
>  {
>  	u64 *sptep;
>  	struct rmap_iterator uninitialized_var(iter);
> @@ -1421,7 +1455,7 @@
>  	 * out actively used pages or breaking up actively used hugepages.
>  	 */
>  	if (!shadow_accessed_mask) {
> -		young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
> +		young = kvm_unmap_rmapp(kvm, rmapp, slot, data, true);
>  		goto out;
>  	}
>
> @@ -1442,7 +1476,8 @@
>  }
>
>  static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
> -			      struct kvm_memory_slot *slot, unsigned long data)
> +			      struct kvm_memory_slot *slot, unsigned long data,
> +			      bool age)
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> @@ -1480,7 +1515,7 @@
>
>  	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
>
> -	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
> +	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0, false);
>  	kvm_flush_remote_tlbs(vcpu->kvm);
>  }
>
> @@ -2753,7 +2788,8 @@
>  }
>
>  static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
> -				pfn_t pfn, unsigned access, int *ret_val)
> +				pfn_t pfn, unsigned access, int *ret_val,
> +				bool pin)
>  {
>  	bool ret = true;
>
> @@ -2763,8 +2799,14 @@
>  		goto exit;
>  	}
>
> -	if (unlikely(is_noslot_pfn(pfn)))
> +	if (unlikely(is_noslot_pfn(pfn))) {
> +		/* pinned sptes must point to RAM */
> +		if (unlikely(pin)) {
> +			*ret_val = -EFAULT;
> +			goto exit;
> +		}
>  		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
> +	}
>
>  	ret = false;
>  exit:
> @@ -2818,7 +2860,7 @@
>   * - false: let the real page fault path to fix it.
>   */
>  static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
> -			    u32 error_code)
> +			    u32 error_code, bool pin)
>  {
>  	struct kvm_shadow_walk_iterator iterator;
>  	struct kvm_mmu_page *sp;
> @@ -2828,6 +2870,9 @@
>  	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
>  		return false;
>
> +	if (pin)
> +		return false;
> +
>  	if (!page_fault_can_be_fast(error_code))
>  		return false;
>
> @@ -2895,9 +2940,71 @@
>  }
>
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> -			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +			 gva_t gva, pfn_t *pfn, bool write, bool *writable,
> +			 bool pin);
>  static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
>
> +
> +static int get_sptep_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes[4])
> +
> +{
> +	struct kvm_shadow_walk_iterator iterator;
> +	int nr_sptes = 0;
> +
> +	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
> +		return nr_sptes;
> +
> +	for_each_shadow_entry(vcpu, addr, iterator) {
> +		sptes[iterator.level-1] = iterator.sptep;
> +		nr_sptes++;
> +		if (!is_shadow_present_pte(*iterator.sptep))
> +			break;
> +	}
> +
> +	return nr_sptes;
> +}
> +
> +static bool __direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn, bool pin)
> +{
> +	u64 *sptes[4];
> +	int r, i, level;
> +
> +	r = get_sptep_hierarchy(vcpu, gfn << PAGE_SHIFT, sptes);
> +	if (!r)
> +		return false;
> +
> +	level = 5 - r;
> +	if (!is_last_spte(*sptes[level-1], level))
> +		return false;
> +	if (!is_shadow_present_pte(*sptes[level-1]))
> +		return false;
> +
> +	for (i = 0; i < r; i++) {
> +		u64 *sptep = sptes[3-i];
> +		struct kvm_mmu_page *sp = page_header(__pa(sptep));
> +
> +		if (pin) {
> +			sp->pinned = true;
> +			set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		} else {
> +			sp->pinned = false;
> +			clear_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
> +		}
> +	}
> +
> +	return true;
> +}
> +
> +static bool direct_pin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, true);
> +}
> +
> +static bool direct_unpin_sptes(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	return __direct_pin_sptes(vcpu, gfn, false);
> +}
> +

I think we can reuse the infrastructure that marks/clears unsync sptes
to pin/unpin pages; that would let us drop a lot of code from this
patch. A rough sketch of what I mean is below.
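To sketch the unsync-style alternative (everything below except
sp->pinned and SPTE_PINNED_BIT is made up, not existing code):

/* new fields in struct kvm_mmu_page, next to unsync/unsync_children: */
        bool pinned;                    /* sp contains pinned sptes */
        unsigned int pinned_children;   /* analogous to unsync_children */

/* hypothetical helper used where __direct_pin_sptes() walks by hand */
static void mmu_pin_spte(struct kvm_mmu_page *sp, u64 *sptep)
{
        set_bit(SPTE_PINNED_BIT, (unsigned long *)sptep);
        sp->pinned = true;
        /*
         * Propagate the flag to the parents through sp->parent_ptes,
         * the same way mark_unsync()/kvm_mmu_mark_parents_unsync()
         * propagate the unsync state, so the zap and reload paths can
         * see which subtrees contain pinned sptes without the
         * open-coded hierarchy walk in get_sptep_hierarchy().
         */
}

The zap side could then find pinned subtrees the way mmu_unsync_walk()
finds unsync children, rather than keeping a separate list per vCPU.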