Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 7 ++ arch/x86/kvm/mmu/mmu.c | 111 ++++++++++++++++++++++++++------ 2 files changed, 99 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bab87a444d78..b14864f3e8e74 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1138,6 +1138,13 @@ struct kvm_x86_ops { void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3); + bool (*pin_spte)(struct kvm_vcpu *vcpu, gfn_t gfn, int level, + kvm_pfn_t pfn); + void (*drop_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level, + kvm_pfn_t pfn); + void (*zap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level); + void (*unzap_pinned_spte)(struct kvm *kvm, gfn_t gfn, int level); + bool (*has_wbinvd_exit)(void); /* Returns actual tsc_offset set in active VMCS */ diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 182f398036248..cab3b2f2f49c3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -133,6 +133,9 @@ module_param(dbg, bool, 0644); #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) #define SPTE_MMIO_MASK (3ULL << 52) +/* Special SPTE flags that can only be used for non-MMIO SPTEs. */ +#define SPTE_PINNED_MASK BIT_ULL(62) + #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ @@ -211,6 +214,7 @@ enum { RET_PF_EMULATE = 1, RET_PF_INVALID = 2, RET_PF_FIXED = 3, + RET_PF_UNZAPPED = 4, }; struct pte_list_desc { @@ -635,6 +639,11 @@ static bool is_shadow_present_pte(u64 pte) return __is_shadow_present_pte(pte) && !is_mmio_spte(pte); } +static bool is_pinned_pte(u64 pte) +{ + return !!(pte & SPTE_PINNED_MASK); +} + static int is_large_pte(u64 pte) { return pte & PT_PAGE_SIZE_MASK; @@ -937,15 +946,15 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * state bits, it is used to clear the last level sptep. * Returns the old PTE. 
*/ -static u64 mmu_spte_clear_track_bits(u64 *sptep) +static u64 __mmu_spte_clear_track_bits(u64 *sptep, u64 clear_value) { kvm_pfn_t pfn; u64 old_spte = *sptep; if (!spte_has_volatile_bits(old_spte)) - __update_clear_spte_fast(sptep, 0ull); + __update_clear_spte_fast(sptep, clear_value); else - old_spte = __update_clear_spte_slow(sptep, 0ull); + old_spte = __update_clear_spte_slow(sptep, clear_value); if (!is_shadow_present_pte(old_spte)) return old_spte; @@ -968,6 +977,11 @@ static u64 mmu_spte_clear_track_bits(u64 *sptep) return old_spte; } +static inline u64 mmu_spte_clear_track_bits(u64 *sptep) +{ + return __mmu_spte_clear_track_bits(sptep, 0ull); +} + /* * Rules for using mmu_spte_clear_no_track: * Directly clear spte without caring the state bits of sptep, @@ -1399,7 +1413,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmap_head); } -static void rmap_remove(struct kvm *kvm, u64 *spte) +static void rmap_remove(struct kvm *kvm, u64 *spte, u64 old_spte) { struct kvm_mmu_page *sp; gfn_t gfn; @@ -1409,6 +1423,10 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); + + if (is_pinned_pte(old_spte)) + kvm_x86_ops.drop_pinned_spte(kvm, gfn, sp->role.level - 1, + spte_to_pfn(old_spte)); } /* @@ -1446,7 +1464,7 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head, iter->pos = 0; sptep = iter->desc->sptes[iter->pos]; out: - BUG_ON(!is_shadow_present_pte(*sptep)); + BUG_ON(!is_shadow_present_pte(*sptep) && !is_pinned_pte(*sptep)); return sptep; } @@ -1491,8 +1509,8 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) { u64 old_spte = mmu_spte_clear_track_bits(sptep); - if (is_shadow_present_pte(old_spte)) - rmap_remove(kvm, sptep); + if (is_shadow_present_pte(old_spte) || is_pinned_pte(old_spte)) + rmap_remove(kvm, sptep, old_spte); } @@ -1730,17 +1748,49 @@ static bool 
rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); } +static bool kvm_mmu_zap_pinned_spte(struct kvm *kvm, u64 *sptep) +{ + struct kvm_mmu_page *sp; + kvm_pfn_t pfn; + gfn_t gfn; + + if (!(*sptep & SPTE_PINNED_MASK)) + return false; + + sp = sptep_to_sp(sptep); + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = spte_to_pfn(*sptep); + + if (kvm_x86_ops.zap_pinned_spte) + kvm_x86_ops.zap_pinned_spte(kvm, gfn, sp->role.level - 1); + + __mmu_spte_clear_track_bits(sptep, SPTE_PINNED_MASK | pfn << PAGE_SHIFT); + return true; +} + static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; struct rmap_iterator iter; bool flush = false; - while ((sptep = rmap_get_first(rmap_head, &iter))) { +restart: + for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); + if (!is_shadow_present_pte(*sptep)) { + WARN_ON_ONCE(!is_pinned_pte(*sptep)); + continue; + } + + flush = true; + + /* Keep the rmap if the SPTE is pinned. */ + if (kvm_mmu_zap_pinned_spte(kvm, sptep)) + continue; + pte_list_remove(rmap_head, sptep); - flush = true; + goto restart; } return flush; @@ -1774,6 +1824,10 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, need_flush = 1; + /* Pinned pages should not be relocated (obviously). 
*/ + if (WARN_ON_ONCE(is_pinned_pte(*sptep))) + continue; + if (pte_write(*ptep)) { pte_list_remove(rmap_head, sptep); goto restart; @@ -2630,7 +2684,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, struct kvm_mmu_page *child; pte = *spte; - if (is_shadow_present_pte(pte)) { + if (is_shadow_present_pte(pte) || is_pinned_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); if (is_large_pte(pte)) @@ -2639,7 +2693,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } - return true; + return is_shadow_present_pte(pte); } if (is_mmio_spte(pte)) @@ -2987,10 +3041,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte = 0; int ret = 0; struct kvm_mmu_page *sp; + bool is_mmio_pfn; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; + is_mmio_pfn = kvm_is_mmio_pfn(pfn); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; @@ -3023,15 +3080,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) - spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, - kvm_is_mmio_pfn(pfn)); + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, is_mmio_pfn); if (host_writable) spte |= SPTE_HOST_WRITEABLE; else pte_access &= ~ACC_WRITE_MASK; - if (!kvm_is_mmio_pfn(pfn)) + if (!is_mmio_pfn) spte |= shadow_me_mask; spte |= (u64)pfn << PAGE_SHIFT; @@ -3065,6 +3121,12 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (speculative) spte = mark_spte_for_access_track(spte); + if (is_pinned_pte(*sptep) || + (vcpu->arch.mmu->direct_map && !is_mmio_pfn && + kvm_x86_ops.pin_spte && + kvm_x86_ops.pin_spte(vcpu, gfn, level, pfn))) + spte |= SPTE_PINNED_MASK; + set_pte: if (mmu_spte_update(sptep, spte)) ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; @@ -3081,29 +3143,33 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int 
set_spte_ret; int ret = RET_PF_FIXED; bool flush = false; + u64 pte = *sptep; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); - if (is_shadow_present_pte(*sptep)) { + if (is_shadow_present_pte(pte)) { /* * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { + if (level > PG_LEVEL_4K && !is_large_pte(pte)) { struct kvm_mmu_page *child; - u64 pte = *sptep; child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; - } else if (pfn != spte_to_pfn(*sptep)) { + } else if (pfn != spte_to_pfn(pte)) { pgprintk("hfn old %llx new %llx\n", - spte_to_pfn(*sptep), pfn); + spte_to_pfn(pte), pfn); drop_spte(vcpu->kvm, sptep); flush = true; } else was_rmapped = 1; + } else if (is_pinned_pte(pte)) { + WARN_ON_ONCE(pfn != spte_to_pfn(pte)); + ret = RET_PF_UNZAPPED; + was_rmapped = 1; } set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, @@ -3136,6 +3202,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, rmap_recycle(vcpu, sptep, gfn); } + if (ret == RET_PF_UNZAPPED && kvm_x86_ops.unzap_pinned_spte) + kvm_x86_ops.unzap_pinned_spte(vcpu->kvm, gfn, level - 1); + return ret; } @@ -5921,6 +5990,10 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); + /* Pinned page dirty logging is not supported. */ + if (WARN_ON_ONCE(is_pinned_pte(*sptep))) + continue; + /* * We cannot do huge page mapping for indirect shadow pages, * which are found on the last rmap (level = 1) when not using -- 2.28.0