On 01/07/2017 02:26, Peter Feiner wrote:
> Adds the plumbing to disable A/D bits in the MMU based on a new role
> bit, ad_disabled. When A/D is disabled, the MMU operates as though A/D
> aren't available (i.e., using access tracking faults instead).
>
> To avoid SP -> kvm_mmu_page.role.ad_disabled lookups all over the
> place, A/D disablement is now stored in the SPTE. This state is stored
> in the SPTE by tweaking the use of SPTE_SPECIAL_MASK for access
> tracking. Rather than just setting SPTE_SPECIAL_MASK when an
> access-tracking SPTE is non-present, we now always set
> SPTE_SPECIAL_MASK for access-tracking SPTEs.

Here are the only small fixes I'd like to make (not tested yet; I'm
mostly trying to poke holes in my own understanding of the patch):

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index 481b6a9c25d5..f50d45b1e967 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -179,6 +179,10 @@ Shadow pages contain the following information:
     shadow page; it is also used to go back from a struct kvm_mmu_page
     to a memslot, through the kvm_memslots_for_spte_role macro and
     __gfn_to_memslot.
+  role.ad_disabled:
+    Is 1 if the MMU instance cannot use A/D bits.  EPT did not have A/D
+    bits before Haswell; shadow EPT page tables also cannot use A/D bits
+    if the L1 hypervisor does not enable them.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations. See role.direct.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e814fb6248ac..9aeef6ade6b4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,21 +217,24 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
-        return shadow_accessed_mask == 0 || sp->role.ad_disabled;
+        return sp->role.ad_disabled;
 }
 
 static inline bool spte_ad_enabled(u64 spte)
 {
+        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
         return !(spte & shadow_acc_track_value);
 }
 
-static u64 spte_shadow_accessed_mask(u64 spte)
+static inline u64 spte_shadow_accessed_mask(u64 spte)
 {
+        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 }
 
-static u64 spte_shadow_dirty_mask(u64 spte)
+static inline u64 spte_shadow_dirty_mask(u64 spte)
 {
+        MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 }
 
@@ -4328,6 +4329,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
         context->base_role.word = 0;
         context->base_role.smm = is_smm(vcpu);
+        context->base_role.ad_disabled = (shadow_accessed_mask == 0);
         context->page_fault = tdp_page_fault;
         context->sync_page = nonpaging_sync_page;
         context->invlpg = nonpaging_invlpg;
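Just to make sure I have the final encoding right, here is how I would
summarize it.  This is only a user-space sketch with the VMX constants
hard-coded (SPTE_SPECIAL_MASK is bit 62, and shadow_acc_track_mask ends
up being VMX_EPT_RWX_MASK, i.e. 0x7, when access tracking is in use),
not something to apply:

  #include <assert.h>
  #include <stdint.h>

  #define SPTE_SPECIAL_MASK (1ULL << 62)  /* == shadow_acc_track_value */
  #define ACC_TRACK_MASK    0x7ULL        /* VMX_EPT_RWX_MASK */

  static int spte_ad_enabled(uint64_t spte)
  {
          /* A/D status is now carried by every SPTE, not a global mask. */
          return !(spte & SPTE_SPECIAL_MASK);
  }

  static int is_access_track_spte(uint64_t spte)
  {
          /* ad-disabled with all RWX bits clear == access-tracking SPTE. */
          return !spte_ad_enabled(spte) && !(spte & ACC_TRACK_MASK);
  }

  int main(void)
  {
          uint64_t ad = 0x7;                        /* A/D enabled, RWX set */
          uint64_t no_ad = SPTE_SPECIAL_MASK | 0x7; /* A/D disabled, present */
          uint64_t tracked = SPTE_SPECIAL_MASK;     /* access-tracking */

          assert(spte_ad_enabled(ad) && !is_access_track_spte(ad));
          assert(!spte_ad_enabled(no_ad) && !is_access_track_spte(no_ad));
          assert(!spte_ad_enabled(tracked) && is_access_track_spte(tracked));
          return 0;
  }

So SPTE_SPECIAL_MASK now distinguishes the two MMU flavors, and within
the ad-disabled flavor the cleared RWX bits are what mark an SPTE as
access-tracked.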
> Signed-off-by: Peter Feiner <pfeiner@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   3 +-
>  arch/x86/kvm/mmu.c              | 111 +++++++++++++++++++++++++++++-----------
>  arch/x86/kvm/mmutrace.h         |   6 ++-
>  3 files changed, 87 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 695605eb1dfb..2f19c680c3fd 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -254,7 +254,8 @@ union kvm_mmu_page_role {
>                  unsigned cr0_wp:1;
>                  unsigned smep_andnot_wp:1;
>                  unsigned smap_andnot_wp:1;
> -                unsigned :8;
> +                unsigned ad_disabled:1;
> +                unsigned :7;
>  
>                  /*
>                   * This is left at the top of the word so that
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 10b3cfc7b411..e814fb6248ac 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -187,10 +187,9 @@ static u64 __read_mostly shadow_mmio_value;
>  static u64 __read_mostly shadow_present_mask;
>  
>  /*
> - * The mask/value to distinguish a PTE that has been marked not-present for
> - * access tracking purposes.
> - * The mask would be either 0 if access tracking is disabled, or
> - * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
> + * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
> + * Non-present SPTEs with shadow_acc_track_value set are in place for access
> + * tracking.
>   */
>  static u64 __read_mostly shadow_acc_track_mask;
>  static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
> @@ -216,10 +215,29 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
>  
> +static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
> +{
> +        return shadow_accessed_mask == 0 || sp->role.ad_disabled;
> +}
> +
> +static inline bool spte_ad_enabled(u64 spte)
> +{
> +        return !(spte & shadow_acc_track_value);
> +}
> +
> +static u64 spte_shadow_accessed_mask(u64 spte)
> +{
> +        return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
> +}
> +
> +static u64 spte_shadow_dirty_mask(u64 spte)
> +{
> +        return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
> +}
> +
>  static inline bool is_access_track_spte(u64 spte)
>  {
> -        /* Always false if shadow_acc_track_mask is zero.  */
> -        return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
> +        return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
>  }
>  
>  /*
> @@ -329,10 +347,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
>                  u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
>                  u64 acc_track_mask)
>  {
> -        if (acc_track_mask != 0)
> -                acc_track_mask |= SPTE_SPECIAL_MASK;
>          BUG_ON(!dirty_mask != !accessed_mask);
>          BUG_ON(!accessed_mask && !acc_track_mask);
> +        BUG_ON(acc_track_mask & shadow_acc_track_value);
>  
>          shadow_user_mask = user_mask;
>          shadow_accessed_mask = accessed_mask;
> @@ -341,7 +358,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
>          shadow_x_mask = x_mask;
>          shadow_present_mask = p_mask;
>          shadow_acc_track_mask = acc_track_mask;
> -        WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
>  }
>  EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
>  
> @@ -561,7 +577,7 @@ static bool spte_has_volatile_bits(u64 spte)
>              is_access_track_spte(spte))
>                  return true;
>  
> -        if (shadow_accessed_mask) {
> +        if (spte_ad_enabled(spte)) {
>                  if ((spte & shadow_accessed_mask) == 0 ||
>                      (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
>                          return true;
> @@ -572,14 +588,17 @@ static bool spte_has_volatile_bits(u64 spte)
>  
>  static bool is_accessed_spte(u64 spte)
>  {
> -        return shadow_accessed_mask ? spte & shadow_accessed_mask
> -                                    : !is_access_track_spte(spte);
> +        u64 accessed_mask = spte_shadow_accessed_mask(spte);
> +
> +        return accessed_mask ? spte & accessed_mask
> +                             : !is_access_track_spte(spte);
>  }
>  
>  static bool is_dirty_spte(u64 spte)
>  {
> -        return shadow_dirty_mask ? spte & shadow_dirty_mask
> -                                 : spte & PT_WRITABLE_MASK;
> +        u64 dirty_mask = spte_shadow_dirty_mask(spte);
> +
> +        return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
>  }
>  
>  /* Rules for using mmu_spte_set:
> @@ -719,10 +738,10 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  
>  static u64 mark_spte_for_access_track(u64 spte)
>  {
> -        if (shadow_accessed_mask != 0)
> +        if (spte_ad_enabled(spte))
>                  return spte & ~shadow_accessed_mask;
>  
> -        if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
> +        if (is_access_track_spte(spte))
>                  return spte;
>  
>          /*
> @@ -741,7 +760,6 @@ static u64 mark_spte_for_access_track(u64 spte)
>          spte |= (spte & shadow_acc_track_saved_bits_mask) <<
>                  shadow_acc_track_saved_bits_shift;
>          spte &= ~shadow_acc_track_mask;
> -        spte |= shadow_acc_track_value;
>  
>          return spte;
>  }
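The mark_spte_for_access_track() change above is the subtle part for me,
so here is the save/restore round trip as a user-space sketch.  The
constants are my reading of mmu.c (saved bits = EPT R+X, i.e. 0x5,
shifted by 52); W is not saved, and if I read fast_page_fault() right it
is re-added on the next write fault, which also handles dirty tracking:

  #include <assert.h>
  #include <stdint.h>

  #define SPECIAL        (1ULL << 62)  /* shadow_acc_track_value */
  #define ACC_TRACK_MASK 0x7ULL        /* RWX */
  #define SAVED_MASK     0x5ULL        /* R and X */
  #define SAVED_SHIFT    52

  static uint64_t mark_for_access_track(uint64_t spte)
  {
          spte |= (spte & SAVED_MASK) << SAVED_SHIFT; /* stash R and X */
          spte &= ~ACC_TRACK_MASK;                    /* make non-present */
          return spte;                                /* SPECIAL stays set */
  }

  static uint64_t restore_acc_track(uint64_t spte)
  {
          uint64_t saved = (spte >> SAVED_SHIFT) & SAVED_MASK;

          spte &= ~ACC_TRACK_MASK;
          spte &= ~(SAVED_MASK << SAVED_SHIFT);
          return spte | saved;
  }

  int main(void)
  {
          uint64_t spte = SPECIAL | 0x5;  /* ad-disabled, R+X SPTE */

          assert(restore_acc_track(mark_for_access_track(spte)) == spte);
          return 0;
  }

Note how the "spte |= shadow_acc_track_value" line can go away: the bit
is already set on every ad-disabled SPTE, which is exactly the point of
the patch.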
> @@ -753,6 +771,7 @@ static u64 restore_acc_track_spte(u64 spte)
>          u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
>                           & shadow_acc_track_saved_bits_mask;
>  
> +        WARN_ON_ONCE(spte_ad_enabled(spte));
>          WARN_ON_ONCE(!is_access_track_spte(spte));
>  
>          new_spte &= ~shadow_acc_track_mask;
> @@ -771,7 +790,7 @@ static bool mmu_spte_age(u64 *sptep)
>          if (!is_accessed_spte(spte))
>                  return false;
>  
> -        if (shadow_accessed_mask) {
> +        if (spte_ad_enabled(spte)) {
>                  clear_bit((ffs(shadow_accessed_mask) - 1),
>                            (unsigned long *)sptep);
>          } else {
> @@ -1402,6 +1421,22 @@ static bool spte_clear_dirty(u64 *sptep)
>          return mmu_spte_update(sptep, spte);
>  }
>  
> +static bool wrprot_ad_disabled_spte(u64 *sptep)
> +{
> +        bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
> +                                               (unsigned long *)sptep);
> +        if (was_writable)
> +                kvm_set_pfn_dirty(spte_to_pfn(*sptep));
> +
> +        return was_writable;
> +}
> +
> +/*
> + * Gets the GFN ready for another round of dirty logging by clearing the
> + *        - D bit on ad-enabled SPTEs, and
> + *        - W bit on ad-disabled SPTEs.
> + * Returns true iff any D or W bits were cleared.
> + */
>  static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>  {
>          u64 *sptep;
> @@ -1409,7 +1444,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>          bool flush = false;
>  
>          for_each_rmap_spte(rmap_head, &iter, sptep)
> -                flush |= spte_clear_dirty(sptep);
> +                if (spte_ad_enabled(*sptep))
> +                        flush |= spte_clear_dirty(sptep);
> +                else
> +                        flush |= wrprot_ad_disabled_spte(sptep);
>  
>          return flush;
>  }
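Spelling out the dirty-logging fallback for myself: with A/D disabled
there is no D bit to clear, so each logging round write-protects the
SPTE instead, and the next write fault is what reports the page dirty
again.  A toy model of one round (plain C, bit positions are stand-ins
of my choosing):

  #include <assert.h>
  #include <stdbool.h>
  #include <stdint.h>

  #define W_BIT  (1ULL << 1)   /* PT_WRITABLE_MASK */
  #define D_BIT  (1ULL << 9)   /* stand-in for shadow_dirty_mask */
  #define AD_DIS (1ULL << 62)  /* SPTE_SPECIAL_MASK */

  /* One dirty-logging round; returns true if a TLB flush is needed. */
  static bool clear_dirty(uint64_t *spte)
  {
          bool ret;

          if (!(*spte & AD_DIS)) {        /* ad-enabled: clear D */
                  ret = *spte & D_BIT;
                  *spte &= ~D_BIT;
          } else {                        /* ad-disabled: write-protect */
                  ret = *spte & W_BIT;
                  *spte &= ~W_BIT;
          }
          return ret;
  }

  int main(void)
  {
          uint64_t spte = AD_DIS | W_BIT;

          assert(clear_dirty(&spte));     /* W cleared, flush needed */
          assert(!(spte & W_BIT));
          assert(!clear_dirty(&spte));    /* already write-protected */
          return 0;
  }

This is also why the next hunk only sets the D bit on ad-enabled SPTEs
in __rmap_set_dirty(): there is nothing to set in the ad-disabled case.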
> @@ -1432,7 +1470,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
>          bool flush = false;
>  
>          for_each_rmap_spte(rmap_head, &iter, sptep)
> -                flush |= spte_set_dirty(sptep);
> +                if (spte_ad_enabled(*sptep))
> +                        flush |= spte_set_dirty(sptep);
>  
>          return flush;
>  }
> @@ -1464,7 +1503,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
>  }
>  
>  /**
> - * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
> + * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
> + *        protect the page if the D-bit isn't supported.
>   * @kvm: kvm instance
>   * @slot: slot to clear D-bit
>   * @gfn_offset: start of the BITS_PER_LONG pages we care about
> @@ -2389,7 +2429,12 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
>          BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
>  
>          spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
> -               shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
> +               shadow_user_mask | shadow_x_mask;
> +
> +        if (sp_ad_disabled(sp))
> +                spte |= shadow_acc_track_value;
> +        else
> +                spte |= shadow_accessed_mask;
>  
>          mmu_spte_set(sptep, spte);
>  
> @@ -2657,10 +2702,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  {
>          u64 spte = 0;
>          int ret = 0;
> +        struct kvm_mmu_page *sp;
>  
>          if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
>                  return 0;
>  
> +        sp = page_header(__pa(sptep));
> +        if (sp_ad_disabled(sp))
> +                spte |= shadow_acc_track_value;
> +
>          /*
>           * For the EPT case, shadow_present_mask is 0 if hardware
>           * supports exec-only page table entries.  In that case,
> @@ -2669,7 +2719,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>           */
>          spte |= shadow_present_mask;
>          if (!speculative)
> -                spte |= shadow_accessed_mask;
> +                spte |= spte_shadow_accessed_mask(spte);
>  
>          if (pte_access & ACC_EXEC_MASK)
>                  spte |= shadow_x_mask;
> @@ -2726,7 +2776,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>  
>          if (pte_access & ACC_WRITE_MASK) {
>                  kvm_vcpu_mark_page_dirty(vcpu, gfn);
> -                spte |= shadow_dirty_mask;
> +                spte |= spte_shadow_dirty_mask(spte);
>          }
>  
>          if (speculative)
> @@ -2868,16 +2918,16 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
>  {
>          struct kvm_mmu_page *sp;
>  
> +        sp = page_header(__pa(sptep));
> +
>          /*
> -         * Since it's no accessed bit on EPT, it's no way to
> -         * distinguish between actually accessed translations
> -         * and prefetched, so disable pte prefetch if EPT is
> -         * enabled.
> +         * Without accessed bits, there's no way to distinguish between
> +         * actually accessed translations and prefetched, so disable pte
> +         * prefetch if accessed bits aren't available.
>           */
> -        if (!shadow_accessed_mask)
> +        if (sp_ad_disabled(sp))
>                  return;
>  
> -        sp = page_header(__pa(sptep));
>          if (sp->role.level > PT_PAGE_TABLE_LEVEL)
>                  return;
>  
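One ordering detail in set_spte() that took me a second:
shadow_acc_track_value must be ORed into the new SPTE *before*
spte_shadow_accessed_mask(spte) and spte_shadow_dirty_mask(spte) are
consulted, because those helpers derive their answer from the SPTE being
built.  A minimal illustration (user-space, stand-in constants as
before):

  #include <assert.h>
  #include <stdint.h>

  #define SPECIAL (1ULL << 62)  /* shadow_acc_track_value */
  #define A_BIT   (1ULL << 8)   /* stand-in for shadow_accessed_mask */

  static uint64_t spte_shadow_accessed_mask(uint64_t spte)
  {
          return (spte & SPECIAL) ? 0 : A_BIT;
  }

  int main(void)
  {
          /* Right order: mark the SPTE ad-disabled first... */
          uint64_t spte = SPECIAL;
          spte |= spte_shadow_accessed_mask(spte);
          assert(!(spte & A_BIT));        /* no stray A bit */

          /* ...whereas querying first would wrongly set the A bit. */
          uint64_t bad = 0;
          bad |= spte_shadow_accessed_mask(bad);
          bad |= SPECIAL;
          assert(bad & A_BIT);
          return 0;
  }

That is why the sp_ad_disabled() check sits right after set_mmio_spte(),
before any of the mask helpers run.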
> @@ -4624,6 +4674,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>                  mask.smep_andnot_wp = 1;
>                  mask.smap_andnot_wp = 1;
>                  mask.smm = 1;
> +                mask.ad_disabled = 1;
>  
>                  /*
>                   * If we don't have indirect shadow pages, it means no page is
> diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
> index 5a24b846a1cb..8b97a6cba8d1 100644
> --- a/arch/x86/kvm/mmutrace.h
> +++ b/arch/x86/kvm/mmutrace.h
> @@ -30,8 +30,9 @@
>                                                                          \
>          role.word = __entry->role;                                      \
>                                                                          \
> -        trace_seq_printf(p, "sp gen %lx gfn %llx %u%s q%u%s %s%s"       \
> -                         " %snxe root %u %s%c", __entry->mmu_valid_gen, \
> +        trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s"      \
> +                         " %snxe %sad root %u %s%c",                    \
> +                         __entry->mmu_valid_gen,                        \
>                           __entry->gfn, role.level,                      \
>                           role.cr4_pae ? " pae" : "",                    \
>                           role.quadrant,                                 \
> @@ -39,6 +40,7 @@
>                           access_str[role.access],                       \
>                           role.invalid ? " invalid" : "",                \
>                           role.nxe ? "" : "!",                           \
> +                         role.ad_disabled ? "!" : "",                   \
>                           __entry->root_count,                           \
>                           __entry->unsync ? "unsync" : "sync", 0);       \
>          saved_ptr;                                                      \
> 
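And for readers of the resulting trace output: like nxe, the new field
is printed with a "!" prefix, so if I read the format string right an
ad-disabled shadow page shows up as "!ad" and an ad-enabled one as "ad".
A trivial rendering of just those two printf arms:

  #include <stdio.h>

  int main(void)
  {
          int nxe = 0, ad_disabled = 1;

          printf("%snxe %sad\n", nxe ? "" : "!",
                 ad_disabled ? "!" : "");  /* prints "!nxe !ad" */
          return 0;
  }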