On 09/11/2016 00:00, Junaid Shahid wrote:
> This change implements lockless access tracking for Intel CPUs without EPT
> A bits. This is achieved by marking the PTEs as not-present (but not
> completely clearing them) when clear_flush_young() is called after marking
> the pages as accessed. When an EPT Violation is generated as a result of
> the VM accessing those pages, the PTEs are restored to their original values.
>
> Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/vmx.h |  39 ++++++
>  arch/x86/kvm/mmu.c         | 314 ++++++++++++++++++++++++++++++++++-----------
>  arch/x86/kvm/mmu.h         |   2 +
>  arch/x86/kvm/vmx.c         |  20 ++-
>  4 files changed, 301 insertions(+), 74 deletions(-)
>
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 60991fb..3d63098 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -434,6 +434,45 @@ enum vmcs_field {
>  #define VMX_EPT_IPAT_BIT			(1ull << 6)
>  #define VMX_EPT_ACCESS_BIT			(1ull << 8)
>  #define VMX_EPT_DIRTY_BIT			(1ull << 9)
> +#define VMX_EPT_RWX_MASK			(VMX_EPT_READABLE_MASK |	\
> +						 VMX_EPT_WRITABLE_MASK |	\
> +						 VMX_EPT_EXECUTABLE_MASK)
> +#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
> +
> +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
> +#define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |	\
> +						 VMX_EPT_EXECUTABLE_MASK)
> +

So far so good.

> + * The shift to use for saving the original RWX value when marking the PTE as
> + * not-present for tracking purposes.
> + */
> +#define VMX_EPT_RWX_SAVE_SHIFT			52
> +
> +/*
> + * The shift/mask for determining the type of tracking (if any) being used for a
> + * not-present PTE. Currently, only two bits are used, but more can be added.
> + *
> + * NOTE: Bit 63 is an architecturally ignored bit (and hence can be used for our
> + *       purpose) when the EPT PTE is in a misconfigured state. However, it is
> + *       not necessarily an ignored bit otherwise (even in a not-present state).
> + *       Since the existing MMIO code already uses this bit and since KVM
> + *       doesn't use #VEs currently (where this bit comes into play), so we can
> + *       continue to use it for storing the type. But to be on the safe side,
> + *       we should not set it to 1 in those TRACK_TYPEs where the tracking is
> + *       done via EPT Violations instead of EPT Misconfigurations.
> + */
> +#define VMX_EPT_TRACK_TYPE_SHIFT		62
> +#define VMX_EPT_TRACK_TYPE_MASK			(3ull <<			\
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +
> +/* Sets only bit 62 as the tracking is done by EPT Violations. See note above */
> +#define VMX_EPT_TRACK_ACCESS			(1ull <<			\
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +/* Sets bits 62 and 63. See note above */
> +#define VMX_EPT_TRACK_MMIO			(3ull <<			\
> +						 VMX_EPT_TRACK_TYPE_SHIFT)
> +

I think this is overengineered.  Let's just say bit 62 in the SPTE denotes
"special SPTE" violations, of both the access tracking and MMIO kinds; no
need to use bit 63 as well.  Let's call it SPTE_SPECIAL_MASK and, first of
all, make KVM use it for the fast MMIO case.

Second, is_shadow_present_pte can just be

	return (pte != 0) && !is_mmio_spte(pte);

Third we can add this on top, but let's keep the cases simple.  You can fix:

- shadow_acc_track_saved_bits_mask to 7 (for XWR in the EPT case, but it
  would work just as well for UWP for "traditional" page tables)

- shadow_acc_track_saved_bits_shift to PT64_SECOND_AVAIL_BITS_SHIFT

- shadow_acc_track_value to SPTE_SPECIAL_MASK aka bit 62

In the end, all that needs customization is shadow_acc_track_mask.
That one is 0 if fast access tracking is not used, and
VMX_EPT_RWX_MASK|SPTE_SPECIAL_MASK for ept && !eptad.  So you can add a
single argument to kvm_mmu_set_mask_ptes.

> @@ -490,6 +532,9 @@ static bool spte_has_volatile_bits(u64 spte)
>  	if (spte_can_locklessly_be_made_writable(spte))
>  		return true;
>
> +	if (is_access_track_spte(spte))
> +		return true;
> +
>  	if (!shadow_accessed_mask)
>  		return false;

I think it's worth rewriting the whole function:

	if (spte_is_locklessly_modifiable(spte))
		return true;

	if (!is_shadow_present_pte(spte))
		return false;

	if (shadow_accessed_mask)
		return ((spte & shadow_accessed_mask) == 0 ||
			(is_writable_pte(spte) &&
			 (spte & shadow_dirty_mask) == 0));
	else
		return is_access_track_spte(spte);

> @@ -533,17 +578,21 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
>   * will find a read-only spte, even though the writable spte
>   * might be cached on a CPU's TLB, the return value indicates this
>   * case.
> + *
> + * Returns true if the TLB needs to be flushed
>   */
>  static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  {
>  	u64 old_spte = *sptep;
> -	bool ret = false;
> +	bool flush = false;
> +	bool writable_cleared;
> +	bool acc_track_enabled;
>
>  	WARN_ON(!is_shadow_present_pte(new_spte));
>
>  	if (!is_shadow_present_pte(old_spte)) {
>  		mmu_spte_set(sptep, new_spte);
> -		return ret;
> +		return flush;
>  	}
>
>  	if (!spte_has_volatile_bits(old_spte))
> @@ -551,24 +600,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  	else
>  		old_spte = __update_clear_spte_slow(sptep, new_spte);
>
> +	BUG_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

WARN_ON, please.

>  	/*
>  	 * For the spte updated out of mmu-lock is safe, since
>  	 * we always atomically update it, see the comments in
>  	 * spte_has_volatile_bits().
>  	 */
>  	if (spte_can_locklessly_be_made_writable(old_spte) &&
> -	      !is_writable_pte(new_spte))
> -		ret = true;
> -
> -	if (!shadow_accessed_mask) {
> -		/*
> -		 * We don't set page dirty when dropping non-writable spte.
> -		 * So do it now if the new spte is becoming non-writable.
> -		 */
> -		if (ret)
> -			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
> -		return ret;
> -	}
> +	      !is_writable_pte(new_spte))
> +		flush = true;
>
>  	/*
>  	 * Flush TLB when accessed/dirty bits are changed in the page tables,
> @@ -576,20 +617,34 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
>  	 */
>  	if (spte_is_bit_changed(old_spte, new_spte,
>  				shadow_accessed_mask | shadow_dirty_mask))
> -		ret = true;
> +		flush = true;
>
> -	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
> +	writable_cleared = is_writable_pte(old_spte) &&
> +			   !is_writable_pte(new_spte);
> +	acc_track_enabled = !is_access_track_spte(old_spte) &&
> +			    is_access_track_spte(new_spte);
> +
> +	if (writable_cleared || acc_track_enabled)
> +		flush = true;
> +
> +	if (shadow_accessed_mask ?
> +	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
> +	    acc_track_enabled)

Please introduce a new function spte_is_access_tracking_enabled(u64 old_spte,
u64 new_spte) and use it here:

	if (shadow_accessed_mask ?
	    spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask) :
	    spte_is_access_tracking_enabled(old_spte, new_spte)) {
		flush |= !shadow_accessed_mask;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

>  		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
> -	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
> +
> +	if (shadow_dirty_mask ?
> +	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
> +	    writable_cleared)

writable_cleared can be inline here and written as

	spte_is_bit_cleared(old_spte, new_spte, PT_WRITABLE_MASK)

so

	if (shadow_dirty_mask ?
	    spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask) :
	    spte_is_bit_cleared(old_spte, new_spte, PT_WRITABLE_MASK)) {
		flush |= !shadow_dirty_mask;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

>  		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
>
> -	return ret;
> +	return flush;
>  }

But please anticipate the changes to this function, except for introducing
is_access_track_spte of course, to a separate function.

>  /*
>   * Rules for using mmu_spte_clear_track_bits:
>   * It sets the sptep from present to nonpresent, and track the
>   * state bits, it is used to clear the last level sptep.
> + * Returns non-zero if the PTE was previously valid.
>   */
>  static int mmu_spte_clear_track_bits(u64 *sptep)
>  {
> @@ -604,6 +659,13 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	if (!is_shadow_present_pte(old_spte))
>  		return 0;
>
> +	/*
> +	 * For access tracking SPTEs, the pfn was already marked accessed/dirty
> +	 * when the SPTE was marked for access tracking, so nothing to do here.
> +	 */
> +	if (is_access_track_spte(old_spte))
> +		return 1;

This should go after the "WARN_ON", since that's a valuable check.  In
addition, I think it's a good idea to keep similar idioms between
mmu_spte_update and mmu_spte_clear_track_bits, like this:

	if (shadow_accessed_mask ? old_spte & shadow_accessed_mask
				 : !is_access_track_spte(old_spte))
		kvm_set_pfn_accessed(pfn);
	if (shadow_dirty_mask ? old_spte & shadow_dirty_mask
			      : old_spte & PT_WRITABLE_MASK)
		kvm_set_pfn_dirty(pfn);

	return 1;

or (you pick)

	if (shadow_accessed_mask ? !(old_spte & shadow_accessed_mask)
				 : is_access_track_spte(old_spte))
		return 1;

	kvm_set_pfn_accessed(pfn);
	if (shadow_dirty_mask ? old_spte & shadow_dirty_mask
			      : old_spte & PT_WRITABLE_MASK)
		kvm_set_pfn_dirty(pfn);
	return 1;

>  	pfn = spte_to_pfn(old_spte);
>
>  	/*
> @@ -618,6 +680,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
>  	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
>  					    PT_WRITABLE_MASK))
>  		kvm_set_pfn_dirty(pfn);
> +
>  	return 1;
>  }
>
> @@ -636,6 +699,52 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
>  	return __get_spte_lockless(sptep);
>  }
>
> +static u64 mark_spte_for_access_track(u64 spte)
> +{
> +	if (shadow_acc_track_mask == 0)
> +		return spte;

Should this return spte & ~shadow_accessed_mask if shadow_accessed_mask is
nonzero?  See for example:

 		new_spte &= ~shadow_accessed_mask;

+		new_spte = mark_spte_for_access_track(new_spte);

> +	/*
> +	 * Verify that the write-protection that we do below will be fixable
> +	 * via the fast page fault path. Currently, that is always the case, at
> +	 * least when using EPT (which is when access tracking would be used).
> +	 */
> +	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
> +		  !spte_can_locklessly_be_made_writable(spte),
> +		  "Writable SPTE is not locklessly dirty-trackable\n");
> +
> +	/*
> +	 * Any PTE marked for access tracking should also be marked for dirty
> +	 * tracking (by being non-writable)
> +	 */
> +	spte &= ~PT_WRITABLE_MASK;

This should be implicit in the definition of shadow_acc_track_mask/value,
so it's not necessary (or it can be a WARN_ONCE).

> +	spte &= ~(shadow_acc_track_saved_bits_mask <<
> +		  shadow_acc_track_saved_bits_shift);

Should there be a WARN_ON if these bits are not zero?
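If so, something along these lines should be enough before the clearing
(untested; the message string is only an example):

	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
			  shadow_acc_track_saved_bits_shift),
		  "access-tracked SPTE already has saved bits set\n");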
> +	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
> +		shadow_acc_track_saved_bits_shift;
> +	spte &= ~shadow_acc_track_mask;
> +	spte |= shadow_acc_track_value;
> +
> +	return spte;
> +}
> +
> +/* Returns true if the TLB needs to be flushed */
> +static bool mmu_spte_enable_access_track(u64 *sptep)
> +{
> +	u64 spte = mmu_spte_get_lockless(sptep);
> +
> +	if (is_access_track_spte(spte))
> +		return false;
> +
> +	/* Access tracking should not be enabled if CPU supports A/D bits */
> +	BUG_ON(shadow_accessed_mask != 0);

WARN_ONCE please.  However, I think mmu_spte_enable_access_track should be
renamed to mmu_spte_age and handle the shadow_accessed_mask case as

	clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep);

similar to kvm_age_rmapp.  See below about kvm_age_rmapp.

> +	spte = mark_spte_for_access_track(spte);
> +
> +	return mmu_spte_update(sptep, spte);
> +}
> +
>  static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
>  {
>  	/*
> @@ -1403,6 +1512,25 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  	return kvm_zap_rmapp(kvm, rmap_head);
>  }
>
> +static int kvm_acc_track_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
> +			       struct kvm_memory_slot *slot, gfn_t gfn,
> +			       int level, unsigned long data)
> +{
> +	u64 *sptep;
> +	struct rmap_iterator iter;
> +	int need_tlb_flush = 0;
> +
> +	for_each_rmap_spte(rmap_head, &iter, sptep) {
> +

Unnecessary blank line---but see below about kvm_acc_track_rmapp.

> +		rmap_printk("kvm_acc_track_rmapp: spte %p %llx gfn %llx (%d)\n",
> +			    sptep, *sptep, gfn, level);
> +
> +		need_tlb_flush |= mmu_spte_enable_access_track(sptep);
> +	}
> +
> +	return need_tlb_flush;
> +}
> +
>  static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
>  			     unsigned long data)
> @@ -1419,8 +1547,9 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>
>  restart:
>  	for_each_rmap_spte(rmap_head, &iter, sptep) {
> +

Unnecessary blank line.

>  		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
> -			     sptep, *sptep, gfn, level);
> +			    sptep, *sptep, gfn, level);
>
>  		need_flush = 1;
>
> @@ -1435,6 +1564,8 @@ restart:
>  			new_spte &= ~SPTE_HOST_WRITEABLE;
>  			new_spte &= ~shadow_accessed_mask;
>
> +			new_spte = mark_spte_for_access_track(new_spte);
> +
>  			mmu_spte_clear_track_bits(sptep);
>  			mmu_spte_set(sptep, new_spte);
>  		}
> @@ -1615,24 +1746,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
>  {
>  	u64 *sptep;
>  	struct rmap_iterator iter;
> -	int young = 0;
> -
> -	/*
> -	 * If there's no access bit in the secondary pte set by the
> -	 * hardware it's up to gup-fast/gup to set the access bit in
> -	 * the primary pte or in the page structure.
> -	 */
> -	if (!shadow_accessed_mask)
> -		goto out;
>
>  	for_each_rmap_spte(rmap_head, &iter, sptep) {
> -		if (*sptep & shadow_accessed_mask) {
> -			young = 1;
> -			break;
> -		}
> +		if ((*sptep & shadow_accessed_mask) ||
> +		    (!shadow_accessed_mask && !is_access_track_spte(*sptep)))

This can also be written, like before, as

	if (shadow_accessed_mask ?
	    *sptep & shadow_accessed_mask :
	    !is_access_track_spte(*sptep))

Introducing a new helper is_accessed_spte starts to look like a good idea!
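Something like this, say (an untested sketch, reusing the
is_access_track_spte helper that this patch introduces):

	/* Accessed, either via the A bit or via access tracking? */
	static bool is_accessed_spte(u64 spte)
	{
		return shadow_accessed_mask ? spte & shadow_accessed_mask
					    : !is_access_track_spte(spte);
	}

Then kvm_test_age_rmapp and the equivalent accessed check in
mmu_spte_clear_track_bits could share it.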
> +			return 1;
>  	}
> -out:
> -	return young;
> +
> +	return 0;
>  }
>
>  #define RMAP_RECYCLE_THRESHOLD 1000
> @@ -1669,7 +1790,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
>  	 */
>  	kvm->mmu_notifier_seq++;
>  	return kvm_handle_hva_range(kvm, start, end, 0,
> -				    kvm_unmap_rmapp);
> +				    shadow_acc_track_mask != 0
> +				    ? kvm_acc_track_rmapp
> +				    : kvm_unmap_rmapp);

Please rewrite kvm_age_rmapp to use the new mmu_spte_age instead, and test
if (shadow_accessed_mask || shadow_acc_track_mask) in kvm_age_hva.

> @@ -2877,16 +3003,27 @@ static bool page_fault_can_be_fast(u32 error_code)
>  	if (unlikely(error_code & PFERR_RSVD_MASK))
>  		return false;
>
> -	/*
> -	 * #PF can be fast only if the shadow page table is present and it
> -	 * is caused by write-protect, that means we just need change the
> -	 * W bit of the spte which can be done out of mmu-lock.
> -	 */
> -	if (!(error_code & PFERR_PRESENT_MASK) ||
> -	      !(error_code & PFERR_WRITE_MASK))
> +	/* See if the page fault is due to an NX violation */
> +	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
> +		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
>  		return false;
>
> -	return true;
> +	/*
> +	 * #PF can be fast if:
> +	 * 1. The shadow page table entry is not present, which could mean that
> +	 *    the fault is potentially caused by access tracking (if enabled).
> +	 * 2. The shadow page table entry is present and the fault
> +	 *    is caused by write-protect, that means we just need change the W
> +	 *    bit of the spte which can be done out of mmu-lock.
> +	 *
> +	 * However, if Access Tracking is disabled, then the first condition
> +	 * above cannot be handled by the fast path. So if access tracking is
> +	 * disabled, we return true only if the second condition is met.

Better:

	However, if access tracking is disabled we know that a non-present
	page must be a genuine page fault where we have to create a new SPTE.
	So, if access tracking is disabled, we return true only for write
	accesses to a present page.

> +	 */
> +
> +	return shadow_acc_track_mask != 0 ||
> +	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
> +		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
>  }
>
>  /*
> @@ -2895,17 +3032,24 @@ static bool page_fault_can_be_fast(u32 error_code)
>   */
>  static bool
>  fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> -			u64 *sptep, u64 spte)
> +			u64 *sptep, u64 old_spte,
> +			bool remove_write_prot, bool remove_acc_track)
>  {
>  	gfn_t gfn;
> +	u64 new_spte = old_spte;
>
>  	WARN_ON(!sp->role.direct);
>
> -	/*
> -	 * The gfn of direct spte is stable since it is calculated
> -	 * by sp->gfn.
> -	 */
> -	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +	if (remove_acc_track) {
> +		u64 saved_bits = old_spte & (shadow_acc_track_saved_bits_mask <<
> +					     shadow_acc_track_saved_bits_shift);

You can shift right old_spte here...

> +		new_spte &= ~shadow_acc_track_mask;
> +		new_spte |= saved_bits >> shadow_acc_track_saved_bits_shift;

... instead of shifting saved_bits left here.

> +	}
> +
> +	if (remove_write_prot)
> +		new_spte |= PT_WRITABLE_MASK;
>
>  	/*
>  	 * Theoretically we could also set dirty bit (and flush TLB) here in
> @@ -2919,10 +3063,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>  	 *
>  	 * Compare with set_spte where instead shadow_dirty_mask is set.
>  	 */
> -	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
> +	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
>  		return false;
>
> -	kvm_vcpu_mark_page_dirty(vcpu, gfn);
> +	if (remove_write_prot) {

Stupid question ahead, why not call kvm_vcpu_mark_page_accessed in this
function?

> +		/*
> +		 * The gfn of direct spte is stable since it is
> +		 * calculated by sp->gfn.
> +		 */
> +		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +		kvm_vcpu_mark_page_dirty(vcpu, gfn);
> +	}
>
>  	return true;
>  }
> @@ -2937,7 +3088,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  {
>  	struct kvm_shadow_walk_iterator iterator;
>  	struct kvm_mmu_page *sp;
> -	bool ret = false;
> +	bool fault_handled = false;
>  	u64 spte = 0ull;
>  	uint retry_count = 0;
>
> @@ -2953,36 +3104,43 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  		break;
>
>  	do {
> -		/*
> -		 * If the mapping has been changed, let the vcpu fault on the
> -		 * same address again.
> -		 */
> -		if (!is_shadow_present_pte(spte)) {
> -			ret = true;
> -			break;
> -		}
> +		bool remove_write_prot = (error_code & PFERR_WRITE_MASK) &&
> +					 !(spte & PT_WRITABLE_MASK);
> +		bool remove_acc_track;
> +		bool valid_exec_access = (error_code & PFERR_FETCH_MASK) &&
> +					 (spte & shadow_x_mask);
>
>  		sp = page_header(__pa(iterator.sptep));
>  		if (!is_last_spte(spte, sp->role.level))
>  			break;
>
>  		/*
> -		 * Check if it is a spurious fault caused by TLB lazily flushed.
> +		 * Check whether the memory access that caused the fault would
> +		 * still cause it if it were to be performed right now. If not,
> +		 * then this is a spurious fault caused by TLB lazily flushed,
> +		 * or some other CPU has already fixed the PTE after the
> +		 * current CPU took the fault.
> +		 *
> +		 * If Write-Only mappings ever become supported, then the
> +		 * condition below would need to be changed appropriately.
>  		 *
>  		 * Need not check the access of upper level table entries since
>  		 * they are always ACC_ALL.
>  		 */
> -		if (is_writable_pte(spte)) {
> -			ret = true;
> +		if (((spte & PT_PRESENT_MASK) && !remove_write_prot) ||
> +		    valid_exec_access) {
> +			fault_handled = true;
>  			break;
>  		}

Let's separate the three conditions (R/W/X):

	if (error_code & PFERR_FETCH_MASK) {
		if ((spte & (shadow_x_mask|shadow_nx_mask)) == shadow_x_mask) {
			fault_handled = true;
			break;
		}
	}
	if (error_code & PFERR_WRITE_MASK) {
		if (is_writable_pte(spte)) {
			fault_handled = true;
			break;
		}
		remove_write_prot = spte_can_locklessly_be_made_writable(spte);
	}
	if (!(error_code & PFERR_PRESENT_MASK)) {
		if (!is_access_track_spte(spte)) {
			fault_handled = true;
			break;
		}
		remove_acc_track = true;
	}

> +		remove_acc_track = is_access_track_spte(spte);
> +
>  		/*
> -		 * Currently, to simplify the code, only the spte
> -		 * write-protected by dirty-log can be fast fixed.
> +		 * Currently, to simplify the code, write-protection can be
> +		 * removed in the fast path only if the SPTE was write-protected
> +		 * for dirty-logging.
>  		 */
> -		if (!spte_can_locklessly_be_made_writable(spte))
> -			break;
> +		remove_write_prot &= spte_can_locklessly_be_made_writable(spte);
>
>  		/*
>  		 * Do not fix write-permission on the large spte since we only
> @@ -2998,13 +3156,20 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
>  			break;
>
> +		/* Verify that the fault can be handled in the fast path */
> +		if (!remove_acc_track && !remove_write_prot)
> +			break;
> +
>  		/*
>  		 * Currently, fast page fault only works for direct mapping
>  		 * since the gfn is not stable for indirect shadow page. See
>  		 * Documentation/virtual/kvm/locking.txt to get more detail.
>  		 */
> -		ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
> -		if (ret)
> +		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
> +							iterator.sptep, spte,
> +							remove_write_prot,
> +							remove_acc_track);
> +		if (fault_handled)
>  			break;
>
>  		if (++retry_count > 4) {
> @@ -3018,10 +3183,10 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
>  	} while (true);
>
>  	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
> -			      spte, ret);
> +			      spte, fault_handled);
>  	walk_shadow_page_lockless_end(vcpu);
>
> -	return ret;
> +	return fault_handled;
>  }
>
>  static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
> @@ -4300,6 +4465,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
>  	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
>  }
>
> +/* This is only supposed to be used for non-EPT mappings */

It's only used for non-EPT mappings, why is it only *supposed* to be used
for non-EPT mappings?  It seems to me that it would work.

>  static bool need_remote_flush(u64 old, u64 new)
>  {
>  	if (!is_shadow_present_pte(old))
> @@ -5067,6 +5233,8 @@ static void mmu_destroy_caches(void)
>
>  int kvm_mmu_module_init(void)
>  {
> +	kvm_mmu_clear_all_pte_masks();
> +
>  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
>  					    sizeof(struct pte_list_desc),
>  					    0, 0, NULL);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index ddc56e9..dfd3056 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -52,6 +52,8 @@ static inline u64 rsvd_bits(int s, int e)
>  }
>
>  void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
> +void kvm_mmu_set_access_track_masks(u64 acc_track_mask, u64 acc_track_value,
> +				    u64 saved_bits_mask, u64 saved_bits_shift);
>
>  void
>  reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 88e3b02..363517e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -5019,7 +5019,22 @@ static void ept_set_mmio_spte_mask(void)
>  	 * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
>  	 * spte.
>  	 */
> -	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
> +	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE |
> +				   VMX_EPT_TRACK_MMIO);
> +}
> +
> +static void ept_set_acc_track_spte_mask(void)
> +{
> +	/*
> +	 * For access track PTEs we use a non-present PTE to trigger an EPT
> +	 * Violation. The original RWX value is saved in some unused bits in
> +	 * the PTE and restored when the violation is fixed.
> +	 */
> +	kvm_mmu_set_access_track_masks(VMX_EPT_RWX_MASK |
> +				       VMX_EPT_TRACK_TYPE_MASK,
> +				       VMX_EPT_TRACK_ACCESS,
> +				       VMX_EPT_RWX_MASK,
> +				       VMX_EPT_RWX_SAVE_SHIFT);
>  }
>
>  #define VMX_XSS_EXIT_BITMAP 0
> @@ -6551,6 +6566,9 @@ static __init int hardware_setup(void)
>  			0ull : VMX_EPT_READABLE_MASK);
>  		ept_set_mmio_spte_mask();
>  		kvm_enable_tdp();
> +
> +		if (!enable_ept_ad_bits)
> +			ept_set_acc_track_spte_mask();

Let's put the whole "then" block in a single function vmx_enable_tdp.

Thanks,

Paolo

>  	} else
>  		kvm_disable_tdp();
>

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html