This change implements lockless access tracking for Intel CPUs without EPT A bits. This is achieved by marking the PTEs as not-present (but not completely clearing them) when clear_flush_young() is called after marking the pages as accessed. When an EPT Violation is generated as a result of the VM accessing those pages, the PTEs are restored to their original values. Signed-off-by: Junaid Shahid <junaids@xxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 3 +- arch/x86/include/asm/vmx.h | 9 +- arch/x86/kvm/mmu.c | 274 +++++++++++++++++++++++++++++++--------- arch/x86/kvm/vmx.c | 26 ++-- arch/x86/kvm/x86.c | 2 +- 5 files changed, 237 insertions(+), 77 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5a10eb7..da1d4b9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1056,7 +1056,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 45ee6d9..9d228a8 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -438,11 +438,14 @@ enum vmcs_field { #define VMX_EPT_IPAT_BIT (1ull << 6) #define VMX_EPT_ACCESS_BIT (1ull << 8) #define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ -#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ - VMX_EPT_EXECUTABLE_MASK) - +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3f66fd3..6ba6220 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,7 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/kern_levels.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -129,6 +130,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -178,6 +183,25 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -187,6 +211,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + return shadow_acc_track_mask != 0 && + (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -284,7 +314,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -292,9 +323,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -307,7 +352,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -490,23 +535,20 @@ static bool spte_has_volatile_bits(u64 spte) if (spte_can_locklessly_be_made_writable(spte)) return true; - if (!shadow_accessed_mask) - return false; - if (!is_shadow_present_pte(spte)) return false; - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (!shadow_accessed_mask) + return is_access_track_spte(spte); - return true; + return (spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0); } static bool is_accessed_spte(u64 spte) { return shadow_accessed_mask ? spte & shadow_accessed_mask - : true; + : !is_access_track_spte(spte); } static bool is_dirty_spte(u64 spte) @@ -650,6 +692,65 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Verify that the write-protection that we do below will be fixable + * via the fast page fault path. Currently, that is always the case, at + * least when using EPT (which is when access tracking would be used). + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (spte & shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + return true; + } + + if (shadow_accessed_mask == 0) { + if (is_access_track_spte(spte)) + return false; + + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + return true; + } + + return false; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1434,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1447,7 +1548,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1609,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1631,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator iter; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; for_each_rmap_spte(rmap_head, &iter, sptep) @@ -1670,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -2593,6 +2688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2646,7 +2744,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2879,16 +2977,28 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; - /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. - */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) return false; - return true; + /* + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. + */ + + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } /* @@ -2897,17 +3007,26 @@ static bool page_fault_can_be_fast(u32 error_code) */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, + bool remove_write_prot, bool remove_acc_track) { gfn_t gfn; + u64 new_spte = old_spte; WARN_ON(!sp->role.direct); - /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + if (remove_acc_track) { + u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + } + + if (remove_write_prot) + new_spte |= PT_WRITABLE_MASK; /* * Theoretically we could also set dirty bit (and flush TLB) here in @@ -2921,10 +3040,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) return false; - kvm_vcpu_mark_page_dirty(vcpu, gfn); + if (remove_write_prot) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } @@ -2955,35 +3081,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, break; do { - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - fault_handled = true; - break; - } + bool remove_write_prot = false; + bool remove_acc_track; sp = page_header(__pa(iterator.sptep)); if (!is_last_spte(spte, sp->role.level)) break; /* - * Check if it is a spurious fault caused by TLB lazily flushed. + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. * * Need not check the access of upper level table entries since * they are always ACC_ALL. */ - if (is_writable_pte(spte)) { - fault_handled = true; - break; + + if (error_code & PFERR_FETCH_MASK) { + if ((spte & (shadow_x_mask | shadow_nx_mask)) + == shadow_x_mask) { + fault_handled = true; + break; + } + } else if (error_code & PFERR_WRITE_MASK) { + if (is_writable_pte(spte)) { + fault_handled = true; + break; + } + + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging. + */ + remove_write_prot = + spte_can_locklessly_be_made_writable(spte); + } else { + /* Fault was on Read access */ + if (spte & PT_PRESENT_MASK) { + fault_handled = true; + break; + } } - /* - * Currently, to simplify the code, only the spte - * write-protected by dirty-log can be fast fixed. - */ - if (!spte_can_locklessly_be_made_writable(spte)) + remove_acc_track = is_access_track_spte(spte); + + /* Verify that the fault can be handled in the fast path */ + if (!remove_acc_track && !remove_write_prot) break; /* @@ -2997,7 +3143,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot) break; /* @@ -3006,7 +3152,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, * Documentation/virtual/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, - iterator.sptep, spte); + iterator.sptep, spte, + remove_write_prot, + remove_acc_track); if (fault_handled) break; @@ -5095,6 +5243,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a01e755..50fc078 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6318,6 +6318,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6443,16 +6456,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec59301..c15418d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5955,7 +5955,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); -- 2.8.0.rc3.226.g39d4020 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html