On Wed, Apr 08, 2020 at 10:23:58AM +0200, Paolo Bonzini wrote:
> Page-not-present async page faults are almost a perfect match for the
> hardware use of #VE (and it might even be possible to let the processor
> deliver the exceptions).

My "async" page fault knowledge is limited, but if the desired behavior is
to reflect a fault into the guest for select EPT Violations, then yes,
enabling EPT Violation #VEs in hardware is doable.  The big gotcha is that
KVM needs to set the suppress #VE bit for all EPTEs when allocating a new
MMU page, otherwise not-present faults on zero-initialized EPTEs will get
reflected into the guest as #VEs.

Attached is a patch that does the prep work in the MMU.  The VMX usage
would be:

	kvm_mmu_set_spte_init_value(VMX_EPT_SUPPRESS_VE_BIT);

when EPT Violation #VEs are enabled.  It's 64-bit only as it uses stosq to
initialize EPTEs.  32-bit could also be supported by doing a memcpy() from
a static page.
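Purely for illustration, a minimal sketch of what the VMX-side hookup might
look like; the helper name and the "enable_ept_violation_ve" flag are
hypothetical, only kvm_mmu_set_spte_init_value() and VMX_EPT_SUPPRESS_VE_BIT
come from the patch and the usage described above:

	/*
	 * Hypothetical VMX setup helper, not part of the patch: opt in to
	 * the non-zero SPTE init value when EPT Violation #VE is enabled.
	 */
	static void vmx_setup_spte_init_value(bool enable_ept_violation_ve)
	{
		/*
		 * Keep the suppress-#VE bit set in all not-present EPTEs so
		 * that stray not-present EPT Violations are not reflected
		 * into the guest.  64-bit only, since the MMU initializes
		 * EPTEs with stosq.
		 */
		if (IS_ENABLED(CONFIG_X86_64) && enable_ept_violation_ve)
			kvm_mmu_set_spte_init_value(VMX_EPT_SUPPRESS_VE_BIT);
	}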
From 078b485e8a64e6d72ebad58bf66f950763ba30bb Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
Date: Mon, 29 Jul 2019 19:23:46 -0700
Subject: [PATCH] KVM: x86/mmu: Allow non-zero init value for shadow PTE

Add support for using a non-zero "init" value for shadow PTEs, which is
required to enable EPT Violation #VEs in hardware.  When #VEs are enabled,
KVM needs to set the "suppress #VE" bit in unused PTEs to avoid
unintentionally reflecting not-present EPT Violations into the guest.

Signed-off-by: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
---
 arch/x86/kvm/mmu.h             |  1 +
 arch/x86/kvm/mmu/mmu.c         | 43 ++++++++++++++++++++++++++++------
 arch/x86/kvm/mmu/paging_tmpl.h |  2 +-
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 8a3b1bce722a..139db8a125d6 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -52,6 +52,7 @@ static inline u64 rsvd_bits(int s, int e)
 }
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
+void kvm_mmu_set_spte_init_value(u64 init_value);
 
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8071952e9cf2..742ea9c254c4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -250,6 +250,8 @@ static u64 __read_mostly shadow_mmio_access_mask;
 static u64 __read_mostly shadow_present_mask;
 static u64 __read_mostly shadow_me_mask;
 
+static u64 __read_mostly shadow_init_value;
+
 /*
  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
@@ -538,6 +540,13 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_set_spte_init_value(u64 init_value)
+{
+	WARN_ON(!IS_ENABLED(CONFIG_X86_64) && init_value);
+	shadow_init_value = init_value;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_spte_init_value);
+
 static u8 kvm_get_shadow_phys_bits(void)
 {
 	/*
@@ -569,6 +578,7 @@ static void kvm_mmu_reset_all_pte_masks(void)
 	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
+	shadow_init_value = 0;
 
 	shadow_phys_bits = kvm_get_shadow_phys_bits();
 
@@ -610,7 +620,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte != 0) && !is_mmio_spte(pte);
+	return (pte != 0 && pte != shadow_init_value && !is_mmio_spte(pte));
 }
 
 static int is_large_pte(u64 pte)
@@ -921,9 +931,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	u64 old_spte = *sptep;
 
 	if (!spte_has_volatile_bits(old_spte))
-		__update_clear_spte_fast(sptep, 0ull);
+		__update_clear_spte_fast(sptep, shadow_init_value);
 	else
-		old_spte = __update_clear_spte_slow(sptep, 0ull);
+		old_spte = __update_clear_spte_slow(sptep, shadow_init_value);
 
 	if (!is_shadow_present_pte(old_spte))
 		return 0;
@@ -953,7 +963,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
  */
 static void mmu_spte_clear_no_track(u64 *sptep)
 {
-	__update_clear_spte_fast(sptep, 0ull);
+	__update_clear_spte_fast(sptep, shadow_init_value);
 }
 
 static u64 mmu_spte_get_lockless(u64 *sptep)
@@ -2473,6 +2483,20 @@ static void clear_sp_write_flooding_count(u64 *spte)
 	__clear_sp_write_flooding_count(sp);
 }
 
+#ifdef CONFIG_X86_64
+static inline void kvm_clear_ptes(void *page)
+{
+	int ign;
+
+	asm volatile (
+		"rep stosq\n\t"
+		: "=c"(ign), "=D"(page)
+		: "a"(shadow_init_value), "c"(4096/8), "D"(page)
+		: "memory"
+	);
+}
+#endif
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -2553,7 +2577,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
-	clear_page(sp->spt);
+#ifdef CONFIG_X86_64
+	if (shadow_init_value)
+		kvm_clear_ptes(sp->spt);
+	else
+#endif
+		clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
 	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
@@ -3515,7 +3544,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	bool fault_handled = false;
-	u64 spte = 0ull;
+	u64 spte = shadow_init_value;
 	uint retry_count = 0;
 
 	if (!page_fault_can_be_fast(error_code))
@@ -3951,7 +3980,7 @@ static bool
 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
 	struct kvm_shadow_walk_iterator iterator;
-	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = shadow_init_value;
 	struct rsvd_bits_validate *rsvd_check;
 	int root, leaf;
 	bool reserved = false;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9bdf9b7d9a96..949deed15933 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1025,7 +1025,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!sp->spt[i])
+		if (!sp->spt[i] || sp->spt[i] == shadow_init_value)
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
-- 
2.24.1
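For completeness, a rough sketch of the memcpy()-from-a-static-page variant
mentioned above as a way to lift the 64-bit-only restriction; the names are
illustrative and not part of the patch:

	/*
	 * Illustrative only: a page pre-filled with shadow_init_value that
	 * new shadow pages could be memcpy()'d from on 32-bit hosts,
	 * instead of the rep stosq loop in kvm_clear_ptes().
	 */
	static u64 shadow_init_page[4096 / sizeof(u64)];

	/* Fill the template page once, e.g. from kvm_mmu_set_spte_init_value(). */
	static void kvm_fill_shadow_init_page(u64 init_value)
	{
		int i;

		for (i = 0; i < 4096 / sizeof(u64); i++)
			shadow_init_page[i] = init_value;
	}

	/* 32-bit friendly replacement for kvm_clear_ptes(). */
	static void kvm_clear_ptes_memcpy(void *page)
	{
		memcpy(page, shadow_init_page, 4096);
	}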