On Wed, Apr 06, 2022 at 11:06:41PM +1200, Kai Huang <kai.huang@xxxxxxxxx> wrote: > > diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c > > index 5071e8332db2..ea83927b9231 100644 > > --- a/arch/x86/kvm/mmu/spte.c > > +++ b/arch/x86/kvm/mmu/spte.c > > @@ -29,8 +29,7 @@ u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ > > u64 __read_mostly shadow_user_mask; > > u64 __read_mostly shadow_accessed_mask; > > u64 __read_mostly shadow_dirty_mask; > > -u64 __read_mostly shadow_mmio_value; > > -u64 __read_mostly shadow_mmio_mask; > > +u64 __read_mostly shadow_default_mmio_mask; > > u64 __read_mostly shadow_mmio_access_mask; > > u64 __read_mostly shadow_present_mask; > > u64 __read_mostly shadow_me_mask; > > @@ -59,10 +58,11 @@ u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) > > u64 spte = generation_mmio_spte_mask(gen); > > u64 gpa = gfn << PAGE_SHIFT; > > > > - WARN_ON_ONCE(!shadow_mmio_value); > > + WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value && > > + !kvm_gfn_stolen_mask(vcpu->kvm)); > > > > access &= shadow_mmio_access_mask; > > - spte |= shadow_mmio_value | access; > > + spte |= vcpu->kvm->arch.shadow_mmio_value | access; > > spte |= gpa | shadow_nonpresent_or_rsvd_mask; > > spte |= (gpa & shadow_nonpresent_or_rsvd_mask) > > << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; > > @@ -279,7 +279,8 @@ u64 mark_spte_for_access_track(u64 spte) > > return spte; > > } > > > > -void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) > > +void kvm_mmu_set_mmio_spte_mask(struct kvm *kvm, u64 mmio_value, u64 mmio_mask, > > + u64 access_mask) > > { > > BUG_ON((u64)(unsigned)access_mask != access_mask); > > WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); > > @@ -308,39 +309,32 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) > > WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value)) > > mmio_value = 0; > > > > - shadow_mmio_value = mmio_value; > > - shadow_mmio_mask = mmio_mask; > > + kvm->arch.shadow_mmio_value = mmio_value; > > + kvm->arch.shadow_mmio_mask = mmio_mask; > > shadow_mmio_access_mask = access_mask; > > } > > EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); > > > > -void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) > > +void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only, u64 init_value) > > { > > shadow_user_mask = VMX_EPT_READABLE_MASK; > > shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; > > shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; > > shadow_nx_mask = 0ull; > > shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; > > - shadow_present_mask = has_exec_only ? 0ull : VMX_EPT_READABLE_MASK; > > + shadow_present_mask = > > + (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | init_value; > > This change doesn't seem make any sense. Why should "Suppress #VE" bit be set > for a present PTE? Because W or NX violation also needs #VE. Although the name uses present, it's actually readable. > > shadow_acc_track_mask = VMX_EPT_RWX_MASK; > > shadow_me_mask = 0ull; > > > > shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; > > shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; > > - > > - /* > > - * EPT Misconfigurations are generated if the value of bits 2:0 > > - * of an EPT paging-structure entry is 110b (write/execute). > > - */ > > - kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, > > - VMX_EPT_RWX_MASK, 0); > > } > > EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); > > > > void kvm_mmu_reset_all_pte_masks(void) > > { > > u8 low_phys_bits; > > - u64 mask; > > > > shadow_phys_bits = kvm_get_shadow_phys_bits(); > > > > @@ -389,9 +383,13 @@ void kvm_mmu_reset_all_pte_masks(void) > > * PTEs and so the reserved PA approach must be disabled. > > */ > > if (shadow_phys_bits < 52) > > - mask = BIT_ULL(51) | PT_PRESENT_MASK; > > + shadow_default_mmio_mask = BIT_ULL(51) | PT_PRESENT_MASK; > > Hmm... Not related to this patch, but it seems there's a bug here. On a MKTME > enabled system (but not TDX) with 52 physical bits, the shadow_phys_bits will be > set to < 52 (depending on how many MKTME KeyIDs are configured by BIOS). In > this case, bit 51 is set, but actually bit 51 isn't a reserved bit in this case. > Instead, it is a MKTME KeyID bit. Therefore, above setting won't cause #PF, but > will use a non-zero MKTME keyID to access the physical address. > > Paolo/Sean, any comments here? > > > else > > - mask = 0; > > + shadow_default_mmio_mask = 0; > > +} > > > > - kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); > > +void kvm_mmu_set_default_mmio_spte_mask(u64 mask) > > +{ > > + shadow_default_mmio_mask = mask; > > } > > +EXPORT_SYMBOL_GPL(kvm_mmu_set_default_mmio_spte_mask); > > diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h > > index 8e13a35ab8c9..bde843bce878 100644 > > --- a/arch/x86/kvm/mmu/spte.h > > +++ b/arch/x86/kvm/mmu/spte.h > > @@ -165,8 +165,7 @@ extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ > > extern u64 __read_mostly shadow_user_mask; > > extern u64 __read_mostly shadow_accessed_mask; > > extern u64 __read_mostly shadow_dirty_mask; > > -extern u64 __read_mostly shadow_mmio_value; > > -extern u64 __read_mostly shadow_mmio_mask; > > +extern u64 __read_mostly shadow_default_mmio_mask; > > extern u64 __read_mostly shadow_mmio_access_mask; > > extern u64 __read_mostly shadow_present_mask; > > extern u64 __read_mostly shadow_me_mask; > > @@ -229,10 +228,10 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; > > */ > > extern u8 __read_mostly shadow_phys_bits; > > > > -static inline bool is_mmio_spte(u64 spte) > > +static inline bool is_mmio_spte(struct kvm *kvm, u64 spte) > > { > > - return (spte & shadow_mmio_mask) == shadow_mmio_value && > > - likely(shadow_mmio_value); > > + return (spte & kvm->arch.shadow_mmio_mask) == kvm->arch.shadow_mmio_value && > > + likely(kvm->arch.shadow_mmio_value || kvm_gfn_stolen_mask(kvm)); > > I don't like using kvm_gfn_stolen_mask() to check whether SPTE is MMIO. > kvm_gfn_stolen_mask() really doesn't imply anything regarding to setting up the > value of MMIO SPTE. At least, I guess we can use some is_protected_vm() sort of > things since it implies guest memory is protected therefore legacy way handling > of MMIO doesn't work (i.e. you cannot parse MMIO instruction). As discussed in other thread, let's rename those functions. > > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > > index 07fd892768be..00f88aa25047 100644 > > --- a/arch/x86/kvm/vmx/vmx.c > > +++ b/arch/x86/kvm/vmx/vmx.c > > @@ -7065,6 +7065,14 @@ int vmx_vm_init(struct kvm *kvm) > > if (!ple_gap) > > kvm->arch.pause_in_guest = true; > > > > + /* > > + * EPT Misconfigurations can be generated if the value of bits 2:0 > > + * of an EPT paging-structure entry is 110b (write/execute). > > + */ > > + if (enable_ept) > > + kvm_mmu_set_mmio_spte_mask(kvm, VMX_EPT_MISCONFIG_WX_VALUE, > > + VMX_EPT_MISCONFIG_WX_VALUE, 0); > > Should be: > > kvm_mmu_set_mmio_spte_mask(kvm, VMX_EPT_MISCONFIG_WX_VALUE, > VMX_EPT_RWX_MASK, 0); Thanks for catching it. It's fixed in github repo. -- Isaku Yamahata <isaku.yamahata@xxxxxxxxx>