On Tue, Mar 21, 2023 at 01:21:36PM +0200, Zhi Wang wrote: > On Mon, 20 Mar 2023 13:05:43 -0500 > Michael Roth <michael.roth@xxxxxxx> wrote: > > > On Fri, Mar 17, 2023 at 09:56:11PM -0700, Isaku Yamahata wrote: > > > On Mon, Feb 20, 2023 at 12:37:53PM -0600, > > > Michael Roth <michael.roth@xxxxxxx> wrote: > > > > > > > This callback will do any platform-specific handling needed for > > > > converting pages between shared/private. > > > > > > > > Signed-off-by: Michael Roth <michael.roth@xxxxxxx> > > > > --- > > > > arch/x86/include/asm/kvm-x86-ops.h | 1 + > > > > arch/x86/include/asm/kvm_host.h | 2 ++ > > > > arch/x86/kvm/mmu/mmu.c | 13 +++++++++++++ > > > > include/linux/kvm_host.h | 4 ++++ > > > > virt/kvm/kvm_main.c | 29 +++++++++++++++++++++++++++++ > > > > 5 files changed, 49 insertions(+) > > > > > > > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h > > > > index 72183da010b8..a8aaf532c2ab 100644 > > > > --- a/arch/x86/include/asm/kvm-x86-ops.h > > > > +++ b/arch/x86/include/asm/kvm-x86-ops.h > > > > @@ -132,6 +132,7 @@ KVM_X86_OP(complete_emulated_msr) > > > > KVM_X86_OP(vcpu_deliver_sipi_vector) > > > > KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons); > > > > KVM_X86_OP_OPTIONAL_RET0(fault_is_private); > > > > +KVM_X86_OP_OPTIONAL_RET0(update_mem_attr) > > > > > > > > #undef KVM_X86_OP > > > > #undef KVM_X86_OP_OPTIONAL > > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > > > index f856d689dda0..2da3fb2d5d1b 100644 > > > > --- a/arch/x86/include/asm/kvm_host.h > > > > +++ b/arch/x86/include/asm/kvm_host.h > > > > @@ -1644,6 +1644,8 @@ struct kvm_x86_ops { > > > > void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, > > > > int root_level); > > > > bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault); > > > > + int (*update_mem_attr)(struct kvm_memory_slot *slot, unsigned int attr, > > > > + gfn_t start, gfn_t end); > > > > > > > 
> bool (*has_wbinvd_exit)(void); > > > > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > > index fb3f34b7391c..053bd77bbf52 100644 > > > > --- a/arch/x86/kvm/mmu/mmu.c > > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > > @@ -7251,4 +7251,17 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm, > > > > linfo_update_mixed(gfn, slot, level, mixed); > > > > } > > > > } > > > > + > > > > +void kvm_arch_post_set_memory_attributes(struct kvm *kvm, > > > > + struct kvm_memory_slot *slot, > > > > + unsigned long attrs, > > > > + gfn_t start, gfn_t end) > > > > +{ > > > > + int ret; > > > > + > > > > + ret = static_call(kvm_x86_update_mem_attr)(slot, attrs, start, end); > > > > + if (ret) > > > > + pr_warn_ratelimited("Failed to update GFN range 0x%llx-0x%llx with attributes 0x%lx. Ret: %d\n", > > > > + start, end, attrs, ret); > > > > +} > > > > #endif > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > index fdc59479b3e2..d200b8f45583 100644 > > > > --- a/include/linux/kvm_host.h > > > > +++ b/include/linux/kvm_host.h > > > > @@ -2330,6 +2330,10 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm, > > > > struct kvm_memory_slot *slot, > > > > unsigned long attrs, > > > > gfn_t start, gfn_t end); > > > > +void kvm_arch_post_set_memory_attributes(struct kvm *kvm, > > > > + struct kvm_memory_slot *slot, > > > > + unsigned long attrs, > > > > + gfn_t start, gfn_t end); > > > > > > > > static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) > > > > { > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > index b68574ff6c30..8ec985f1c57d 100644 > > > > --- a/virt/kvm/kvm_main.c > > > > +++ b/virt/kvm/kvm_main.c > > > > @@ -2561,6 +2561,32 @@ static void kvm_mem_attrs_changed(struct kvm *kvm, unsigned long attrs, > > > > kvm_flush_remote_tlbs(kvm); > > > > } > > > > > > > > +static void kvm_post_mem_attrs_changed(struct kvm *kvm, unsigned long attrs, > > > > + gfn_t start_orig, gfn_t end_orig) > > > > 
+{ > > > > + struct kvm_memory_slot *slot; > > > > + struct kvm_memslots *slots; > > > > + struct kvm_memslot_iter iter; > > > > + int i; > > > > + > > > > + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { > > > > + slots = __kvm_memslots(kvm, i); > > > > + > > > > + kvm_for_each_memslot_in_gfn_range(&iter, slots, start_orig, end_orig) { > > > > + gfn_t start, end; > > > > + > > > > + slot = iter.slot; > > > > + start = max(start_orig, slot->base_gfn); > > > > + end = min(end_orig, slot->base_gfn + slot->npages); > > > > + > > > > + if (start >= end) > > > > + continue; > > > > + > > > > + kvm_arch_post_set_memory_attributes(kvm, slot, attrs, start, end); > > > > + } > > > > + } > > > > +} > > > > + > > > > static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, > > > > struct kvm_memory_attributes *attrs) > > > > { > > > > @@ -2602,6 +2628,9 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, > > > > kvm_mmu_invalidate_end(kvm); > > > > KVM_MMU_UNLOCK(kvm); > > > > > > > > + if (i > start) > > > > + kvm_post_mem_attrs_changed(kvm, attrs->attributes, start, i); > > > > + > > > > > > Doesn't kvm_arch_set_memory_attributes() work for you? i.e the following patch. > > > The error check and pr_warn_ratelimited() can be pushed down into the callback. > > > > This is originally how I had but when CONFIG_PREEMPT_COUNT is set this > > will generate warnings for this callback as well as the invalidation > > callback as reported in v7 here: > > > > https://lore.kernel.org/lkml/Y80vhKwQyw8hS%2F22@notebook/ > > > > The main issue is that kvm_mem_attrs_changed() is called while holding > > the KVM MMU lock, which disables preemption. But when updating > > attributes for SNP, we also need to remove private pages from kernel > > directmap, which involves acquiring a mutex which results in > > "BUG: scheduling while atomic" warnings. 
> > > > So that's why we ended up somewhat duplicating some of the logic and > > using a separate callback chain that happens out of KVM MMU lock. > > Let's split the things of changing memory attributes: > > 1) Update the memory attributes in the xa array (Both TDX and SNP) > 2) Zapping the EPT/NPT mappings (Required by TDX) > 3) Update RMP table (Required by SNP) > 4) Update the directmap of kernel (SNP, but I guess TDX needs it as well) I'm not so sure TDX requires this. I was under that impression, but Kirill raised some doubts about this and I'm not sure it's been confirmed. If it's purely an SNP thing then there may not be much value in creating a separate callback for it: https://lore.kernel.org/linux-mm/20221031141426.GA3994099@xxxxxxxxxxxxxxxxxx/T/#meba4ce80709cd3afd3818b61e6419fd800287b9e And for SNP, the current code does the unmapping/RMP update in the same function: [PATCH RFC v8 15/56] x86/sev: Invalidate pages from the direct map when adding them to the RMP table I'm not against splitting RMP/directmap handling, but just want to understand what the requirements are around that a bit better. Does handling the #3 / RMP update / kvm_arch_post_set_memory_attributes stuff outside of MMU lock cause issues on TDX side? What sort of handling is needed in these callbacks for TDX (if anything)? > > Does SNP really need to zap the NPT mappings when changing the memory > attributes? (The new mappings will be created later in the fault). I don't > find this requirement from APM. I don't think we've added anything specifically for SNP. Do you mean the generic kvm_unmap_gfn_range/kvm_flush_remote_tlbs sequence below? kvm_vm_ioctl_set_mem_attributes(): KVM_MMU_LOCK(kvm) kvm_mmu_invalidate_begin() ... KVM_MMU_UNLOCK(kvm) kvm_vm_set_region_attr() // xarray/attribute update ... 
KVM_MMU_LOCK(kvm) kvm_mem_attrs_changed(): flush |= kvm_unmap_gfn_range() if (flush) kvm_flush_remote_tlbs() KVM_MMU_UNLOCK(kvm) In general, when the RMPUPDATE instruction happens, the TLB entries for the GPAs being modified will be flushed, so subsequent nested page fault should be able to obtain the updated mapping based on xarray/#NPF at that point. In that respect *maybe* we don't need to zap the entries there. But if the nested page fault occurs before the RMPUPDATE, I think we would have a race if the above sequence isn't in place to handle the unmap/flush, since in that case we might get a stale mapping because nothing would've forced a tlbflush. There's also stuff like the UPM selftests and SEV lazy-pinning where I think that kvm_unmap_gfn_range() sequence is also needed. But I might be misunderstanding the question here. > If yes, can we postpone the update of the RMP table in the later fault, > like TDX? So that we can save this update_mem_attr x86 ops as things > will be solved in the SNP-specific fault handler. Hmm, I think this would be possible. But it's nice to be able to handle the RMPUPDATE as part of KVM_SET_MEMORY_ATTRIBUTES, since it allows KVM MMU code to rely solely on xarray state and not have to query RMP table to check if a particular PFN needs an RMPUPDATE before mapping it into RMP table. At least... it would *in theory*, if the RMPUPDATE happened under protection of mmu_invalidate_seq (in which case it could inherit all the same protections KVM MMU has around mmu_invalidate_seq/fault->mmu_seq, e.g. letting the guest retry the #PF if fault->mmu_seq is stale). 
But currently, RMPUPDATE (via kvm_arch_post_set_memory_attributes) happens *after* the invalidation sequence above, so in theory a guest could fault on a page just after xarray state is updated, but before the RMPUPDATE has been done, in which case the KVM MMU code would properly map the page according to xarray, but since RMPUPDATE wouldn't have happened yet, the state of the corresponding PFN in RMP table won't match the shared/private access type expected by the guest, so when it tries to access it it will get another #NPF with RMP bit set in the error code, which will get handled as a no-op in handle_rmp_page_fault() (patch #44) and loop like this until the RMPUPDATE is finally done. So it still works out, but maybe not keeping as much in sync with xarray state as it could be. But deferring RMPUPDATE to fault time goes in the other direction of that. Are there benefits/requirements for doing things this way for TDX? I could see it being beneficial in terms of reducing overhead for unneeded page-state transitions, since they are only done on-demand but doesn't seem like it would be that much overhead compared to some of the other operations being done. > > If no, guess we need a x86 ops to tell if a zapping is required. Sorry don't think I quite understand the suggestion. What would this zapping be covering vs. the invalidation sequence that currently happens in kvm_vm_ioctl_set_mem_attributes()? > > Back to the lock, updating RMP table doesn't require a mutex. Taking > the lock is required when updating the directmap. both TDX/SNP requires > this update the directmap when changing memory attributes. Is that confirmed? If so, do you have a pointer to the associated documentation? I'm a bit unclear on this due to above-mentioned discussion. > > Wouldn't it better to factor the touching directmap of kernel part out? It actually needs to happen before the RMPUPDATE. 
As soon as there is a shared->private conversion in the RMP table for a particular PFN, then any access via directmap by any particular kernel thread to any PFN that happens to be in the same physical 2M range can cause an RMP fault on the host, which would be fatal. So the rmpupdate() helper in this series will unmap the directmap entry corresponding to the PFN before a shared->private RMPUPDATE, and restore mappings after private->shared RMPUPDATE. So we could still factor it out, but it would be something like: if (attr == private) kvm_unmap_directmap(start, end) kvm_mem_attrs_changed() if (attr == shared) kvm_map_directmap(start, end) > > Then you can call the x86 ops.update_mem_attr() in kvm_mem_attrs_changed(). > And update the direct kernel mapping for both TDX/SNP in the > kvm_post_mem_attrs_changed(). Or, adjusting for the above logic, move the unmapping/mapping to a new kvm_pre_mem_attrs_changed() and kvm_post_mem_attrs_changed(), respectively. Which seems pretty reasonable to me. Then we can: - drop duplicating the kvm_for_each_memslot_in_gfn_range() walk stuff because we'd just need to know what PFNs to map/unmap from directmap (although we'd still need a loop around kvm_restrictedmem_get_pfn() for the GFN range so not necessarily prettier) - call the RMPUPDATE / corresponding TDX handling via kvm_mem_attrs_changed() which brings it both under KVM MMU lock and also lets it piggyback off the fault->mmu_seq handling so it doesn't get out of sync with xarray during fault time. But would be good to hear others' opinions on this. And also confirm whether TDX needs that pre/post directmap handling or not. Thanks! 
-Mike > > > > > -Mike > > > > > > > > From 7c618c1f3c236c382e64680efcbe7d8a672aa870 Mon Sep 17 00:00:00 2001 > > > Message-Id: <7c618c1f3c236c382e64680efcbe7d8a672aa870.1679114841.git.isaku.yamahata@xxxxxxxxx> > > > In-Reply-To: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@xxxxxxxxx> > > > References: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@xxxxxxxxx> > > > From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx> > > > Date: Fri, 17 Mar 2023 12:00:09 -0700 > > > Subject: [PATCH 4/4] KVM: x86: Add 'set_mem_attr' x86 op > > > > > > This callback will do any platform-specific handling needed for > > > converting pages between shared/private. > > > > > > Originally-by: Michael Roth <michael.roth@xxxxxxx> > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx> > > > --- > > > arch/x86/include/asm/kvm-x86-ops.h | 1 + > > > arch/x86/include/asm/kvm_host.h | 2 ++ > > > arch/x86/kvm/mmu/mmu.c | 1 + > > > 3 files changed, 4 insertions(+) > > > > > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h > > > index dc5f18ac0bd5..956db2ee25a5 100644 > > > --- a/arch/x86/include/asm/kvm-x86-ops.h > > > +++ b/arch/x86/include/asm/kvm-x86-ops.h > > > @@ -100,6 +100,7 @@ KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr) > > > KVM_X86_OP_OPTIONAL_RET0(get_mt_mask) > > > KVM_X86_OP(load_mmu_pgd) > > > KVM_X86_OP(fault_is_private) > > > +KVM_X86_OP_OPTIONAL(set_mem_attr) > > > KVM_X86_OP_OPTIONAL(link_private_spt) > > > KVM_X86_OP_OPTIONAL(free_private_spt) > > > KVM_X86_OP_OPTIONAL(split_private_spt) > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h > > > index 0382d236fbf4..88e11dd3afde 100644 > > > --- a/arch/x86/include/asm/kvm_host.h > > > +++ b/arch/x86/include/asm/kvm_host.h > > > @@ -1731,6 +1731,8 @@ struct kvm_x86_ops { > > > void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, > > > int root_level); > > > bool (*fault_is_private)(struct kvm *kvm, 
gpa_t gpa, u64 error_code); > > > + void (*set_mem_attr)(struct kvm *kvm, struct kvm_memory_slot *slot, > > > + unsigned int attr, gfn_t start, gfn_t end); > > > > > > int (*link_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level, > > > void *private_spt); > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > index 0ec94c72895c..329333486e64 100644 > > > --- a/arch/x86/kvm/mmu/mmu.c > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > @@ -7908,6 +7908,7 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm, > > > gfn_t start, gfn_t end) > > > { > > > kvm_update_lpage_mixed_flag(kvm, slot, true, attrs, start, end); > > > + static_call(kvm_x86_set_mem_attr)(kvm, slot, attrs, start, end); > > > } > > > > > > void kvm_memory_attributes_create_memslot(struct kvm *kvm, > > > -- > > > 2.25.1 > > > > > > -- > > > Isaku Yamahata <isaku.yamahata@xxxxxxxxx> > >