Re: [PATCH RFC v8 02/56] KVM: x86: Add 'update_mem_attr' x86 op

Michael Roth <michael.roth@xxxxxxx> · Tue, 21 Mar 2023 20:58:38 -0500

On Tue, Mar 21, 2023 at 01:21:36PM +0200, Zhi Wang wrote:
> On Mon, 20 Mar 2023 13:05:43 -0500
> Michael Roth <michael.roth@xxxxxxx> wrote:
> 
> > On Fri, Mar 17, 2023 at 09:56:11PM -0700, Isaku Yamahata wrote:
> > > On Mon, Feb 20, 2023 at 12:37:53PM -0600,
> > > Michael Roth <michael.roth@xxxxxxx> wrote:
> > >   
> > > > This callback will do any platform-specific handling needed for
> > > > converting pages between shared/private.
> > > > 
> > > > Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
> > > > ---
> > > >  arch/x86/include/asm/kvm-x86-ops.h |  1 +
> > > >  arch/x86/include/asm/kvm_host.h    |  2 ++
> > > >  arch/x86/kvm/mmu/mmu.c             | 13 +++++++++++++
> > > >  include/linux/kvm_host.h           |  4 ++++
> > > >  virt/kvm/kvm_main.c                | 29 +++++++++++++++++++++++++++++
> > > >  5 files changed, 49 insertions(+)
> > > > 
> > > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > > > index 72183da010b8..a8aaf532c2ab 100644
> > > > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > > > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > > > @@ -132,6 +132,7 @@ KVM_X86_OP(complete_emulated_msr)
> > > >  KVM_X86_OP(vcpu_deliver_sipi_vector)
> > > >  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
> > > >  KVM_X86_OP_OPTIONAL_RET0(fault_is_private);
> > > > +KVM_X86_OP_OPTIONAL_RET0(update_mem_attr)
> > > >  
> > > >  #undef KVM_X86_OP
> > > >  #undef KVM_X86_OP_OPTIONAL
> > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > index f856d689dda0..2da3fb2d5d1b 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -1644,6 +1644,8 @@ struct kvm_x86_ops {
> > > >  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
> > > >  			     int root_level);
> > > >  	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code, bool *private_fault);
> > > > +	int (*update_mem_attr)(struct kvm_memory_slot *slot, unsigned int attr,
> > > > +			       gfn_t start, gfn_t end);
> > > >  
> > > >  	bool (*has_wbinvd_exit)(void);
> > > >  
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index fb3f34b7391c..053bd77bbf52 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -7251,4 +7251,17 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > > >  		linfo_update_mixed(gfn, slot, level, mixed);
> > > >  	}
> > > >  }
> > > > +
> > > > +void kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> > > > +					 struct kvm_memory_slot *slot,
> > > > +					 unsigned long attrs,
> > > > +					 gfn_t start, gfn_t end)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	ret = static_call(kvm_x86_update_mem_attr)(slot, attrs, start, end);
> > > > +	if (ret)
> > > > +		pr_warn_ratelimited("Failed to update GFN range 0x%llx-0x%llx with attributes 0x%lx. Ret: %d\n",
> > > > +				    start, end, attrs, ret);
> > > > +}
> > > >  #endif
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index fdc59479b3e2..d200b8f45583 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -2330,6 +2330,10 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > > >  				    struct kvm_memory_slot *slot,
> > > >  				    unsigned long attrs,
> > > >  				    gfn_t start, gfn_t end);
> > > > +void kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> > > > +					 struct kvm_memory_slot *slot,
> > > > +					 unsigned long attrs,
> > > > +					 gfn_t start, gfn_t end);
> > > >  
> > > >  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > > >  {
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index b68574ff6c30..8ec985f1c57d 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -2561,6 +2561,32 @@ static void kvm_mem_attrs_changed(struct kvm *kvm, unsigned long attrs,
> > > >  		kvm_flush_remote_tlbs(kvm);
> > > >  }
> > > >  
> > > > +static void kvm_post_mem_attrs_changed(struct kvm *kvm, unsigned long attrs,
> > > > +				       gfn_t start_orig, gfn_t end_orig)
> > > > +{
> > > > +	struct kvm_memory_slot *slot;
> > > > +	struct kvm_memslots *slots;
> > > > +	struct kvm_memslot_iter iter;
> > > > +	int i;
> > > > +
> > > > +	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
> > > > +		slots = __kvm_memslots(kvm, i);
> > > > +
> > > > +		kvm_for_each_memslot_in_gfn_range(&iter, slots, start_orig, end_orig) {
> > > > +			gfn_t start, end;
> > > > +
> > > > +			slot = iter.slot;
> > > > +			start = max(start_orig, slot->base_gfn);
> > > > +			end = min(end_orig, slot->base_gfn + slot->npages);
> > > > +
> > > > +			if (start >= end)
> > > > +				continue;
> > > > +
> > > > +			kvm_arch_post_set_memory_attributes(kvm, slot, attrs, start, end);
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > >  static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > >  					   struct kvm_memory_attributes *attrs)
> > > >  {
> > > > @@ -2602,6 +2628,9 @@ static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
> > > >  	kvm_mmu_invalidate_end(kvm);
> > > >  	KVM_MMU_UNLOCK(kvm);
> > > >  
> > > > +	if (i > start)
> > > > +		kvm_post_mem_attrs_changed(kvm, attrs->attributes, start, i);
> > > > +  
> > > 
> > > Doesn't kvm_arch_set_memory_attributes() work for you? i.e the following patch.
> > > The error check and pr_warn_ratelimited() can be pushed down into the callback.  
> > 
> > This is originally how I had but when CONFIG_PREEMPT_COUNT is set this
> > will generate warnings for this callback as well as the invalidation
> > callback as reported in v7 here:
> > 
> >   https://lore.kernel.org/lkml/Y80vhKwQyw8hS%2F22@notebook/
> > 
> > The main issue is that kvm_mem_attrs_changed() is called while holding
> > the KVM MMU lock, which disables preemption. But when updating
> > attributes for SNP, we also need to remove private pages from kernel
> > directmap, which involves acquiring a mutex which results in
> > "BUG: scheduling while atomic" warnings.
> > 
> > So that's why we ended up somewhat duplicating some of the logic and
> > using a separate callback chain that happens out of KVM MMU lock.
> 
> Let's split the things of changing memory attributes:
> 
> 1) Update the memory attributes in the xa array (Both TDX and SNP)
> 2) Zapping the EPT/NPT mappings (Required by TDX)
> 3) Update RMP table (Required by SNP)
> 4) Update the directmap of kernel (SNP, but I guess TDX needs it as well)

I'm not so sure TDX requires this. I was under that impression, but
Kirill raised some doubts about this and I'm not sure it's been
confirmed. If it's purely an SNP thing then there may not be much value
in creating a separate callback for it:

  https://lore.kernel.org/linux-mm/20221031141426.GA3994099@xxxxxxxxxxxxxxxxxx/T/#meba4ce80709cd3afd3818b61e6419fd800287b9e

And for SNP, the current code does the unmapping/RMP update in the same
function:

  [PATCH RFC v8 15/56] x86/sev: Invalidate pages from the direct map when adding them to the RMP table

I'm not against splitting RMP/directmap handling, but just want to
understand what the requirements are around that a bit better.

Does handling the #3 / RMP update / kvm_arch_post_set_memory_attributes
stuff outside of MMU lock cause issues on TDX side? What sort of
handling is needed in these callbacks for TDX (if anything)?

> 
> Does SNP really need to zap the NPT mappings when changing the memory
> attributes? (The new mappings will be created later in the fault). I don't
> find this requirement from APM.

I don't think we've added anything specifically for SNP. Do you mean the
generic kvm_unmap_gfn_range/kvm_flush_remote_tlbs sequence below?

  kvm_vm_ioctl_set_mem_attributes():
    KVM_MMU_LOCK(kvm)
    kvm_mmu_invalidate_begin()
    ...
    KVM_MMU_UNLOCK(kvm)

    kvm_vm_set_region_attr()  // xarray/attribute update

    ...
    KVM_MMU_LOCK(kvm)
    kvm_mem_attrs_changed():
      flush |= kvm_unmap_gfn_range()
      if (flush)
        kvm_flush_remote_tlbs()
    KVM_MMU_UNLOCK(kvm)

In general, when the RMPUPDATE instruction happens, the TLB entries for
the GPAs being modified will be flushed, so a subsequent nested page fault
should be able to obtain the updated mapping based on xarray/#NPF at that
point. In that respect *maybe* we don't need to zap the entries there.

But if the nested page fault occurs before the RMPUPDATE, I think we would
have a race if the above sequence isn't in place to handle the unmap/flush,
since in that case we might get a stale mapping because nothing would've
forced a tlbflush.

There's also stuff like the UPM selftests and SEV lazy-pinning where I
think that kvm_unmap_gfn_range() sequence is also needed. But I might be
misunderstanding the question here.

> If yes, can we postpone the update of the RMP table in the later fault,
> like TDX? So that we can save this update_mem_attr x86 ops as things
> will be solved in the SNP-specific fault handler.

Hmm, I think this would be possible. But it's nice to be able to handle
the RMPUPDATE as part of KVM_SET_MEMORY_ATTRIBUTES, since it allows
KVM MMU code to rely solely on xarray state and not have to query RMP
table to check if a particular PFN needs an RMPUPDATE before mapping it
into RMP table.

At least... it would *in theory*, if the RMPUPDATE happened under
protection of mmu_invalidate_seq (in which case it could inherit all the
same protections KVM MMU has around mmu_invalidate_seq/fault->mmu_seq,
e.g. letting the guest retry the #PF if fault->mmu_seq is stale).

But currently, RMPUPDATE (via kvm_arch_post_set_memory_attributes) happens
*after* the invalidation sequence above, so in theory a guest could fault
on a page just after xarray state is updated, but before the RMPUPDATE has
been done, in which case the KVM MMU code would properly map the page
accordingly to xarray, but since RMPUPDATE wouldn't have happened yet, the
state of the corresponding PFN in RMP table won't match the shared/private
access type expected by the guest, so when it tries to access it it will
get another #NPF with RMP bit set in the error code, which will get
handled as a no-op in handle_rmp_page_fault() (patch #44) and loop like
this until the RMPUPDATE is finally done. So it still works out, but
maybe isn't kept as much in sync with xarray state as it could be.

But deferring RMPUPDATE to fault time goes in the other direction of
that. Are there benefits/requirements for doing things this way for TDX?
I could see it being beneficial in terms of reducing overhead for
unneeded page-state transitions, since they are only done on-demand, but
it doesn't seem like it would be that much overhead compared to some of the
other operations being done.

> 
> If no, guess we need a x86 ops to tell if a zapping is required.

Sorry don't think I quite understand the suggestion. What would this
zapping be covering vs. the invalidation sequence that currently happens
in kvm_vm_ioctl_set_mem_attributes()?

> 
> Back to the lock, updating RMP table doesn't require a mutex. Taking
> the lock is required when updating the directmap. both TDX/SNP requires
> this update the directmap when changing memory attributes.

Is that confirmed? If so, do you have a pointer to the associated
documentation? I'm a bit unclear on this due to above-mentioned
discussion.

> 
> Wouldn't it better to factor the touching directmap of kernel part out?

It actually needs to happen before the RMPUPDATE. As soon as there is a
shared->private conversion in the RMP table for a particular PFN, then
any access via directmap by any particular kernel thread to any PFN that
happens to be in the same physical 2M range can cause an RMP fault on
the host, which would be fatal. So the rmpupdate() helper in this series
will unmap the directmap entry corresponding to the PFN before a
shared->private RMPUPDATE, and restore mappings after a private->shared
RMPUPDATE.

So we could still factor it out, but it would be something like:

  if (attr == private)
    kvm_unmap_directmap(start, end)
  kvm_mem_attrs_changed()
  if (attr == shared)
    kvm_map_directmap(start, end)

> 
> Then you can call the x86 ops.update_mem_attr() in kvm_mem_attrs_changed().
> And update the direct kernel mapping for both TDX/SNP in the
> kvm_post_mem_attrs_changed().

Or, adjusting for the above logic, move the unmapping/mapping to a new
kvm_pre_mem_attrs_changed() and kvm_post_mem_attrs_changed(), respectively.

Which seems pretty reasonable to me. Then we can:
 - drop duplicating the kvm_for_each_memslot_in_gfn_range() walk stuff because
   we'd just need to know what PFNs to map/unmap from directmap
   (although we'd still need a loop around kvm_restrictedmem_get_pfn()
   for the GFN range so not necessarily prettier)
 - call the RMPUPDATE / corresponding TDX handling via kvm_mem_attrs_changed()
   which brings it both under KVM MMU lock and also lets it piggyback
   off the fault->mmu_seq handling so it doesn't get out of sync with
   xarray during fault time.

But it would be good to hear others' opinions on this. And also confirm
whether TDX needs that pre/post directmap handling or not.

Thanks!

-Mike

> 
> > 
> > -Mike
> > 
> > > 
> > > From 7c618c1f3c236c382e64680efcbe7d8a672aa870 Mon Sep 17 00:00:00 2001
> > > Message-Id: <7c618c1f3c236c382e64680efcbe7d8a672aa870.1679114841.git.isaku.yamahata@xxxxxxxxx>
> > > In-Reply-To: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@xxxxxxxxx>
> > > References: <428a676face7a06a90e59dca1c32941c9b6ee001.1679114841.git.isaku.yamahata@xxxxxxxxx>
> > > From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > > Date: Fri, 17 Mar 2023 12:00:09 -0700
> > > Subject: [PATCH 4/4] KVM: x86: Add 'set_mem_attr' x86 op
> > > 
> > > This callback will do any platform-specific handling needed for
> > > converting pages between shared/private.
> > > 
> > > Originally-by: Michael Roth <michael.roth@xxxxxxx>
> > > Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> > > ---
> > >  arch/x86/include/asm/kvm-x86-ops.h | 1 +
> > >  arch/x86/include/asm/kvm_host.h    | 2 ++
> > >  arch/x86/kvm/mmu/mmu.c             | 1 +
> > >  3 files changed, 4 insertions(+)
> > > 
> > > diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
> > > index dc5f18ac0bd5..956db2ee25a5 100644
> > > --- a/arch/x86/include/asm/kvm-x86-ops.h
> > > +++ b/arch/x86/include/asm/kvm-x86-ops.h
> > > @@ -100,6 +100,7 @@ KVM_X86_OP_OPTIONAL_RET0(set_identity_map_addr)
> > >  KVM_X86_OP_OPTIONAL_RET0(get_mt_mask)
> > >  KVM_X86_OP(load_mmu_pgd)
> > >  KVM_X86_OP(fault_is_private)
> > > +KVM_X86_OP_OPTIONAL(set_mem_attr)
> > >  KVM_X86_OP_OPTIONAL(link_private_spt)
> > >  KVM_X86_OP_OPTIONAL(free_private_spt)
> > >  KVM_X86_OP_OPTIONAL(split_private_spt)
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 0382d236fbf4..88e11dd3afde 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1731,6 +1731,8 @@ struct kvm_x86_ops {
> > >  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
> > >  			     int root_level);
> > >  	bool (*fault_is_private)(struct kvm *kvm, gpa_t gpa, u64 error_code);
> > > +	void (*set_mem_attr)(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > +			     unsigned int attr, gfn_t start, gfn_t end);
> > >  
> > >  	int (*link_private_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
> > >  				void *private_spt);
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 0ec94c72895c..329333486e64 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -7908,6 +7908,7 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
> > >  				    gfn_t start, gfn_t end)
> > >  {
> > >  	kvm_update_lpage_mixed_flag(kvm, slot, true, attrs, start, end);
> > > +	static_call(kvm_x86_set_mem_attr)(kvm, slot, attrs, start, end);
> > >  }
> > >  
> > >  void kvm_memory_attributes_create_memslot(struct kvm *kvm,
> > > -- 
> > > 2.25.1
> > > 
> > > -- 
> > > Isaku Yamahata <isaku.yamahata@xxxxxxxxx>  
> 
>