On 5/17/21 10:07 PM, Claudio Imbrenda wrote:
> Until now, destroying a protected guest was an entirely synchronous
> operation that could potentially take a very long time, depending on
> the size of the guest, due to the time needed to clean up the address
> space from protected pages.
>
> This patch implements a lazy destroy mechanism, that allows a protected
> guest to reboot significantly faster than previously.
>
> This is achieved by clearing the pages of the old guest in background.
> In case of reboot, the new guest will be able to run in the same
> address space almost immediately.
>
> The old protected guest is then only destroyed when all of its memory has
> been destroyed or otherwise made non protected.

LGTM, some comments below.

>
> Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> ---
>  arch/s390/kvm/kvm-s390.c |   6 +-
>  arch/s390/kvm/kvm-s390.h |   2 +-
>  arch/s390/kvm/pv.c       | 118 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 120 insertions(+), 6 deletions(-)
>
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 2f09e9d7dc95..db25aa1fb6a6 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -2248,7 +2248,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>
>  		r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
>  		if (r)
> -			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +			kvm_s390_pv_deinit_vm_deferred(kvm, &dummy, &dummy);
>
>  		/* we need to block service interrupts from now on */
>  		set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2267,7 +2267,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>  		 */
>  		if (r)
>  			break;
> -		r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
> +		r = kvm_s390_pv_deinit_vm_deferred(kvm, &cmd->rc, &cmd->rrc);
>
>  		/* no need to block service interrupts any more */
>  		clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2796,7 +2796,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  	 * complaining we do not use kvm_s390_pv_is_protected.
>  	 */
>  	if (kvm_s390_pv_get_handle(kvm))
> -		kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
> +		kvm_s390_pv_deinit_vm_deferred(kvm, &rc, &rrc);
>  	debug_unregister(kvm->arch.dbf);
>  	free_page((unsigned long)kvm->arch.sie_page2);
>  	if (!kvm_is_ucontrol(kvm))
> diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
> index 79dcd647b378..b3c0796a3cc1 100644
> --- a/arch/s390/kvm/kvm-s390.h
> +++ b/arch/s390/kvm/kvm-s390.h
> @@ -211,7 +211,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
>  /* implemented in pv.c */
>  int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
>  			      u16 *rrc);
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 59039b8a7be7..9a3547966e18 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -14,8 +14,17 @@
>  #include <asm/mman.h>
>  #include <linux/pagewalk.h>
>  #include <linux/sched/mm.h>
> +#include <linux/kthread.h>
>  #include "kvm-s390.h"
>
> +struct deferred_priv {
> +	struct mm_struct *mm;
> +	unsigned long old_table;
> +	u64 handle;
> +	void *virt;
> +	unsigned long real;
> +};
> +
>  int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
>  {
>  	int cc = 0;
> @@ -202,7 +211,7 @@ static int kvm_s390_pv_replace_asce(struct kvm *kvm)
>  }
>
>  /* this should not fail, but if it does, we must not free the donated memory */
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +static int kvm_s390_pv_deinit_vm_now(struct kvm *kvm, u16 *rc, u16 *rrc)
>  {
>  	int cc;
> @@ -230,6 +239,111 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  	return cc ? -EIO : 0;
>  }
>
> +static int kvm_s390_pv_destroy_vm_thread(void *priv)
> +{
> +	struct deferred_priv *p = priv;
> +	u16 rc, rrc;
> +	int r = 1;
> +
> +	/* Exit early if we end up being the only users of the mm */
> +	s390_uv_destroy_range(p->mm, 1, 0, TASK_SIZE_MAX);
> +	mmput(p->mm);

Where do we exit early?

> +
> +	r = uv_cmd_nodata(p->handle, UVC_CMD_DESTROY_SEC_CONF, &rc, &rrc);
> +	WARN_ONCE(r, "protvirt destroy vm failed rc %x rrc %x", rc, rrc);
> +	if (r)
> +		return r;
> +	atomic_dec(&p->mm->context.is_protected);
> +
> +	/*
> +	 * Intentional leak in case the destroy secure VM call fails. The
> +	 * call should never fail if the hardware is not broken.
> +	 */
> +	free_pages(p->real, get_order(uv_info.guest_base_stor_len));
> +	free_pages(p->old_table, CRST_ALLOC_ORDER);
> +	vfree(p->virt);
> +	kfree(p);
> +	return 0;
> +}
> +
> +static int deferred_destroy(struct kvm *kvm, struct deferred_priv *priv, u16 *rc, u16 *rrc)
> +{
> +	struct task_struct *t;
> +
> +	priv->virt = kvm->arch.pv.stor_var;
> +	priv->real = kvm->arch.pv.stor_base;

Now I know what the struct members actually mean... Is there a reason
you didn't like config_stor_* as a name or something similar?
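Something along these lines, for example (just a sketch, the names are
only a suggestion), would make the handoff self-documenting:

struct deferred_priv {
	struct mm_struct *mm;
	unsigned long old_table;
	u64 handle;
	void *config_stor_var;		/* kvm->arch.pv.stor_var */
	unsigned long config_stor_base;	/* kvm->arch.pv.stor_base */
};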
> +	priv->handle = kvm_s390_pv_get_handle(kvm);
> +	priv->old_table = (unsigned long)kvm->arch.gmap->table;
> +	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
> +
> +	if (kvm_s390_pv_replace_asce(kvm))
> +		goto fail;
> +
> +	t = kthread_create(kvm_s390_pv_destroy_vm_thread, priv,
> +			   "kvm_s390_pv_destroy_vm_thread");
> +	if (IS_ERR_OR_NULL(t))
> +		goto fail;
> +
> +	memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM DEFERRED %d", t->pid);
> +	wake_up_process(t);
> +	/*
> +	 * no actual UVC is performed at this point, just return a successful
> +	 * rc value to make userspace happy, and an arbitrary rrc
> +	 */
> +	*rc = 1;
> +	*rrc = 42;
> +
> +	return 0;
> +
> +fail:
> +	kfree(priv);
> +	return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +}
> +
> +/* Clear the first 2GB of guest memory, to avoid prefix issues after reboot */
> +static void kvm_s390_clear_2g(struct kvm *kvm)
> +{
> +	struct kvm_memory_slot *slot;
> +	struct kvm_memslots *slots;
> +	unsigned long lim;
> +	int idx;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +
> +	slots = kvm_memslots(kvm);
> +	kvm_for_each_memslot(slot, slots) {
> +		if (slot->base_gfn >= (SZ_2G / PAGE_SIZE))
> +			continue;
> +		if (slot->base_gfn + slot->npages > (SZ_2G / PAGE_SIZE))
> +			lim = slot->userspace_addr + SZ_2G - slot->base_gfn * PAGE_SIZE;
> +		else
> +			lim = slot->userspace_addr + slot->npages * PAGE_SIZE;
> +		s390_uv_destroy_range(kvm->mm, 1, slot->userspace_addr, lim);
> +	}
> +
> +	srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	struct deferred_priv *priv;
> +
> +	priv = kmalloc(sizeof(*priv), GFP_KERNEL | __GFP_ZERO);
> +	if (!priv)
> +		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +
> +	if (mmget_not_zero(kvm->mm)) {
> +		kvm_s390_clear_2g(kvm);
> +	} else {
> +		/* No deferred work to do */
> +		kfree(priv);
> +		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +	}
> +	priv->mm = kvm->mm;
> +	return deferred_destroy(kvm, priv, rc, rrc);
> +}
> +
>  int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  {
>  	struct uv_cb_cgc uvcb = {
> @@ -263,7 +377,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  	atomic_inc(&kvm->mm->context.is_protected);
>  	if (cc) {
>  		if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
> -			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +			kvm_s390_pv_deinit_vm_now(kvm, &dummy, &dummy);
>  		} else {
>  			atomic_dec(&kvm->mm->context.is_protected);
>  			kvm_s390_pv_dealloc_vm(kvm);
>
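One more small nit: in kvm_s390_pv_deinit_vm_deferred(), kzalloc() is
the idiomatic spelling of kmalloc() with __GFP_ZERO, i.e.:

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);

(As far as I can see every field of priv is assigned before use anyway,
so plain kmalloc() would also do.)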