On 5/17/21 10:07 PM, Claudio Imbrenda wrote:
> Until now, destroying a protected guest was an entirely synchronous
> operation that could potentially take a very long time, depending on
> the size of the guest, due to the time needed to clean up the address
> space from protected pages.
>
> This patch implements a lazy destroy mechanism, that allows a protected
> guest to reboot significantly faster than previously.
>
> This is achieved by clearing the pages of the old guest in background.
> In case of reboot, the new guest will be able to run in the same
> address space almost immediately.
>
> The old protected guest is then only destroyed when all of its memory has
> been destroyed or otherwise made non protected.

LGTM, some comments below.

>
> Signed-off-by: Claudio Imbrenda <imbrenda@xxxxxxxxxxxxx>
> ---
>  arch/s390/kvm/kvm-s390.c |   6 +-
>  arch/s390/kvm/kvm-s390.h |   2 +-
>  arch/s390/kvm/pv.c       | 118 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 120 insertions(+), 6 deletions(-)
>
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 2f09e9d7dc95..db25aa1fb6a6 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -2248,7 +2248,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>
>  		r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc);
>  		if (r)
> -			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +			kvm_s390_pv_deinit_vm_deferred(kvm, &dummy, &dummy);
>
>  		/* we need to block service interrupts from now on */
>  		set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2267,7 +2267,7 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
>  		 */
>  		if (r)
>  			break;
> -		r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc);
> +		r = kvm_s390_pv_deinit_vm_deferred(kvm, &cmd->rc, &cmd->rrc);
>
>  		/* no need to block service interrupts any more */
>  		clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs);
> @@ -2796,7 +2796,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  	 * complaining we do not use kvm_s390_pv_is_protected.
>  	 */
>  	if (kvm_s390_pv_get_handle(kvm))
> -		kvm_s390_pv_deinit_vm(kvm, &rc, &rrc);
> +		kvm_s390_pv_deinit_vm_deferred(kvm, &rc, &rrc);
>  	debug_unregister(kvm->arch.dbf);
>  	free_page((unsigned long)kvm->arch.sie_page2);
>  	if (!kvm_is_ucontrol(kvm))
> diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
> index 79dcd647b378..b3c0796a3cc1 100644
> --- a/arch/s390/kvm/kvm-s390.h
> +++ b/arch/s390/kvm/kvm-s390.h
> @@ -211,7 +211,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
>  /* implemented in pv.c */
>  int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc);
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc);
>  int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc,
>  			      u16 *rrc);
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index 59039b8a7be7..9a3547966e18 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -14,8 +14,17 @@
>  #include <asm/mman.h>
>  #include <linux/pagewalk.h>
>  #include <linux/sched/mm.h>
> +#include <linux/kthread.h>
>  #include "kvm-s390.h"
>
> +struct deferred_priv {
> +	struct mm_struct *mm;
> +	unsigned long old_table;
> +	u64 handle;
> +	void *virt;
> +	unsigned long real;
> +};
> +
>  int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
>  {
>  	int cc = 0;
> @@ -202,7 +211,7 @@ static int kvm_s390_pv_replace_asce(struct kvm *kvm)
>  }
>
>  /* this should not fail, but if it does, we must not free the donated memory */
> -int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +static int kvm_s390_pv_deinit_vm_now(struct kvm *kvm, u16 *rc, u16 *rrc)
>  {
>  	int cc;
> @@ -230,6 +239,111 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  	return cc ? -EIO : 0;
>  }
>
> +static int kvm_s390_pv_destroy_vm_thread(void *priv)
> +{
> +	struct deferred_priv *p = priv;
> +	u16 rc, rrc;
> +	int r = 1;
> +
> +	/* Exit early if we end up being the only users of the mm */
> +	s390_uv_destroy_range(p->mm, 1, 0, TASK_SIZE_MAX);
> +	mmput(p->mm);

Where do we exit early?

> +
> +	r = uv_cmd_nodata(p->handle, UVC_CMD_DESTROY_SEC_CONF, &rc, &rrc);
> +	WARN_ONCE(r, "protvirt destroy vm failed rc %x rrc %x", rc, rrc);
> +	if (r)
> +		return r;
> +	atomic_dec(&p->mm->context.is_protected);
> +
> +	/*
> +	 * Intentional leak in case the destroy secure VM call fails. The
> +	 * call should never fail if the hardware is not broken.
> +	 */
> +	free_pages(p->real, get_order(uv_info.guest_base_stor_len));
> +	free_pages(p->old_table, CRST_ALLOC_ORDER);
> +	vfree(p->virt);
> +	kfree(p);
> +	return 0;
> +}
> +
> +static int deferred_destroy(struct kvm *kvm, struct deferred_priv *priv, u16 *rc, u16 *rrc)
> +{
> +	struct task_struct *t;
> +
> +	priv->virt = kvm->arch.pv.stor_var;
> +	priv->real = kvm->arch.pv.stor_base;

Now I know what the struct members actually mean... Is there a reason
you didn't like config_stor_* as a name or something similar?
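Something along these lines, for example (just a sketch, the names are
only a suggestion), would make the handoff self-documenting:

struct deferred_priv {
	struct mm_struct *mm;
	unsigned long old_table;
	u64 handle;
	void *config_stor_var;		/* kvm->arch.pv.stor_var */
	unsigned long config_stor_base;	/* kvm->arch.pv.stor_base */
};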
> +	priv->handle = kvm_s390_pv_get_handle(kvm);
> +	priv->old_table = (unsigned long)kvm->arch.gmap->table;
> +	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
> +
> +	if (kvm_s390_pv_replace_asce(kvm))
> +		goto fail;
> +
> +	t = kthread_create(kvm_s390_pv_destroy_vm_thread, priv,
> +			   "kvm_s390_pv_destroy_vm_thread");
> +	if (IS_ERR_OR_NULL(t))
> +		goto fail;
> +
> +	memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM DEFERRED %d", t->pid);
> +	wake_up_process(t);
> +	/*
> +	 * no actual UVC is performed at this point, just return a successful
> +	 * rc value to make userspace happy, and an arbitrary rrc
> +	 */
> +	*rc = 1;
> +	*rrc = 42;
> +
> +	return 0;
> +
> +fail:
> +	kfree(priv);
> +	return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +}
> +
> +/* Clear the first 2GB of guest memory, to avoid prefix issues after reboot */
> +static void kvm_s390_clear_2g(struct kvm *kvm)
> +{
> +	struct kvm_memory_slot *slot;
> +	struct kvm_memslots *slots;
> +	unsigned long lim;
> +	int idx;
> +
> +	idx = srcu_read_lock(&kvm->srcu);
> +
> +	slots = kvm_memslots(kvm);
> +	kvm_for_each_memslot(slot, slots) {
> +		if (slot->base_gfn >= (SZ_2G / PAGE_SIZE))
> +			continue;
> +		if (slot->base_gfn + slot->npages > (SZ_2G / PAGE_SIZE))
> +			lim = slot->userspace_addr + SZ_2G - slot->base_gfn * PAGE_SIZE;
> +		else
> +			lim = slot->userspace_addr + slot->npages * PAGE_SIZE;
> +		s390_uv_destroy_range(kvm->mm, 1, slot->userspace_addr, lim);
> +	}
> +
> +	srcu_read_unlock(&kvm->srcu, idx);
> +}
> +
> +int kvm_s390_pv_deinit_vm_deferred(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	struct deferred_priv *priv;
> +
> +	priv = kmalloc(sizeof(*priv), GFP_KERNEL | __GFP_ZERO);
> +	if (!priv)
> +		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +
> +	if (mmget_not_zero(kvm->mm)) {
> +		kvm_s390_clear_2g(kvm);
> +	} else {
> +		/* No deferred work to do */
> +		kfree(priv);
> +		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);
> +	}
> +	priv->mm = kvm->mm;
> +	return deferred_destroy(kvm, priv, rc, rrc);
> +}
> +
>  int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  {
>  	struct uv_cb_cgc uvcb = {
> @@ -263,7 +377,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>  	atomic_inc(&kvm->mm->context.is_protected);
>  	if (cc) {
>  		if (uvcb.header.rc & UVC_RC_NEED_DESTROY) {
> -			kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
> +			kvm_s390_pv_deinit_vm_now(kvm, &dummy, &dummy);
>  		} else {
>  			atomic_dec(&kvm->mm->context.is_protected);
>  			kvm_s390_pv_dealloc_vm(kvm);
>
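One more small nit: in kvm_s390_pv_deinit_vm_deferred(), kzalloc() is
the idiomatic spelling of kmalloc() with __GFP_ZERO, i.e.:

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return kvm_s390_pv_deinit_vm_now(kvm, rc, rrc);

(As far as I can see every field of priv is assigned before use anyway,
so plain kmalloc() would also do.)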