Re: [PATCH v4 09/36] KVM: s390: protvirt: Add initial vm and cpu lifecycle handling

On 24.02.20 12:40, Christian Borntraeger wrote:
> From: Janosch Frank <frankja@xxxxxxxxxxxxx>
> 
> This contains 3 main changes:
> 1. changes in SIE control block handling for secure guests
> 2. helper functions for create/destroy/unpack secure guests
> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
> machines

side note: I really dislike such patch descriptions (lists!) and
squashing a whole bunch of things into a single patch that could nicely
be split up into separate patches (with much nicer patch descriptions).
E.g., enable/disable alone would be sufficiently complicated to review.

This makes review unnecessarily complicated. But here we are in v4, so
I'll try my best for (hopefully) the second-to-last time ;)

[...]

> +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
> +{
> +	struct kvm_vcpu *vcpu;
> +	bool failed = false;
> +	u16 rc, rrc;
> +	int cc = 0;
> +	int i;
> +
> +	/*
> +	 * we ignore failures and try to destroy as many CPUs as possible.

nit: "We"

> +	 * At the same time we must not free the assigned resources when
> +	 * this fails, as the ultravisor has still access to that memory.
> +	 * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
> +	 * behind.
> +	 * We want to return the first failure rc and rrc though.

nit, ", though".

> +	 */
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		mutex_lock(&vcpu->mutex);
> +		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !failed) {
> +			*rcp = rc;
> +			*rrcp = rrc;
> +			cc = 1;
> +			failed = true;

No need for "failed". Just check cc != 0 instead.
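
IOW, something like this should work (untested):

	kvm_for_each_vcpu(i, vcpu, kvm) {
		mutex_lock(&vcpu->mutex);
		/* Record only the rc/rrc of the first failure. */
		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !cc) {
			*rcp = rc;
			*rrcp = rrc;
			cc = 1;
		}
		mutex_unlock(&vcpu->mutex);
	}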

> +		}
> +		mutex_unlock(&vcpu->mutex);
> +	}
> +	return cc;

This question will repeat a couple of times throughout the patch: do we
want to convert that to a proper error (e.g., EBUSY, EINVAL,
EWHATSOEVER) instead of returning "1" to user space (which looks weird)?
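
E.g., something like this at the end of the function (untested; the
exact error code is just an example):

	/* Report the recorded failure as a proper errno to user space. */
	return cc ? -EIO : 0;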

> +}
> +
> +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	int i, r = 0;
> +	u16 dummy;
> +
> +	struct kvm_vcpu *vcpu;
> +
> +	kvm_for_each_vcpu(i, vcpu, kvm) {
> +		mutex_lock(&vcpu->mutex);
> +		r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
> +		mutex_unlock(&vcpu->mutex);
> +		if (r)
> +			break;
> +	}
> +	if (r)
> +		kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
> +	return r;
> +}

[...]

> @@ -0,0 +1,266 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Hosting Secure Execution virtual machines

Just wondering: "Protected Virtualization" vs. "Secure Execution"?

[...]

> +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
> +{
> +	int cc = 0;
> +
> +	if (kvm_s390_pv_cpu_get_handle(vcpu)) {
> +		cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
> +				   UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
> +
> +		KVM_UV_EVENT(vcpu->kvm, 3,
> +			     "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
> +			     vcpu->vcpu_id, *rc, *rrc);
> +		WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x",
> +			  *rc, *rrc);
> +	}

/* Intended memory leak for something that should never happen. */

> +	if (!cc)
> +		free_pages(vcpu->arch.pv.stor_base,
> +			   get_order(uv_info.guest_cpu_stor_len));

Should we clear arch.pv.handle?

Also, I do wonder if it makes sense to

vcpu->arch.pv.stor_base = 0;

so we really remove any traces and act like the error never happened,
only skipping the freeing. Makes sense? Then we're not stuck with a
half-initialized VCPU state.
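
I.e., a minimal sketch (untested):

	/* Intended memory leak for something that should never happen. */
	if (!cc)
		free_pages(vcpu->arch.pv.stor_base,
			   get_order(uv_info.guest_cpu_stor_len));
	/* Remove all traces either way; only the freeing is skipped. */
	vcpu->arch.pv.stor_base = 0;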


> +	vcpu->arch.sie_block->pv_handle_cpu = 0;
> +	vcpu->arch.sie_block->pv_handle_config = 0;
> +	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
> +	vcpu->arch.sie_block->sdf = 0;
> +	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> +
> +	return cc;

Convert to a proper error?

> +}
> +
> +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
> +{
> +	struct uv_cb_csc uvcb = {
> +		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
> +		.header.len = sizeof(uvcb),
> +	};
> +	int cc;
> +
> +	if (kvm_s390_pv_cpu_get_handle(vcpu))
> +		return -EINVAL;
> +
> +	vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL,
> +						   get_order(uv_info.guest_cpu_stor_len));
> +	if (!vcpu->arch.pv.stor_base)
> +		return -ENOMEM;
> +
> +	/* Input */
> +	uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
> +	uvcb.num = vcpu->arch.sie_block->icpua;
> +	uvcb.state_origin = (u64)vcpu->arch.sie_block;
> +	uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
> +
> +	cc = uv_call(0, (u64)&uvcb);
> +	*rc = uvcb.header.rc;
> +	*rrc = uvcb.header.rrc;
> +	KVM_UV_EVENT(vcpu->kvm, 3,
> +		     "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
> +		     vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
> +		     uvcb.header.rrc);
> +
> +	if (cc) {
> +		u16 dummy;
> +
> +		kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
> +		return -EINVAL;

Ah, here we convert from cc to an actual error :)

> +	}
> +
> +	/* Output */
> +	vcpu->arch.pv.handle = uvcb.cpu_handle;
> +	vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
> +	vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
> +	vcpu->arch.sie_block->sdf = 2;
> +	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
> +	return 0;
> +}
> +
> +/* only free resources when the destroy was successful */

s/destroy/deinit/

> +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
> +{
> +	vfree(kvm->arch.pv.stor_var);
> +	free_pages(kvm->arch.pv.stor_base,
> +		   get_order(uv_info.guest_base_stor_len));
> +	memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
> +}
> +
> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
> +{
> +	unsigned long base = uv_info.guest_base_stor_len;
> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
> +	unsigned long npages = 0, vlen = 0;
> +	struct kvm_memory_slot *memslot;
> +
> +	kvm->arch.pv.stor_var = NULL;
> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
> +	if (!kvm->arch.pv.stor_base)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Calculate current guest storage for allocation of the
> +	 * variable storage, which is based on the length in MB.
> +	 *
> +	 * Slots are sorted by GFN
> +	 */
> +	mutex_lock(&kvm->slots_lock);
> +	memslot = kvm_memslots(kvm)->memslots;
> +	npages = memslot->base_gfn + memslot->npages;

I remember I asked this question already; maybe I missed the reply :(

1. What if we have multiple slots?
2. What is expected to happen if new slots are added (e.g., memory
hotplug in the future)?

Shouldn't you bail out if there is more than one slot and make sure that
no new ones can be added as long as pv is active? (I remember the latter
should be very easy to do from an arch callback.)

> +	mutex_unlock(&kvm->slots_lock);
> +
> +	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
> +
> +	/* Allocate variable storage */
> +	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
> +	vlen += uv_info.guest_virt_base_stor_len;
> +	kvm->arch.pv.stor_var = vzalloc(vlen);
> +	if (!kvm->arch.pv.stor_var)
> +		goto out_err;
> +	return 0;
> +
> +out_err:
> +	kvm_s390_pv_dealloc_vm(kvm);
> +	return -ENOMEM;
> +}
> +
> +/* this should not fail, but if it does we must not free the donated memory */
> +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	int cc;
> +
> +	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
> +			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);

Could convert to

int cc = ...

> +	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
> +	atomic_set(&kvm->mm->context.is_protected, 0);
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
> +	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
> +	if (!cc)
> +		kvm_s390_pv_dealloc_vm(kvm);

Similar to the VCPU path: should we set all pointers to NULL but skip
the freeing? With a similar comment /* Intended memory leak ... */
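
I.e., a minimal sketch (untested):

	if (!cc) {
		kvm_s390_pv_dealloc_vm(kvm);
	} else {
		/* Intended memory leak: the UV might still use the memory. */
		memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
	}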

> +	return cc;

Does it make more sense to translate that to a proper error (EBUSY,
EINVAL, etc.) - if any? Returning e.g. "1" does not make too much sense
IMHO.

> +}
> +
> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> +{
> +	u16 drc, drrc;
> +	int cc, ret;
> +

superfluous empty line.

> +	struct uv_cb_cgc uvcb = {
> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
> +		.header.len = sizeof(uvcb)
> +	};

maybe

int ret = kvm_s390_pv_alloc_vm(kvm);

no strong feelings.

> +
> +	ret = kvm_s390_pv_alloc_vm(kvm);
> +	if (ret)
> +		return ret;
> +
> +	/* Inputs */
> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
> +	uvcb.guest_asce = kvm->arch.gmap->asce;
> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
> +
> +	cc = uv_call(0, (u64)&uvcb);
> +	*rc = uvcb.header.rc;
> +	*rrc = uvcb.header.rrc;
> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
> +
> +	/* Outputs */
> +	kvm->arch.pv.handle = uvcb.guest_handle;
> +
> +	if (cc && (uvcb.header.rc & UVC_RC_NEED_DESTROY)) {

So, in case cc != 0 and UVC_RC_NEED_DESTROY is not set, we would return
an error (!= 0 from this function) and not even try to deinit the VM?

This is honestly confusing stuff.

> +		if (!kvm_s390_pv_deinit_vm(kvm, &drc, &drrc))
> +			kvm_s390_pv_dealloc_vm(kvm);

kvm_s390_pv_deinit_vm() will already call kvm_s390_pv_dealloc_vm().
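
How about something like this for the whole error path (untested)?

	if (cc) {
		if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
			kvm_s390_pv_deinit_vm(kvm, &drc, &drrc);
		else
			kvm_s390_pv_dealloc_vm(kvm);
		return -EINVAL;
	}

kvm_s390_pv_deinit_vm() already calls kvm_s390_pv_dealloc_vm() on
success, and the dealloc-only branch covers the cc != 0 case without
UVC_RC_NEED_DESTROY.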

> +		return -EINVAL;
> +	}
> +	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
> +	atomic_set(&kvm->mm->context.is_protected, 1);
> +	return cc;

Convert to a proper error?


Feel free to send a new version of only this patch on top. I'll try to
review it very fast :)

-- 
Thanks,

David / dhildenb




