Re: [PATCH v4 09/36] KVM: s390: protvirt: Add initial vm and cpu lifecycle handling

Christian Borntraeger <borntraeger@xxxxxxxxxx> · Tue, 25 Feb 2020 22:44:40 +0100

On 25.02.20 18:46, David Hildenbrand wrote:
> On 24.02.20 12:40, Christian Borntraeger wrote:
>> From: Janosch Frank <frankja@xxxxxxxxxxxxx>
>>
>> This contains 3 main changes:
>> 1. changes in SIE control block handling for secure guests
>> 2. helper functions for create/destroy/unpack secure guests
>> 3. KVM_S390_PV_COMMAND ioctl to allow userspace dealing with secure
>> machines
> 
> side note: I really dislike such patch descriptions (lists!) and
> squashing a whole bunch of things that could be nicely split up into
> separat patches (with much nicer patch descriptions) into a single
> patch. E.g., enable/disable would be sufficiently complicated to review.
> 
> This makes review unnecessary complicated. But here we are in v4, so
> I'll try my best for (hopefully) the second last time ;)
> 
> [...]
> 
>> +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp)
>> +{
>> +	struct kvm_vcpu *vcpu;
>> +	bool failed = false;
>> +	u16 rc, rrc;
>> +	int cc = 0;
>> +	int i;
>> +
>> +	/*
>> +	 * we ignore failures and try to destroy as many CPUs as possible.
> 
> nit: "We"

ack

> 
>> +	 * At the same time we must not free the assigned resources when
>> +	 * this fails, as the ultravisor has still access to that memory.
>> +	 * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak
>> +	 * behind.
>> +	 * We want to return the first failure rc and rrc though.
> 
> nit, ", though".

ack
> 
>> +	 */
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		mutex_lock(&vcpu->mutex);
>> +		if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !failed) {
>> +			*rcp = rc;
>> +			*rrcp = rrc;
>> +			cc = 1;
>> +			failed = true;
> 
> no need for "failed". Just check against cc != 0 instead.

ack

> 
>> +		}
>> +		mutex_unlock(&vcpu->mutex);
>> +	}
>> +	return cc;
> 
> The question will repeat a couple of times in the patch: Do we want to
> convert that to a proper error (e.g., EBUSY, EINVAL, EWHATSOEVER)
> instead of returning "1" to user space (whoch looks weird).

Not sure about the right error code. 
-EIO for cc == 1?



> 
>> +}
>> +
>> +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +	int i, r = 0;
>> +	u16 dummy;
>> +
>> +	struct kvm_vcpu *vcpu;
>> +
>> +	kvm_for_each_vcpu(i, vcpu, kvm) {
>> +		mutex_lock(&vcpu->mutex);
>> +		r = kvm_s390_pv_create_cpu(vcpu, rc, rrc);
>> +		mutex_unlock(&vcpu->mutex);
>> +		if (r)
>> +			break;
>> +	}
>> +	if (r)
>> +		kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
>> +	return r;
>> +}
> 
> [...]
> 
>> @@ -0,0 +1,266 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Hosting Secure Execution virtual machines
> 
> Just wondering "Protected Virtualization" vs. "Secure Execution".

No name yet, will use protected virtual machines as an independent term.

> 
> [...]
> 
>> +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
>> +{
>> +	int cc = 0;
>> +
>> +	if (kvm_s390_pv_cpu_get_handle(vcpu)) {
>> +		cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu),
>> +				   UVC_CMD_DESTROY_SEC_CPU, rc, rrc);
>> +
>> +		KVM_UV_EVENT(vcpu->kvm, 3,
>> +			     "PROTVIRT DESTROY VCPU %d: rc %x rrc %x",
>> +			     vcpu->vcpu_id, *rc, *rrc);
>> +		WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x",
>> +			  *rc, *rrc);
>> +	}
> 
> /* Intended memory leak for something that should never happen. */

ack
> 
>> +	if (!cc)
>> +		free_pages(vcpu->arch.pv.stor_base,
>> +			   get_order(uv_info.guest_cpu_stor_len));
> 
> Should we clear arch.pv.handle?

this is done in the memset below
> 
> Also, I do wonder if it makes sense to
> 
> vcpu->arch.pv.stor_base = NULL;

same. We could do 4 single assignments instead, but the memset is probably ok?

> 
> So really remove any traces and act like the error never happened. Only
> skip the freeing. Makes sense? Then we're not stuck with a
> half-initialized VM state.

I think this is what we do with the memset.

> 
> 
>> +	vcpu->arch.sie_block->pv_handle_cpu = 0;
>> +	vcpu->arch.sie_block->pv_handle_config = 0;
>> +	memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv));
>> +	vcpu->arch.sie_block->sdf = 0;
>> +	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>> +
>> +	return cc;
> 
> Convert to a proper error?

-EIO?

> 
>> +}
>> +
>> +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc)
>> +{
>> +	struct uv_cb_csc uvcb = {
>> +		.header.cmd = UVC_CMD_CREATE_SEC_CPU,
>> +		.header.len = sizeof(uvcb),
>> +	};
>> +	int cc;
>> +
>> +	if (kvm_s390_pv_cpu_get_handle(vcpu))
>> +		return -EINVAL;
>> +
>> +	vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL,
>> +						   get_order(uv_info.guest_cpu_stor_len));
>> +	if (!vcpu->arch.pv.stor_base)
>> +		return -ENOMEM;
>> +
>> +	/* Input */
>> +	uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm);
>> +	uvcb.num = vcpu->arch.sie_block->icpua;
>> +	uvcb.state_origin = (u64)vcpu->arch.sie_block;
>> +	uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base;
>> +
>> +	cc = uv_call(0, (u64)&uvcb);
>> +	*rc = uvcb.header.rc;
>> +	*rrc = uvcb.header.rrc;
>> +	KVM_UV_EVENT(vcpu->kvm, 3,
>> +		     "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x",
>> +		     vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc,
>> +		     uvcb.header.rrc);
>> +
>> +	if (cc) {
>> +		u16 dummy;
>> +
>> +		kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy);
>> +		return -EINVAL;
> 
> Ah, here we convert from cc to an actual error :)

also EIO then?
> 
>> +	}
>> +
>> +	/* Output */
>> +	vcpu->arch.pv.handle = uvcb.cpu_handle;
>> +	vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle;
>> +	vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm);
>> +	vcpu->arch.sie_block->sdf = 2;
>> +	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>> +	return 0;
>> +}
>> +
>> +/* only free resources when the destroy was successful */
> 
> s/destroy/deinit/

Not really. deinit is destroy + dealloc. And we only dealloc when destroy was ok.



> 
>> +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm)
>> +{
>> +	vfree(kvm->arch.pv.stor_var);
>> +	free_pages(kvm->arch.pv.stor_base,
>> +		   get_order(uv_info.guest_base_stor_len));
>> +	memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv));
>> +}
>> +
>> +static int kvm_s390_pv_alloc_vm(struct kvm *kvm)
>> +{
>> +	unsigned long base = uv_info.guest_base_stor_len;
>> +	unsigned long virt = uv_info.guest_virt_var_stor_len;
>> +	unsigned long npages = 0, vlen = 0;
>> +	struct kvm_memory_slot *memslot;
>> +
>> +	kvm->arch.pv.stor_var = NULL;
>> +	kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base));
>> +	if (!kvm->arch.pv.stor_base)
>> +		return -ENOMEM;
>> +
>> +	/*
>> +	 * Calculate current guest storage for allocation of the
>> +	 * variable storage, which is based on the length in MB.
>> +	 *
>> +	 * Slots are sorted by GFN
>> +	 */
>> +	mutex_lock(&kvm->slots_lock);
>> +	memslot = kvm_memslots(kvm)->memslots;
>> +	npages = memslot->base_gfn + memslot->npages;
> 
> I remember I asked this question already, maybe I missed the reply :(
> 
> 1. What if we have multiple slots?

memslot 0 is the last one, so this should actually have the last memory address
so this should be ok.

> 2. What is expected to happen if new slots are added (e.g., memory
> hotplug in the future?)
> 
> Shouldn't you bail out if there is more than one slot and make sure that
> no new ones can be added as long as pv is active (I remember the latter
> should be very easy from an arch callback)?

Yes, that should be easy, something like the following I guess

--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -4744,6 +4744,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit)
                return -EINVAL;
 
+       /* When we are protected we should not change the memory slots */
+       if (kvm_s390_pv_is_protected(kvm))
+               return -EINVAL;
        return 0;
 }
 



I think we can extend that later to actually use
the memorysize from kvm->arch.mem_limit as long as this is reasonably small.
This should then be done when we implement memory hotplug.


> 
>> +	mutex_unlock(&kvm->slots_lock);
>> +
>> +	kvm->arch.pv.guest_len = npages * PAGE_SIZE;
>> +
>> +	/* Allocate variable storage */
>> +	vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE);
>> +	vlen += uv_info.guest_virt_base_stor_len;
>> +	kvm->arch.pv.stor_var = vzalloc(vlen);
>> +	if (!kvm->arch.pv.stor_var)
>> +		goto out_err;
>> +	return 0;
>> +
>> +out_err:
>> +	kvm_s390_pv_dealloc_vm(kvm);
>> +	return -ENOMEM;
>> +}
>> +
>> +/* this should not fail, but if it does we must not free the donated memory */
>> +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +	int cc;
>> +
>> +	cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
>> +			   UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
> 
> Could convert to
> 
> int cc = ...
there will be a call to s390_reset_acc in a later patch that sneaks in here.
> 
>> +	WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
>> +	atomic_set(&kvm->mm->context.is_protected, 0);
>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc);
>> +	WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc);
>> +	if (!cc)
>> +		kvm_s390_pv_dealloc_vm(kvm);
> 
> Similar to the VCPU path, should be set all pointers to NULL but skip
> the freeing? With a similar comment /* Inteded memory leak ... */

This is done in kvm_s390_pv_dealloc_vm. And I think it makes sense to keep
the VM thing linked to the KVM struct. This will prevent the user from doing
another PV_ENABLE on this guest.

> 
>> +	return cc;
> 
> Does it make more sense to translate that to a proper error? (EBUSY,
> EINVAL etc.) I'd assume we translate that to a proper error - if any.
> Returning e.g., "1" does not make too much sense IMHO.

-EIO?

> 
>> +}
>> +
>> +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>> +{
>> +	u16 drc, drrc;
>> +	int cc, ret;
>> +
> 
> superfluous empty line.
> 

ack

>> +	struct uv_cb_cgc uvcb = {
>> +		.header.cmd = UVC_CMD_CREATE_SEC_CONF,
>> +		.header.len = sizeof(uvcb)
>> +	};
> 
> maybe
> 
> int ret = kvm_s390_pv_alloc_vm(kvm);
> 
> no strong feelings.
> 
>> +
>> +	ret = kvm_s390_pv_alloc_vm(kvm);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/* Inputs */
>> +	uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
>> +	uvcb.guest_stor_len = kvm->arch.pv.guest_len;
>> +	uvcb.guest_asce = kvm->arch.gmap->asce;
>> +	uvcb.guest_sca = (unsigned long)kvm->arch.sca;
>> +	uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base;
>> +	uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var;
>> +
>> +	cc = uv_call(0, (u64)&uvcb);
>> +	*rc = uvcb.header.rc;
>> +	*rrc = uvcb.header.rrc;
>> +	KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x",
>> +		     uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc);
>> +



>> +	/* Outputs */
>> +	kvm->arch.pv.handle = uvcb.guest_handle;
>> +
>> +	if (cc && (uvcb.header.rc & UVC_RC_NEED_DESTROY)) {
> 
> So, in case cc!=0 and UVC_RC_NEED_DESTROY is not set, we would return an
> error (!=0 from this function) and not even try to deinit the vm?
> 
> This is honestly confusing stuff.
> 
>> +		if (!kvm_s390_pv_deinit_vm(kvm, &drc, &drrc))
>> +			kvm_s390_pv_dealloc_vm(kvm);
> 
> kvm_s390_pv_deinit_vm() will already call kvm_s390_pv_dealloc_vm().

right. Will do 

        if (cc) {
                if (uvcb.header.rc & UVC_RC_NEED_DESTROY)
                        kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy);
                else
                        kvm_s390_pv_dealloc_vm(kvm);
                return -EIO;
        }



> 
>> +		return -EINVAL;
>> +	}
>> +	kvm->arch.gmap->guest_handle = uvcb.guest_handle;
>> +	atomic_set(&kvm->mm->context.is_protected, 1);
>> +	return cc;
> 
> Convert to a proper error?

-EIO (I think I will keep -EINVAL for the mpstate ioctl).
> 
> 
> Feel free to send a new version of this patch only on top. I'll try to
> review it very fast :)
>