Re: [PATCH v2 2/2] KVM: s390: add floating irq controller

Alexander Graf <agraf@xxxxxxx> · Sat, 5 Oct 2013 01:53:33 +0200

On 06.09.2013, at 14:19, Jens Freimann wrote:

> This patch adds a floating irq controller as a kvm_device.
> It will be necessary for migration of floating interrupts as well
> as for hardening the reset code by allowing user space to explicitly
> remove all pending floating interrupts.
> 
> Signed-off-by: Jens Freimann <jfrei@xxxxxxxxxxxxxxxxxx>
> Reviewed-by: Cornelia Huck <cornelia.huck@xxxxxxxxxx>
> ---
> Documentation/virtual/kvm/devices/s390_flic.txt |  36 +++
> arch/s390/include/asm/kvm_host.h                |   1 +
> arch/s390/include/uapi/asm/kvm.h                |   5 +
> arch/s390/kvm/interrupt.c                       | 304 ++++++++++++++++++++----
> arch/s390/kvm/kvm-s390.c                        |   1 +
> include/linux/kvm_host.h                        |   1 +
> include/uapi/linux/kvm.h                        |   2 +
> virt/kvm/kvm_main.c                             |   5 +
> 8 files changed, 304 insertions(+), 51 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt
> new file mode 100644
> index 0000000..06aef31
> --- /dev/null
> +++ b/Documentation/virtual/kvm/devices/s390_flic.txt
> @@ -0,0 +1,36 @@
> +FLIC (floating interrupt controller)
> +====================================
> +
> +FLIC handles floating (non per-cpu) interrupts, i.e.  I/O, service and some
> +machine check interruptions. All interrupts are stored in a per-vm list of
> +pending interrupts. FLIC performs operations on this list.
> +
> +Only one FLIC instance may be instantiated.
> +
> +FLIC provides support to
> +- add/delete interrupts (KVM_DEV_FLIC_ENQUEUE and _DEQUEUE)
> +- purge all pending floating interrupts (KVM_DEV_FLIC_CLEAR_IRQS)
> +
> +Groups:
> +  KVM_DEV_FLIC_ENQUEUE
> +    Adds one interrupt to the list of pending floating interrupts. Interrupts
> +    are taken from this list for injection into the guest. attr contains
> +    a struct kvm_s390_irq which contains all data relevant for
> +    interrupt injection.
> +    The format of the data structure kvm_s390_irq as it is copied from userspace
> +    is defined in usr/include/linux/kvm.h.
> +    For historic reasons list members are stored in a different data structure, i.e.
> +    we need to copy the relevant data into a struct kvm_s390_interrupt_info
> +    which can then be added to the list.
> +
> +  KVM_DEV_FLIC_DEQUEUE
> +    Takes one element off the pending interrupts list and copies it into userspace.
> +    Dequeued interrupts are not injected into the guest.
> +    attr->addr contains the userspace address of a struct kvm_s390_irq.
> +    List elements are stored in the format of struct kvm_s390_interrupt_info
> +    (arch/s390/include/asm/kvm_host.h) and are copied into a struct kvm_s390_irq
> +    (usr/include/linux/kvm.h)
> +
> +  KVM_DEV_FLIC_CLEAR_IRQS
> +    Simply deletes all elements from the list of currently pending floating interrupts.
> +    No interrupts are injected into the guest.
> diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
> index adb05c5..e1cc166 100644
> --- a/arch/s390/include/asm/kvm_host.h
> +++ b/arch/s390/include/asm/kvm_host.h
> @@ -238,6 +238,7 @@ struct kvm_arch{
> 	struct sca_block *sca;
> 	debug_info_t *dbf;
> 	struct kvm_s390_float_interrupt float_int;
> +	struct kvm_device *flic;
> 	struct gmap *gmap;
> 	int css_support;
> };
> diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
> index d25da59..33d52b8 100644
> --- a/arch/s390/include/uapi/asm/kvm.h
> +++ b/arch/s390/include/uapi/asm/kvm.h
> @@ -16,6 +16,11 @@
> 
> #define __KVM_S390
> 
> +/* Device control API: s390-specific devices */
> +#define KVM_DEV_FLIC_DEQUEUE 1
> +#define KVM_DEV_FLIC_ENQUEUE 2
> +#define KVM_DEV_FLIC_CLEAR_IRQS 3
> +
> /* for KVM_GET_REGS and KVM_SET_REGS */
> struct kvm_regs {
> 	/* general purpose regs for s390 */
> diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
> index 7f35cb3..d6d5e36 100644
> --- a/arch/s390/kvm/interrupt.c
> +++ b/arch/s390/kvm/interrupt.c
> @@ -656,53 +656,86 @@ struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
> 	return inti;
> }
> 
> -int kvm_s390_inject_vm(struct kvm *kvm,
> -		       struct kvm_s390_interrupt *s390int)
> +static void __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)

This really doesn't only inject, it enqueues an interrupt and also injects them then, right?

> {
> 	struct kvm_s390_local_interrupt *li;
> 	struct kvm_s390_float_interrupt *fi;
> -	struct kvm_s390_interrupt_info *inti, *iter;
> +	struct kvm_s390_interrupt_info *iter;
> 	int sigcpu;
> 
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;

You probably want to move this into your device structure eventually.

> +	spin_lock(&fi->lock);
> +	if (!is_ioint(inti->type)) {
> +		list_add_tail(&inti->list, &fi->list);
> +	} else {
> +		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> +
> +		/* Keep I/O interrupts sorted in isc order. */
> +		list_for_each_entry(iter, &fi->list, list) {
> +			if (!is_ioint(iter->type))
> +				continue;
> +			if (int_word_to_isc_bits(iter->io.io_int_word)
> +					<= isc_bits)
> +				continue;
> +			break;
> +		}
> +		list_add_tail(&inti->list, &iter->list);
> +	}
> +	atomic_set(&fi->active, 1);
> +	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> +	if (sigcpu == KVM_MAX_VCPUS) {
> +		do {
> +			sigcpu = fi->next_rr_cpu++;
> +			if (sigcpu == KVM_MAX_VCPUS)
> +				sigcpu = fi->next_rr_cpu = 0;
> +		} while (fi->local_int[sigcpu] == NULL);
> +	}
> +	li = fi->local_int[sigcpu];
> +	spin_lock_bh(&li->lock);
> +	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> +	if (waitqueue_active(li->wq))
> +		wake_up_interruptible(li->wq);

Does this kick the other vcpu to notify it that an irq is available?

The code is very spaghetti style. There's certainly a good opportunity to split things up and make it more readable here ;). I think splitting it into "enqueue" and "target cpu deliver" parts makes a lot of sense for starters.

Btw, usually the way these interrupt controllers work in KVM is that the CPU local interrupt controller part "fetches" interrupts from the floating interrupt controller. So in here you would only enqueue and maybe kick a potential receiver of the interrupt. Before a vcpu goes back into guest state, it asks the floating interrupt controller whether there's anything pending for it. If so, it delivers it.

That way even if the vcpu you chose here happens to get preempted, you still deliver the interrupt quickly through another vcpu.

> +	spin_unlock_bh(&li->lock);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +int kvm_s390_inject_vm(struct kvm *kvm,
> +		       struct kvm_s390_interrupt *s390int)
> +{
> +	struct kvm_s390_interrupt_info *inti;
> +
> 	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> 	if (!inti)
> 		return -ENOMEM;
> 
> -	switch (s390int->type) {
> +	inti->type = s390int->type;
> +	switch (inti->type) {
> 	case KVM_S390_INT_VIRTIO:
> 		VM_EVENT(kvm, 5, "inject: virtio parm:%x,parm64:%llx",
> 			 s390int->parm, s390int->parm64);
> -		inti->type = s390int->type;
> 		inti->ext.ext_params = s390int->parm;
> 		inti->ext.ext_params2 = s390int->parm64;
> 		break;
> 	case KVM_S390_INT_SERVICE:
> 		VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm);
> -		inti->type = s390int->type;
> 		inti->ext.ext_params = s390int->parm;
> 		break;
> -	case KVM_S390_PROGRAM_INT:
> -	case KVM_S390_SIGP_STOP:
> -	case KVM_S390_INT_EXTERNAL_CALL:
> -	case KVM_S390_INT_EMERGENCY:
> -		kfree(inti);
> -		return -EINVAL;
> 	case KVM_S390_MCHK:
> 		VM_EVENT(kvm, 5, "inject: machine check parm64:%llx",
> 			 s390int->parm64);
> -		inti->type = s390int->type;
> 		inti->mchk.cr14 = s390int->parm; /* upper bits are not used */
> 		inti->mchk.mcic = s390int->parm64;
> 		break;
> 	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> -		if (s390int->type & IOINT_AI_MASK)
> +		if (inti->type & IOINT_AI_MASK)
> 			VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
> 		else
> 			VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
> 				 s390int->type & IOINT_CSSID_MASK,
> 				 s390int->type & IOINT_SSID_MASK,
> 				 s390int->type & IOINT_SCHID_MASK);
> -		inti->type = s390int->type;
> 		inti->io.subchannel_id = s390int->parm >> 16;
> 		inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
> 		inti->io.io_int_parm = s390int->parm64 >> 32;
> @@ -715,42 +748,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
> 	trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
> 				 2);
> 
> -	mutex_lock(&kvm->lock);
> -	fi = &kvm->arch.float_int;
> -	spin_lock(&fi->lock);
> -	if (!is_ioint(inti->type))
> -		list_add_tail(&inti->list, &fi->list);
> -	else {
> -		u64 isc_bits = int_word_to_isc_bits(inti->io.io_int_word);
> -
> -		/* Keep I/O interrupts sorted in isc order. */
> -		list_for_each_entry(iter, &fi->list, list) {
> -			if (!is_ioint(iter->type))
> -				continue;
> -			if (int_word_to_isc_bits(iter->io.io_int_word)
> -			    <= isc_bits)
> -				continue;
> -			break;
> -		}
> -		list_add_tail(&inti->list, &iter->list);
> -	}
> -	atomic_set(&fi->active, 1);
> -	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
> -	if (sigcpu == KVM_MAX_VCPUS) {
> -		do {
> -			sigcpu = fi->next_rr_cpu++;
> -			if (sigcpu == KVM_MAX_VCPUS)
> -				sigcpu = fi->next_rr_cpu = 0;
> -		} while (fi->local_int[sigcpu] == NULL);
> -	}
> -	li = fi->local_int[sigcpu];
> -	spin_lock_bh(&li->lock);
> -	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
> -	if (waitqueue_active(li->wq))
> -		wake_up_interruptible(li->wq);
> -	spin_unlock_bh(&li->lock);
> -	spin_unlock(&fi->lock);
> -	mutex_unlock(&kvm->lock);

... but as it's mostly copy&paste from here I wouldn't mind if you do the cleanup in a follow-up patch.

> +	__inject_vm(kvm, inti);
> 	return 0;
> }
> 
> @@ -838,3 +836,207 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
> 	mutex_unlock(&vcpu->kvm->lock);
> 	return 0;
> }
> +
> +static void clear_floating_interrupts(struct kvm *kvm)

This should eventually only require device scope, no?

> +{
> +	struct kvm_s390_float_interrupt *fi;
> +	struct kvm_s390_interrupt_info	*n, *inti = NULL;
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	list_for_each_entry_safe(inti, n, &fi->list, list) {
> +		list_del(&inti->list);
> +		kfree(inti);
> +	}
> +	atomic_set(&fi->active, 0);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +}
> +
> +static inline int copy_irq_to_user(struct kvm_s390_interrupt_info *inti,
> +				   u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void __user *target;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		source = &inti->ext;
> +		target = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		source = &inti->io;
> +		target = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		source = &inti->mchk;
> +		target = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	r = put_user(inti->type, (u64 __user *) &uptr->type);
> +	if (copy_to_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int dequeue_floating_irq(struct kvm *kvm, __u64 addr)

I don't understand the purpose of this function. Why would you want to dequeue (and obtain all information about) a floating interrupt without checking for its type? In a normal interrupt controller world you would say "lower the line for interrupt line X now".  I'm missing that "interrupt line X" parameter.

Alex

> +{
> +	struct kvm_s390_interrupt_info *inti;
> +	struct kvm_s390_float_interrupt *fi;
> +	int r = 0;
> +
> +
> +	mutex_lock(&kvm->lock);
> +	fi = &kvm->arch.float_int;
> +	spin_lock(&fi->lock);
> +	if (list_empty(&fi->list)) {
> +		mutex_unlock(&kvm->lock);
> +		spin_unlock(&fi->lock);
> +		return -ENODATA;
> +	}
> +	inti = list_first_entry(&fi->list,
> +			struct kvm_s390_interrupt_info, list);
> +	list_del(&inti->list);
> +	spin_unlock(&fi->lock);
> +	mutex_unlock(&kvm->lock);
> +
> +	r = copy_irq_to_user(inti, addr);
> +
> +	kfree(inti);
> +	return r;
> +}
> +
> +static int flic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_DEQUEUE:
> +		r = dequeue_floating_irq(dev->kvm, attr->addr);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static inline int copy_irq_from_user(struct kvm_s390_interrupt_info *inti,
> +				     u64 addr)
> +{
> +	struct kvm_s390_irq __user *uptr = (struct kvm_s390_irq __user *) addr;
> +	void *target = NULL;
> +	void *source;
> +	u64 size;
> +	int r = 0;
> +
> +	if (get_user(inti->type, (u64 __user *)addr))
> +		return -EFAULT;
> +	switch (inti->type) {
> +	case KVM_S390_INT_VIRTIO:
> +	case KVM_S390_INT_SERVICE:
> +		target = (void *) &inti->ext;
> +		source = &uptr->ext;
> +		size = sizeof(inti->ext);
> +		break;
> +	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
> +		target = (void *) &inti->io;
> +		source = &uptr->io;
> +		size = sizeof(inti->io);
> +		break;
> +	case KVM_S390_MCHK:
> +		target = (void *) &inti->mchk;
> +		source = &uptr->mchk;
> +		size = sizeof(inti->mchk);
> +		break;
> +	case KVM_S390_INT_MAX:
> +		goto out;
> +	default:
> +		r = -EINVAL;
> +		return r;
> +	}
> +
> +	if (copy_from_user(target, source, size))
> +		r = -EFAULT;
> +
> +out:
> +	return r;
> +}
> +
> +static int enqueue_floating_irq(struct kvm_device *dev,
> +				 struct kvm_device_attr *attr)
> +{
> +	struct kvm_s390_interrupt_info *inti = NULL;
> +	int r = 0;
> +
> +	inti = kzalloc(sizeof(*inti), GFP_KERNEL);
> +	if (!inti)
> +		return -ENOMEM;
> +
> +	r = copy_irq_from_user(inti, attr->addr);
> +	if (r) {
> +		kfree(inti);
> +		return r;
> +	}
> +	__inject_vm(dev->kvm, inti);
> +
> +	return r;
> +}
> +
> +static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> +	int r = 0;
> +
> +	switch (attr->group) {
> +	case KVM_DEV_FLIC_ENQUEUE:
> +		r = enqueue_floating_irq(dev, attr);
> +		break;
> +	case KVM_DEV_FLIC_CLEAR_IRQS:
> +		r = 0;
> +		clear_floating_interrupts(dev->kvm);
> +		break;
> +	default:
> +		r = -EINVAL;
> +	}
> +
> +	return r;
> +}
> +
> +static int flic_create(struct kvm_device *dev, u32 type)
> +{
> +	if (!dev)
> +		return -EINVAL;
> +	if (dev->kvm->arch.flic)
> +		return -EINVAL;
> +	dev->kvm->arch.flic = dev;
> +	return 0;
> +}
> +
> +static void flic_destroy(struct kvm_device *dev)
> +{
> +	dev->kvm->arch.flic = NULL;
> +}
> +
> +/* s390 floating irq controller (flic) */
> +struct kvm_device_ops kvm_flic_ops = {
> +	.name = "kvm-flic",
> +	.get_attr = flic_get_attr,
> +	.set_attr = flic_set_attr,
> +	.create = flic_create,
> +	.destroy = flic_destroy,
> +};
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index 776dafe..760febf 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -157,6 +157,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> 	case KVM_CAP_ENABLE_CAP:
> 	case KVM_CAP_S390_CSS_SUPPORT:
> 	case KVM_CAP_IOEVENTFD:
> +	case KVM_CAP_DEVICE_CTRL:
> 		r = 1;
> 		break;
> 	case KVM_CAP_NR_VCPUS:
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ca645a0..e89304d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1065,6 +1065,7 @@ struct kvm_device *kvm_device_from_filp(struct file *filp);
> 
> extern struct kvm_device_ops kvm_mpic_ops;
> extern struct kvm_device_ops kvm_xics_ops;
> +extern struct kvm_device_ops kvm_flic_ops;
> 
> #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> 
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index eeb08a1..9c8b457 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -426,6 +426,7 @@ struct kvm_s390_psw {
> 	 ((ai) << 26))
> #define KVM_S390_INT_IO_MIN		0x00000000u
> #define KVM_S390_INT_IO_MAX		0xfffdffffu
> +#define KVM_S390_INT_MAX		0xffffffffu
> 
> 
> struct kvm_s390_interrupt {
> @@ -906,6 +907,7 @@ struct kvm_device_attr {
> #define KVM_DEV_TYPE_FSL_MPIC_20	1
> #define KVM_DEV_TYPE_FSL_MPIC_42	2
> #define KVM_DEV_TYPE_XICS		3
> +#define KVM_DEV_TYPE_FLIC		4
> 
> /*
>  * ioctls for VM fds
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index bf040c4..79a2ecc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2265,6 +2265,11 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
> 		ops = &kvm_xics_ops;
> 		break;
> #endif
> +#ifdef CONFIG_S390
> +	case KVM_DEV_TYPE_FLIC:
> +		ops = &kvm_flic_ops;
> +		break;
> +#endif
> 	default:
> 		return -ENODEV;
> 	}
> -- 
> 1.8.3.4
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html