Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts

Alex Williamson <alex.williamson@xxxxxxxxxx> · Fri, 12 Jun 2015 08:51:04 -0600

On Fri, 2015-06-12 at 04:54 +0000, Wu, Feng wrote:
> 
> 
> > -----Original Message-----
> > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx]
> > Sent: Friday, June 12, 2015 1:15 AM
> > To: Wu, Feng
> > Cc: kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; pbonzini@xxxxxxxxxx;
> > mtosatti@xxxxxxxxxx; eric.auger@xxxxxxxxxx
> > Subject: Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> > Posted-Interrupts
> > 
> > On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:
> > > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> > > When guests update MSI/MSI-x information for an assigned-device,
> > > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
> > > IRTE for VT-d PI. Userspace program can also use
> > > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping
> > mode.
> > > This patch implements these IRQ attributes.
> > >
> > > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> > > ---
> > >  include/linux/kvm_host.h |  22 +++++++++
> > >  virt/kvm/vfio.c          | 126
> > +++++++++++++++++++++++++++++++++++++++++++++++
> > >  2 files changed, 148 insertions(+)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index f591f7c..69f8711 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;
> > >  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> > >  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> > >
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +/*
> > > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> > > + *
> > > + * @kvm: kvm
> > > + * @host_irq: host irq of the interrupt
> > > + * @guest_irq: gsi of the interrupt
> > > + * @set: set or unset PI
> > > + * returns 0 on success, < 0 on failure
> > > + */
> > > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> > > +				 uint32_t guest_irq, bool set);
> > > +#else
> > > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
> > > +					       unsigned int host_irq,
> > > +					       uint32_t guest_irq,
> > > +					       bool set)
> > > +{
> > > +	return 0;
> > > +}
> > 
> > The code below can't get to this function without
> > __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
> > error if not implemented.
> 
> kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(), if we remove the
> dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() is also
> needed to be included in __KVM_HAVE_ARCH_KVM_VFIO_POST, I will handle this
> in the next version.

But typically we wouldn't return success for a function that's not
implemented.

> > > +#endif
> > > +
> > >  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> > >
> > >  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
> > val)
> > > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> > > index 80a45e4..547fc51 100644
> > > --- a/virt/kvm/vfio.c
> > > +++ b/virt/kvm/vfio.c
> > > @@ -18,6 +18,7 @@
> > >  #include <linux/slab.h>
> > >  #include <linux/uaccess.h>
> > >  #include <linux/vfio.h>
> > > +#include <asm/irq_remapping.h>
> > 
> > This only exists on x86.   
> 
> But in kvm_vfio_has_attr(), we can only return 0 when posted interrupt is supported
> via calling " irq_remapping_cap(IRQ_POSTING_CAP)" which needs this header file.
> Do you think how can I handle this?

Further abstraction, #ifdef...  This file is not x86 specific.

> > Are we also getting lucky with some of the
> > include chains that give us the PCI related defines?  It looks like
> > we're implicitly assuming CONFIG_PCI
> 
> Yes, I think the PCI related header files are included implicitly here. Anyway
> I can add "#include <linux/pci.h>" explicitly.

Ok, but that doesn't solve the problem that this file should not assume
CONFIG_PCI.  Posted interrupts functionality may requires PCI, but
neither KVM nor VFIO necessarily do and therefore the kvm-vfio interface
cannot assume it.

> > >  #include "vfio.h"
> > >
> > >  struct kvm_vfio_group {
> > > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device
> > *dev, long attr, u64 arg)
> > >  	return -ENXIO;
> > >  }
> > >
> > > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> > > +{
> > > +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> > > +		u8 pin;
> > > +
> > > +		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > > +		if (pin)
> > > +			return 1;
> > > +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> > > +		return pci_msi_vec_count(pdev);
> > > +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> > > +		return pci_msix_vec_count(pdev);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int kvm_vfio_control_pi(struct kvm_device *kdev,
> > > +			       int32_t __user *argp, bool set)
> > > +{
> > > +	struct kvm_vfio_dev_irq pi_info;
> > > +	uint32_t *gsi;
> > > +	unsigned long minsz;
> > > +	struct vfio_device *vdev;
> > > +	struct msi_desc *entry;
> > > +	struct device *dev;
> > > +	struct pci_dev *pdev;
> > > +	int i, max, ret;
> > > +
> > > +	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> > > +
> > > +	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> > > +		return -EFAULT;
> > > +
> > > +	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> > > +		return -EINVAL;
> > 
> > Could we also abort on pi_info.count == 0?
> 
> Yes, that is a good point.
> 
> > 
> > > +
> > > +	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> > > +	if (IS_ERR(vdev))
> > > +		return PTR_ERR(vdev);
> > > +
> > > +	dev = kvm_vfio_external_base_device(vdev);
> > > +	if (!dev || !dev_is_pci(dev)) {
> > > +		ret = -EFAULT;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	pdev = to_pci_dev(dev);
> > > +
> > > +	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> > > +	if (max <= 0) {
> > > +		ret = -EFAULT;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
> > > +	    pi_info.start >= max || pi_info.start + pi_info.count > max) {
> > > +		ret = -EINVAL;
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> > > +			   pi_info.count * sizeof(u32));
> > > +	if (IS_ERR(gsi)) {
> > > +		ret = PTR_ERR(gsi);
> > > +		goto put_vfio_device;
> > > +	}
> > > +
> > > +#ifdef CONFIG_PCI_MSI
> > > +	for (i = 0; i < pi_info.count; i++) {
> > > +		list_for_each_entry(entry, &pdev->msi_list, list) {
> > 
> > Should we be able to get here for INTx?
> 
> We only support PI for MSI/MSIx. I think I can return earlier in this function if
> the index is not VFIO_PCI_MSI_IRQ_INDEX or VFIO_PCI_MSIX_IRQ_INDEX,
> is this okay for you?

Yes, preferably early enough that you don't need to read the pin
register to determine if the device supports INTx.

> > 
> > > +			if (entry->msi_attrib.entry_nr != pi_info.start+i)
> > > +				continue;
> > > +
> > > +			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> > > +							   entry->irq,
> > > +							   gsi[i],
> > > +							   set);
> > > +			if (ret)
> > > +				goto free_gsi;
> > > +		}
> > > +	}
> > > +#endif
> > > +
> > > +	ret = 0;
> > 
> > So if we didn't do anything, return success?  That seems strange.
> > Should we also be doing some unwind on failure?  Thanks,
> > 
> I can't think of what I need to do on failure. Do you have any ideas?

This is a user interface, so I think a user would expect that success
means that all of the parameters were correct and all of the indexes
were configured.  If something generated an error, everything should be
returned to the initial state and an error returned.  That may mean that
we need to reverse the loop and unset things that we set, or set things
that were unset.  Thanks,

Alex

> > > +free_gsi:
> > > +	kfree(gsi);
> > > +
> > > +put_vfio_device:
> > > +	kvm_vfio_put_vfio_device(vdev);
> > > +	return ret;
> > > +}
> > > +
> > > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> > > +{
> > > +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> > > +	int ret;
> > > +
> > > +	switch (attr) {
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +	case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > > +		ret = kvm_vfio_control_pi(kdev, argp, 1);
> > > +		break;
> > > +	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > > +		ret = kvm_vfio_control_pi(kdev, argp, 0);
> > > +		break;
> > > +#endif
> > > +	default:
> > > +		ret = -ENXIO;
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > >  static int kvm_vfio_set_attr(struct kvm_device *dev,
> > >  			     struct kvm_device_attr *attr)
> > >  {
> > >  	switch (attr->group) {
> > >  	case KVM_DEV_VFIO_GROUP:
> > >  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> > > +	case KVM_DEV_VFIO_DEVICE:
> > > +		return kvm_vfio_set_device(dev, attr->attr, attr->addr);
> > >  	}
> > >
> > >  	return -ENXIO;
> > > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device
> > *dev,
> > >  		}
> > >
> > >  		break;
> > > +	case KVM_DEV_VFIO_DEVICE:
> > > +		switch (attr->attr) {
> > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > > +		case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > > +		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > > +			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
> > > +#endif
> > > +		}
> > > +		break;
> > >  	}
> > >
> > >  	return -ENXIO;
> > 
> > 
> 
> NrybXǧv^)޺{.n+hܨ}Ơz&j:+vzZ++zfh~izw?&)ߢf

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html