RE: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d Posted-Interrupts

"Wu, Feng" <feng.wu@xxxxxxxxx> · Fri, 12 Jun 2015 04:54:56 +0000

> -----Original Message-----
> From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx]
> Sent: Friday, June 12, 2015 1:15 AM
> To: Wu, Feng
> Cc: kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; pbonzini@xxxxxxxxxx;
> mtosatti@xxxxxxxxxx; eric.auger@xxxxxxxxxx
> Subject: Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d
> Posted-Interrupts
> 
> On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:
> > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> > When guests update MSI/MSI-x information for an assigned-device,
> > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
> > IRTE for VT-d PI. Userspace program can also use
> > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping
> mode.
> > This patch implements these IRQ attributes.
> >
> > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> > ---
> >  include/linux/kvm_host.h |  22 +++++++++
> >  virt/kvm/vfio.c          | 126
> +++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 148 insertions(+)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index f591f7c..69f8711 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;
> >  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> >  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
> >
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > +/*
> > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> > + *
> > + * @kvm: kvm
> > + * @host_irq: host irq of the interrupt
> > + * @guest_irq: gsi of the interrupt
> > + * @set: set or unset PI
> > + * returns 0 on success, < 0 on failure
> > + */
> > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> > +				 uint32_t guest_irq, bool set);
> > +#else
> > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
> > +					       unsigned int host_irq,
> > +					       uint32_t guest_irq,
> > +					       bool set)
> > +{
> > +	return 0;
> > +}
> 
> The code below can't get to this function without
> __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
> error if not implemented.

kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(), if we remove the
dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() is also
needed to be included in __KVM_HAVE_ARCH_KVM_VFIO_POST, I will handle this
in the next version.

> 
> > +#endif
> > +
> >  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
> >
> >  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool
> val)
> > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> > index 80a45e4..547fc51 100644
> > --- a/virt/kvm/vfio.c
> > +++ b/virt/kvm/vfio.c
> > @@ -18,6 +18,7 @@
> >  #include <linux/slab.h>
> >  #include <linux/uaccess.h>
> >  #include <linux/vfio.h>
> > +#include <asm/irq_remapping.h>
> 
> This only exists on x86.   

But in kvm_vfio_has_attr(), we can only return 0 when posted interrupt is supported
via calling " irq_remapping_cap(IRQ_POSTING_CAP)" which needs this header file.
Do you think how can I handle this?

> Are we also getting lucky with some of the
> include chains that give us the PCI related defines?  It looks like
> we're implicitly assuming CONFIG_PCI

Yes, I think the PCI related header files are included implicitly here. Anyway
I can add "#include <linux/pci.h>" explicitly.

> >  #include "vfio.h"
> >
> >  struct kvm_vfio_group {
> > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device
> *dev, long attr, u64 arg)
> >  	return -ENXIO;
> >  }
> >
> > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> > +{
> > +	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> > +		u8 pin;
> > +
> > +		pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> > +		if (pin)
> > +			return 1;
> > +	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> > +		return pci_msi_vec_count(pdev);
> > +	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> > +		return pci_msix_vec_count(pdev);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int kvm_vfio_control_pi(struct kvm_device *kdev,
> > +			       int32_t __user *argp, bool set)
> > +{
> > +	struct kvm_vfio_dev_irq pi_info;
> > +	uint32_t *gsi;
> > +	unsigned long minsz;
> > +	struct vfio_device *vdev;
> > +	struct msi_desc *entry;
> > +	struct device *dev;
> > +	struct pci_dev *pdev;
> > +	int i, max, ret;
> > +
> > +	minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> > +
> > +	if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> > +		return -EFAULT;
> > +
> > +	if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> > +		return -EINVAL;
> 
> Could we also abort on pi_info.count == 0?

Yes, that is a good point.

> 
> > +
> > +	vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> > +	if (IS_ERR(vdev))
> > +		return PTR_ERR(vdev);
> > +
> > +	dev = kvm_vfio_external_base_device(vdev);
> > +	if (!dev || !dev_is_pci(dev)) {
> > +		ret = -EFAULT;
> > +		goto put_vfio_device;
> > +	}
> > +
> > +	pdev = to_pci_dev(dev);
> > +
> > +	max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> > +	if (max <= 0) {
> > +		ret = -EFAULT;
> > +		goto put_vfio_device;
> > +	}
> > +
> > +	if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
> > +	    pi_info.start >= max || pi_info.start + pi_info.count > max) {
> > +		ret = -EINVAL;
> > +		goto put_vfio_device;
> > +	}
> > +
> > +	gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> > +			   pi_info.count * sizeof(u32));
> > +	if (IS_ERR(gsi)) {
> > +		ret = PTR_ERR(gsi);
> > +		goto put_vfio_device;
> > +	}
> > +
> > +#ifdef CONFIG_PCI_MSI
> > +	for (i = 0; i < pi_info.count; i++) {
> > +		list_for_each_entry(entry, &pdev->msi_list, list) {
> 
> Should we be able to get here for INTx?

We only support PI for MSI/MSIx. I think I can return earlier in this function if
the index is not VFIO_PCI_MSI_IRQ_INDEX or VFIO_PCI_MSIX_IRQ_INDEX,
is this okay for you?

> 
> > +			if (entry->msi_attrib.entry_nr != pi_info.start+i)
> > +				continue;
> > +
> > +			ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> > +							   entry->irq,
> > +							   gsi[i],
> > +							   set);
> > +			if (ret)
> > +				goto free_gsi;
> > +		}
> > +	}
> > +#endif
> > +
> > +	ret = 0;
> 
> So if we didn't do anything, return success?  That seems strange.
> Should we also be doing some unwind on failure?  Thanks,
> 
I can't think of what I need to do on failure. Do you have any ideas?

Thanks,
Feng

> Alex
> 
> > +
> > +free_gsi:
> > +	kfree(gsi);
> > +
> > +put_vfio_device:
> > +	kvm_vfio_put_vfio_device(vdev);
> > +	return ret;
> > +}
> > +
> > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> > +{
> > +	int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> > +	int ret;
> > +
> > +	switch (attr) {
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > +	case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > +		ret = kvm_vfio_control_pi(kdev, argp, 1);
> > +		break;
> > +	case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > +		ret = kvm_vfio_control_pi(kdev, argp, 0);
> > +		break;
> > +#endif
> > +	default:
> > +		ret = -ENXIO;
> > +	}
> > +	return ret;
> > +}
> > +
> >  static int kvm_vfio_set_attr(struct kvm_device *dev,
> >  			     struct kvm_device_attr *attr)
> >  {
> >  	switch (attr->group) {
> >  	case KVM_DEV_VFIO_GROUP:
> >  		return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> > +	case KVM_DEV_VFIO_DEVICE:
> > +		return kvm_vfio_set_device(dev, attr->attr, attr->addr);
> >  	}
> >
> >  	return -ENXIO;
> > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device
> *dev,
> >  		}
> >
> >  		break;
> > +	case KVM_DEV_VFIO_DEVICE:
> > +		switch (attr->attr) {
> > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> > +		case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> > +		case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> > +			return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
> > +#endif
> > +		}
> > +		break;
> >  	}
> >
> >  	return -ENXIO;
> 
> 

��.n��������+%������w��{.n�����o�^n�r������&��z�ޗ�zf���h���~����������_��+v���)ߣ�