On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote: > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts. > When guests update MSI/MSI-x information for an assigned-device, > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup > IRTE for VT-d PI. Userspace program can also use > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping mode. > This patch implements these IRQ attributes. > > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx> > --- > include/linux/kvm_host.h | 22 +++++++++ > virt/kvm/vfio.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 148 insertions(+) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index f591f7c..69f8711 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops; > extern struct kvm_device_ops kvm_arm_vgic_v2_ops; > extern struct kvm_device_ops kvm_arm_vgic_v3_ops; > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > +/* > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts > + * > + * @kvm: kvm > + * @host_irq: host irq of the interrupt > + * @guest_irq: gsi of the interrupt > + * @set: set or unset PI > + * returns 0 on success, < 0 on failure > + */ > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq, > + uint32_t guest_irq, bool set); > +#else > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, > + unsigned int host_irq, > + uint32_t guest_irq, > + bool set) > +{ > + return 0; > +} The code below can't get to this function without __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an error if not implemented. > +#endif > + > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT > > static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c > index 80a45e4..547fc51 100644 > --- a/virt/kvm/vfio.c > +++ b/virt/kvm/vfio.c > @@ -18,6 +18,7 @@ > #include <linux/slab.h> > #include <linux/uaccess.h> > #include <linux/vfio.h> > +#include <asm/irq_remapping.h> This only exists on x86. Are we also getting lucky with some of the include chains that give us the PCI related defines? It looks like we're implicitly assuming CONFIG_PCI > #include "vfio.h" > > struct kvm_vfio_group { > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) > return -ENXIO; > } > > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type) > +{ > + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { > + u8 pin; > + > + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); > + if (pin) > + return 1; > + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { > + return pci_msi_vec_count(pdev); > + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { > + return pci_msix_vec_count(pdev); > + } > + > + return 0; > +} > + > +static int kvm_vfio_control_pi(struct kvm_device *kdev, > + int32_t __user *argp, bool set) > +{ > + struct kvm_vfio_dev_irq pi_info; > + uint32_t *gsi; > + unsigned long minsz; > + struct vfio_device *vdev; > + struct msi_desc *entry; > + struct device *dev; > + struct pci_dev *pdev; > + int i, max, ret; > + > + minsz = offsetofend(struct kvm_vfio_dev_irq, count); > + > + if (copy_from_user(&pi_info, (void __user *)argp, minsz)) > + return -EFAULT; > + > + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS) > + return -EINVAL; Could we also abort on pi_info.count == 0? > + > + vdev = kvm_vfio_get_vfio_device(pi_info.fd); > + if (IS_ERR(vdev)) > + return PTR_ERR(vdev); > + > + dev = kvm_vfio_external_base_device(vdev); > + if (!dev || !dev_is_pci(dev)) { > + ret = -EFAULT; > + goto put_vfio_device; > + } > + > + pdev = to_pci_dev(dev); > + > + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index); > + if (max <= 0) { > + ret = -EFAULT; > + goto put_vfio_device; > + } > + > + if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) || > + pi_info.start >= max || pi_info.start + pi_info.count > max) { > + ret = -EINVAL; > + goto put_vfio_device; > + } > + > + gsi = memdup_user((void __user *)((unsigned long)argp + minsz), > + pi_info.count * sizeof(u32)); > + if (IS_ERR(gsi)) { > + ret = PTR_ERR(gsi); > + goto put_vfio_device; > + } > + > +#ifdef CONFIG_PCI_MSI > + for (i = 0; i < pi_info.count; i++) { > + list_for_each_entry(entry, &pdev->msi_list, list) { Should we be able to get here for INTx? > + if (entry->msi_attrib.entry_nr != pi_info.start+i) > + continue; > + > + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm, > + entry->irq, > + gsi[i], > + set); > + if (ret) > + goto free_gsi; > + } > + } > +#endif > + > + ret = 0; So if we didn't do anything, return success? That seems strange. Should we also be doing some unwind on failure? Thanks, Alex > + > +free_gsi: > + kfree(gsi); > + > +put_vfio_device: > + kvm_vfio_put_vfio_device(vdev); > + return ret; > +} > + > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg) > +{ > + int32_t __user *argp = (int32_t __user *)(unsigned long)arg; > + int ret; > + > + switch (attr) { > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > + case KVM_DEV_VFIO_DEVICE_POST_IRQ: > + ret = kvm_vfio_control_pi(kdev, argp, 1); > + break; > + case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ: > + ret = kvm_vfio_control_pi(kdev, argp, 0); > + break; > +#endif > + default: > + ret = -ENXIO; > + } > + return ret; > +} > + > static int kvm_vfio_set_attr(struct kvm_device *dev, > struct kvm_device_attr *attr) > { > switch (attr->group) { > case KVM_DEV_VFIO_GROUP: > return kvm_vfio_set_group(dev, attr->attr, attr->addr); > + case KVM_DEV_VFIO_DEVICE: > + return kvm_vfio_set_device(dev, attr->attr, attr->addr); > } > > return -ENXIO; > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, > } > > break; > + case KVM_DEV_VFIO_DEVICE: > + switch (attr->attr) { > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > + case KVM_DEV_VFIO_DEVICE_POST_IRQ: > + case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ: > + return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO; > +#endif > + } > + break; > } > > return -ENXIO; -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html