> -----Original Message----- > From: Alex Williamson [mailto:alex.williamson@xxxxxxxxxx] > Sent: Friday, June 12, 2015 1:15 AM > To: Wu, Feng > Cc: kvm@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; pbonzini@xxxxxxxxxx; > mtosatti@xxxxxxxxxx; eric.auger@xxxxxxxxxx > Subject: Re: [v4 12/16] KVM: kvm-vfio: implement the VFIO skeleton for VT-d > Posted-Interrupts > > On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote: > > This patch adds the kvm-vfio interface for VT-d Posted-Interrupts. > > When guests update MSI/MSI-x information for an assigned-device, > > QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup > > IRTE for VT-d PI. Userspace program can also use > > KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping > mode. > > This patch implements these IRQ attributes. > > > > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx> > > --- > > include/linux/kvm_host.h | 22 +++++++++ > > virt/kvm/vfio.c | 126 > +++++++++++++++++++++++++++++++++++++++++++++++ > > 2 files changed, 148 insertions(+) > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index f591f7c..69f8711 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops; > > extern struct kvm_device_ops kvm_arm_vgic_v2_ops; > > extern struct kvm_device_ops kvm_arm_vgic_v3_ops; > > > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > > +/* > > + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts > > + * > > + * @kvm: kvm > > + * @host_irq: host irq of the interrupt > > + * @guest_irq: gsi of the interrupt > > + * @set: set or unset PI > > + * returns 0 on success, < 0 on failure > > + */ > > +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq, > > + uint32_t guest_irq, bool set); > > +#else > > +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, > > + unsigned int host_irq, > > + uint32_t guest_irq, > > + bool set) > > +{ > > + return 0; > > +} > > The code below can't get to this function without > __KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an > error if not implemented. kvm_arch_vfio_update_pi_irte() is called by kvm_vfio_control_pi(), if we remove the dummy definition of kvm_arch_vfio_update_pi_irte(), kvm_vfio_control_pi() is also needed to be included in __KVM_HAVE_ARCH_KVM_VFIO_POST, I will handle this in the next version. > > > +#endif > > + > > #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT > > > > static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool > val) > > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c > > index 80a45e4..547fc51 100644 > > --- a/virt/kvm/vfio.c > > +++ b/virt/kvm/vfio.c > > @@ -18,6 +18,7 @@ > > #include <linux/slab.h> > > #include <linux/uaccess.h> > > #include <linux/vfio.h> > > +#include <asm/irq_remapping.h> > > This only exists on x86. But in kvm_vfio_has_attr(), we can only return 0 when posted interrupt is supported via calling " irq_remapping_cap(IRQ_POSTING_CAP)" which needs this header file. Do you think how can I handle this? > Are we also getting lucky with some of the > include chains that give us the PCI related defines? It looks like > we're implicitly assuming CONFIG_PCI Yes, I think the PCI related header files are included implicitly here. Anyway I can add "#include <linux/pci.h>" explicitly. > > #include "vfio.h" > > > > struct kvm_vfio_group { > > @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device > *dev, long attr, u64 arg) > > return -ENXIO; > > } > > > > +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type) > > +{ > > + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { > > + u8 pin; > > + > > + pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin); > > + if (pin) > > + return 1; > > + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { > > + return pci_msi_vec_count(pdev); > > + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { > > + return pci_msix_vec_count(pdev); > > + } > > + > > + return 0; > > +} > > + > > +static int kvm_vfio_control_pi(struct kvm_device *kdev, > > + int32_t __user *argp, bool set) > > +{ > > + struct kvm_vfio_dev_irq pi_info; > > + uint32_t *gsi; > > + unsigned long minsz; > > + struct vfio_device *vdev; > > + struct msi_desc *entry; > > + struct device *dev; > > + struct pci_dev *pdev; > > + int i, max, ret; > > + > > + minsz = offsetofend(struct kvm_vfio_dev_irq, count); > > + > > + if (copy_from_user(&pi_info, (void __user *)argp, minsz)) > > + return -EFAULT; > > + > > + if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS) > > + return -EINVAL; > > Could we also abort on pi_info.count == 0? Yes, that is a good point. > > > + > > + vdev = kvm_vfio_get_vfio_device(pi_info.fd); > > + if (IS_ERR(vdev)) > > + return PTR_ERR(vdev); > > + > > + dev = kvm_vfio_external_base_device(vdev); > > + if (!dev || !dev_is_pci(dev)) { > > + ret = -EFAULT; > > + goto put_vfio_device; > > + } > > + > > + pdev = to_pci_dev(dev); > > + > > + max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index); > > + if (max <= 0) { > > + ret = -EFAULT; > > + goto put_vfio_device; > > + } > > + > > + if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) || > > + pi_info.start >= max || pi_info.start + pi_info.count > max) { > > + ret = -EINVAL; > > + goto put_vfio_device; > > + } > > + > > + gsi = memdup_user((void __user *)((unsigned long)argp + minsz), > > + pi_info.count * sizeof(u32)); > > + if (IS_ERR(gsi)) { > > + ret = PTR_ERR(gsi); > > + goto put_vfio_device; > > + } > > + > > +#ifdef CONFIG_PCI_MSI > > + for (i = 0; i < pi_info.count; i++) { > > + list_for_each_entry(entry, &pdev->msi_list, list) { > > Should we be able to get here for INTx? We only support PI for MSI/MSIx. I think I can return earlier in this function if the index is not VFIO_PCI_MSI_IRQ_INDEX or VFIO_PCI_MSIX_IRQ_INDEX, is this okay for you? > > > + if (entry->msi_attrib.entry_nr != pi_info.start+i) > > + continue; > > + > > + ret = kvm_arch_vfio_update_pi_irte(kdev->kvm, > > + entry->irq, > > + gsi[i], > > + set); > > + if (ret) > > + goto free_gsi; > > + } > > + } > > +#endif > > + > > + ret = 0; > > So if we didn't do anything, return success? That seems strange. > Should we also be doing some unwind on failure? Thanks, > I can't think of what I need to do on failure. Do you have any ideas? Thanks, Feng > Alex > > > + > > +free_gsi: > > + kfree(gsi); > > + > > +put_vfio_device: > > + kvm_vfio_put_vfio_device(vdev); > > + return ret; > > +} > > + > > +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg) > > +{ > > + int32_t __user *argp = (int32_t __user *)(unsigned long)arg; > > + int ret; > > + > > + switch (attr) { > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > > + case KVM_DEV_VFIO_DEVICE_POST_IRQ: > > + ret = kvm_vfio_control_pi(kdev, argp, 1); > > + break; > > + case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ: > > + ret = kvm_vfio_control_pi(kdev, argp, 0); > > + break; > > +#endif > > + default: > > + ret = -ENXIO; > > + } > > + return ret; > > +} > > + > > static int kvm_vfio_set_attr(struct kvm_device *dev, > > struct kvm_device_attr *attr) > > { > > switch (attr->group) { > > case KVM_DEV_VFIO_GROUP: > > return kvm_vfio_set_group(dev, attr->attr, attr->addr); > > + case KVM_DEV_VFIO_DEVICE: > > + return kvm_vfio_set_device(dev, attr->attr, attr->addr); > > } > > > > return -ENXIO; > > @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device > *dev, > > } > > > > break; > > + case KVM_DEV_VFIO_DEVICE: > > + switch (attr->attr) { > > +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST > > + case KVM_DEV_VFIO_DEVICE_POST_IRQ: > > + case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ: > > + return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO; > > +#endif > > + } > > + break; > > } > > > > return -ENXIO; > > ��.n��������+%������w��{.n�����o�^n�r������&��z�ޗ�zf���h���~����������_��+v���)ߣ�