On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote: > PCI 2.3 allows IRQ sources to be generically disabled at device level. This > enables us to share legacy IRQs of such devices with other host devices > when passing them to a guest. > > The new IRQ sharing feature introduced here is optional; user space has > to request it explicitly. Moreover, user space can inform us about its > view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the > interrupt and signaling it if the guest masked it via the virtualized > PCI config space. > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > --- > > This applies to kvm/master after merging > > PCI: Rework config space blocking services > PCI: Introduce INTx check & mask API > > from current linux-next/master. I suppose those two will make it into > 3.3. > > To recall the history of it: I tried hard to implement an adaptive > solution that automatically picks the fastest masking technique whenever > possible. However, the changes required to the IRQ core subsystem and > the logic of the device assignment code became so complex and partly > ugly that I gave up on this. It's simply not worth the pain given that > legacy PCI interrupts are rarely raised for performance-critical devices > at such a high rate (kHz...) that you can measure the difference. 
> > Documentation/virtual/kvm/api.txt | 27 +++++ > arch/x86/kvm/x86.c | 1 + > include/linux/kvm.h | 6 + > include/linux/kvm_host.h | 2 + > virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++----- > 5 files changed, 215 insertions(+), 29 deletions(-) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index e1d94bf..670015a 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -1159,6 +1159,14 @@ following flags are specified: > > /* Depends on KVM_CAP_IOMMU */ > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > +/* The following two depend on KVM_CAP_PCI_2_3 */ > +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) > +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) > + > +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts > +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other > +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the > +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. > > The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure > isolation of the device. Usages not specifying this flag are deprecated. > @@ -1399,6 +1407,25 @@ The following flags are defined: > If datamatch flag is set, the event will be signaled only if the written value > to the registered address is equal to datamatch in struct kvm_ioeventfd. > > +4.59 KVM_ASSIGN_SET_INTX_MASK > + > +Capability: KVM_CAP_PCI_2_3 > +Architectures: x86 > +Type: vm ioctl > +Parameters: struct kvm_assigned_pci_dev (in) > +Returns: 0 on success, -1 on error > + > +Informs the kernel about the guest's view on the INTx mask. A wild idea: since this is guest view of its IRQ, can this be specified per guest IRQ+id then? That might be useful to support MSIX mask bit emulation. > As long as the > +guest masks the legacy INTx, the kernel will refrain from unmasking it at > +hardware level and will not assert the guest's IRQ line. 
User space is still > +responsible for applying this state to the assigned device's real config space. Can this be made more explicit? You mean writing into 1st byte of PCI control, right? > +To avoid that the kernel overwrites the state user space wants to set, > +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space. This looks like a strange requirement, could you explain how this helps avoid races? This also raises questions about what should be done to write a bit unrelated to masking. > + > +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified > +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is > +evaluated. > + > 4.62 KVM_CREATE_SPAPR_TCE > > Capability: KVM_CAP_SPAPR_TCE > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 1171def..9381806 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_XSAVE: > case KVM_CAP_ASYNC_PF: > case KVM_CAP_GET_TSC_KHZ: > + case KVM_CAP_PCI_2_3: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index 68e67e5..da5f7b7 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo { > #define KVM_CAP_PPC_RMA 65 > #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ > #define KVM_CAP_PPC_PAPR 68 > +#define KVM_CAP_PCI_2_3 69 > #define KVM_CAP_S390_GMAP 71 > #define KVM_CAP_TSC_DEADLINE_TIMER 72 > > @@ -697,6 +698,9 @@ struct kvm_clock_data { > /* Available with KVM_CAP_TSC_CONTROL */ > #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) > #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) > +/* Available with KVM_CAP_PCI_2_3 */ > +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ > + struct kvm_assigned_pci_dev) > > /* > * ioctls for vcpu fds > @@ -765,6 +769,8 @@ struct kvm_clock_data { > #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) > > #define 
KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) > +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) > > struct kvm_assigned_pci_dev { > __u32 assigned_dev_id; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 900c763..07461bd 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel { > unsigned int entries_nr; > int host_irq; > bool host_irq_disabled; > + bool pci_2_3; > struct msix_entry *host_msix_entries; > int guest_irq; > struct msix_entry *guest_msix_entries; > @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel { > struct pci_dev *dev; > struct kvm *kvm; > spinlock_t intx_lock; > + struct mutex intx_mask_lock; What exactly does this lock protect? I see it used sometimes with intx_lock and sometimes without. > char irq_name[32]; > struct pci_saved_state *pci_saved_state; > }; > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index 758e3b3..b35aba9 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel > return index; > } > > -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) > +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) > { > struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + int ret; > > - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { > - spin_lock(&assigned_dev->intx_lock); > + spin_lock(&assigned_dev->intx_lock); > + if (pci_check_and_mask_intx(assigned_dev->dev)) { > + assigned_dev->host_irq_disabled = true; > + ret = IRQ_WAKE_THREAD; > + } else > + ret = IRQ_NONE; > + spin_unlock(&assigned_dev->intx_lock); > + > + return ret; > +} > + > +static void > +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, > + int vector) > +{ > + if (unlikely(assigned_dev->irq_requested_type & > + KVM_DEV_IRQ_GUEST_INTX)) { > + 
mutex_lock(&assigned_dev->intx_mask_lock); > + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) > + kvm_set_irq(assigned_dev->kvm, > + assigned_dev->irq_source_id, vector, 1); > + mutex_unlock(&assigned_dev->intx_mask_lock); > + } else > + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > + vector, 1); > +} > + > +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) > +{ > + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + > + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { > + spin_lock_irq(&assigned_dev->intx_lock); > disable_irq_nosync(irq); > assigned_dev->host_irq_disabled = true; > - spin_unlock(&assigned_dev->intx_lock); > + spin_unlock_irq(&assigned_dev->intx_lock); > } > > - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > - assigned_dev->guest_irq, 1); > + kvm_assigned_dev_raise_guest_irq(assigned_dev, > + assigned_dev->guest_irq); > + > + return IRQ_HANDLED; > +} > + > +#ifdef __KVM_HAVE_MSI > +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) > +{ > + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + > + kvm_assigned_dev_raise_guest_irq(assigned_dev, > + assigned_dev->guest_irq); > > return IRQ_HANDLED; > } > +#endif > > #ifdef __KVM_HAVE_MSIX > static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) > @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) > > if (index >= 0) { > vector = assigned_dev->guest_msix_entries[index].vector; > - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > - vector, 1); > + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); > } > > return IRQ_HANDLED; > @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) > > kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); > > - /* The guest irq may be shared so this ack may be > - * from another device. 
> - */ > - spin_lock(&dev->intx_lock); > - if (dev->host_irq_disabled) { > - enable_irq(dev->host_irq); > - dev->host_irq_disabled = false; > + mutex_lock(&dev->intx_mask_lock); > + > + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { > + bool reassert = false; > + > + spin_lock_irq(&dev->intx_lock); > + /* > + * The guest IRQ may be shared so this ack can come from an > + * IRQ for another guest device. > + */ > + if (dev->host_irq_disabled) { > + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) > + enable_irq(dev->host_irq); > + else if (!pci_check_and_unmask_intx(dev->dev)) > + reassert = true; > + dev->host_irq_disabled = reassert; > + } > + spin_unlock_irq(&dev->intx_lock); > + > + if (reassert) > + kvm_set_irq(dev->kvm, dev->irq_source_id, > + dev->guest_irq, 1); > } > - spin_unlock(&dev->intx_lock); > + > + mutex_unlock(&dev->intx_mask_lock); > } > > static void deassign_guest_irq(struct kvm *kvm, > @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm, > pci_disable_msix(assigned_dev->dev); > } else { > /* Deal with MSI and INTx */ > - disable_irq(assigned_dev->host_irq); > + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + spin_lock_irq(&assigned_dev->intx_lock); Interesting that the new mutex is not used here. > + pci_intx(assigned_dev->dev, false); Interesting. This will leave the device with interrupts masked. Do we reset it later? Maybe we want a comment explaining why it's done. > + spin_unlock_irq(&assigned_dev->intx_lock); > + synchronize_irq(assigned_dev->host_irq); > + } else > + disable_irq(assigned_dev->host_irq); > > free_irq(assigned_dev->host_irq, assigned_dev); > > @@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) > static int assigned_device_enable_host_intx(struct kvm *kvm, > struct kvm_assigned_dev_kernel *dev) > { > + irq_handler_t irq_handler; > + unsigned long flags; > + > dev->host_irq = dev->dev->irq; > - /* Even though this is PCI, we don't want to use shared > - * interrupts. 
Sharing host devices with guest-assigned devices > - * on the same interrupt line is not a happy situation: there > - * are going to be long delays in accepting, acking, etc. > + > + /* > + * We can only share the IRQ line with other host devices if we are > + * able to disable the IRQ source at device-level - independently of > + * the guest driver. Otherwise host devices may suffer from unbounded > + * IRQ latencies when the guest keeps the line asserted. > */ > - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, > - IRQF_ONESHOT, dev->irq_name, dev)) > + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + irq_handler = kvm_assigned_dev_intx; > + flags = IRQF_SHARED; > + } else { > + irq_handler = NULL; > + flags = IRQF_ONESHOT; > + } > + if (request_threaded_irq(dev->host_irq, irq_handler, > + kvm_assigned_dev_thread_intx, flags, > + dev->irq_name, dev)) > return -EIO; > + > + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + spin_lock_irq(&dev->intx_lock); > + pci_intx(dev->dev, true); > + spin_unlock_irq(&dev->intx_lock); > + } > return 0; > } > > @@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, > } > > dev->host_irq = dev->dev->irq; > - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, > - 0, dev->irq_name, dev)) { > + if (request_threaded_irq(dev->host_irq, NULL, > + kvm_assigned_dev_thread_msi, 0, > + dev->irq_name, dev)) { > pci_disable_msi(dev->dev); > return -EIO; > } > @@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm, > default: > r = -EINVAL; > } > + 
dev->host_irq_disabled = false; > > if (!r) > dev->irq_requested_type |= host_irq_type; > @@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, > { > int r = -ENODEV; > struct kvm_assigned_dev_kernel *match; > + unsigned long irq_type; > > mutex_lock(&kvm->lock); > > @@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, > if (!match) > goto out; > > - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); > + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | > + KVM_DEV_IRQ_GUEST_MASK); > + r = kvm_deassign_irq(kvm, match, irq_type); > out: > mutex_unlock(&kvm->lock); > return r; > @@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > if (!match->pci_saved_state) > printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", > __func__, dev_name(&dev->dev)); > + > + if (!pci_intx_mask_supported(dev)) > + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; > + > match->assigned_dev_id = assigned_dev->assigned_dev_id; > match->host_segnr = assigned_dev->segnr; > match->host_busnr = assigned_dev->busnr; > @@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > match->flags = assigned_dev->flags; > match->dev = dev; > spin_lock_init(&match->intx_lock); > + mutex_init(&match->intx_mask_lock); > match->irq_source_id = -1; > match->kvm = kvm; > match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; > @@ -761,6 +853,56 @@ msix_entry_out: > } > #endif > > +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, > + struct kvm_assigned_pci_dev *assigned_dev) > +{ > + int r = 0; > + struct kvm_assigned_dev_kernel *match; > + > + mutex_lock(&kvm->lock); > + > + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + assigned_dev->assigned_dev_id); > + if (!match) { > + r = -ENODEV; > + goto out; > + } > + > + mutex_lock(&match->intx_mask_lock); > + > + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; > + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; > + > + if 
(match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { > + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { > + kvm_set_irq(match->kvm, match->irq_source_id, > + match->guest_irq, 0); > + /* > + * Masking at hardware-level is performed on demand, > + * i.e. when an IRQ actually arrives at the host. > + */ > + } else { > + /* > + * Unmask the IRQ line. It may have been masked > + * meanwhile if we aren't using PCI 2.3 INTx masking > + * on the host side. > + */ > + spin_lock_irq(&match->intx_lock); > + if (match->host_irq_disabled) { > + enable_irq(match->host_irq); > + match->host_irq_disabled = false; > + } > + spin_unlock_irq(&match->intx_lock); > + } > + } > + > + mutex_unlock(&match->intx_mask_lock); > + > +out: > + mutex_unlock(&kvm->lock); > + return r; > +} > + > long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > unsigned long arg) > { > @@ -868,6 +1010,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > break; > } > #endif > + case KVM_ASSIGN_SET_INTX_MASK: { > + struct kvm_assigned_pci_dev assigned_dev; > + > + r = -EFAULT; > + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) > + goto out; > + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); > + break; > + } > default: > r = -ENOTTY; > break; > @@ -875,4 +1026,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > out: > return r; > } > - > -- > 1.7.3.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html