On 2012-02-10 19:17, Jan Kiszka wrote: > PCI 2.3 allows to generically disable IRQ sources at device level. This > enables us to share legacy IRQs of such devices with other host devices > when passing them to a guest. > > The new IRQ sharing feature introduced here is optional, user space has > to request it explicitly. Moreover, user space can inform us about its > view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the > interrupt and signaling it if the guest masked it via the virtualized > PCI config space. > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > --- > > Changes in v3: > - rebased over current kvm.git (no code conflict, just api.txt) > > Documentation/virtual/kvm/api.txt | 31 ++++++ > arch/x86/kvm/x86.c | 1 + > include/linux/kvm.h | 6 + > include/linux/kvm_host.h | 2 + > virt/kvm/assigned-dev.c | 208 +++++++++++++++++++++++++++++++----- > 5 files changed, 219 insertions(+), 29 deletions(-) > > diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt > index 59a3826..5ce0e29 100644 > --- a/Documentation/virtual/kvm/api.txt > +++ b/Documentation/virtual/kvm/api.txt > @@ -1169,6 +1169,14 @@ following flags are specified: > > /* Depends on KVM_CAP_IOMMU */ > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > +/* The following two depend on KVM_CAP_PCI_2_3 */ > +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) > +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) > + > +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts > +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other > +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the > +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. > > The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure > isolation of the device. Usages not specifying this flag are deprecated. > @@ -1441,6 +1449,29 @@ The "num_dirty" field is a performance hint for KVM to determine whether it > should skip processing the bitmap and just invalidate everything. It must > be set to the number of set bits in the bitmap. > > +4.60 KVM_ASSIGN_SET_INTX_MASK > + > +Capability: KVM_CAP_PCI_2_3 > +Architectures: x86 > +Type: vm ioctl > +Parameters: struct kvm_assigned_pci_dev (in) > +Returns: 0 on success, -1 on error > + > +Informs the kernel about the guest's view on the INTx mask. As long as the > +guest masks the legacy INTx, the kernel will refrain from unmasking it at > +hardware level and will not assert the guest's IRQ line. User space is still > +responsible for applying this state to the assigned device's real config space > +by setting or clearing the Interrupt Disable bit 10 in the Command register. > + > +To avoid that the kernel overwrites the state user space wants to set, > +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space. > +Moreover, user space has to write back its own view on the Interrupt Disable > +bit whenever modifying the Command word. > + > +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified > +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is > +evaluated. > + > 4.62 KVM_CREATE_SPAPR_TCE > > Capability: KVM_CAP_SPAPR_TCE > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 2bd77a3..1f11435 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2099,6 +2099,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_XSAVE: > case KVM_CAP_ASYNC_PF: > case KVM_CAP_GET_TSC_KHZ: > + case KVM_CAP_PCI_2_3: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index acbe429..6c322a9 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo { > #define KVM_CAP_TSC_DEADLINE_TIMER 72 > #define KVM_CAP_S390_UCONTROL 73 > #define KVM_CAP_SYNC_REGS 74 > +#define KVM_CAP_PCI_2_3 75 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping { > /* Available with KVM_CAP_TSC_CONTROL */ > #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) > #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) > +/* Available with KVM_CAP_PCI_2_3 */ > +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ > + struct kvm_assigned_pci_dev) > > /* > * ioctls for vcpu fds > @@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping { > #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) > > #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) > +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) > +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) > > struct kvm_assigned_pci_dev { > __u32 assigned_dev_id; > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 9698080..d1d68f4 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -547,6 +547,7 @@ struct kvm_assigned_dev_kernel { > unsigned int entries_nr; > int host_irq; > bool host_irq_disabled; > + bool pci_2_3; > struct msix_entry *host_msix_entries; > int guest_irq; > struct msix_entry *guest_msix_entries; > @@ -556,6 +557,7 @@ struct kvm_assigned_dev_kernel { > struct pci_dev *dev; > struct kvm *kvm; > spinlock_t intx_lock; > + struct mutex intx_mask_lock; > char irq_name[32]; > struct pci_saved_state *pci_saved_state; > }; > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index ece8061..3ee2970 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel > return index; > } > > -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) > +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) > { > struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + int ret; > > - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { > - spin_lock(&assigned_dev->intx_lock); > + spin_lock(&assigned_dev->intx_lock); > + if (pci_check_and_mask_intx(assigned_dev->dev)) { > + assigned_dev->host_irq_disabled = true; > + ret = IRQ_WAKE_THREAD; > + } else > + ret = IRQ_NONE; > + spin_unlock(&assigned_dev->intx_lock); > + > + return ret; > +} > + > +static void > +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, > + int vector) > +{ > + if (unlikely(assigned_dev->irq_requested_type & > + KVM_DEV_IRQ_GUEST_INTX)) { > + mutex_lock(&assigned_dev->intx_mask_lock); > + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) > + kvm_set_irq(assigned_dev->kvm, > + assigned_dev->irq_source_id, vector, 1); > + mutex_unlock(&assigned_dev->intx_mask_lock); > + } else > + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > + vector, 1); > +} > + > +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) > +{ > + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + > + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { > + spin_lock_irq(&assigned_dev->intx_lock); > disable_irq_nosync(irq); > assigned_dev->host_irq_disabled = true; > - spin_unlock(&assigned_dev->intx_lock); > + spin_unlock_irq(&assigned_dev->intx_lock); > } > > - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > - assigned_dev->guest_irq, 1); > + kvm_assigned_dev_raise_guest_irq(assigned_dev, > + assigned_dev->guest_irq); > + > + return IRQ_HANDLED; > +} > + > +#ifdef __KVM_HAVE_MSI > +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) > +{ > + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + > + kvm_assigned_dev_raise_guest_irq(assigned_dev, > + assigned_dev->guest_irq); > > return IRQ_HANDLED; > } > +#endif > > #ifdef __KVM_HAVE_MSIX > static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) > @@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) > > if (index >= 0) { > vector = assigned_dev->guest_msix_entries[index].vector; > - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, > - vector, 1); > + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); > } > > return IRQ_HANDLED; > @@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) > > kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); > > - /* The guest irq may be shared so this ack may be > - * from another device. > - */ > - spin_lock(&dev->intx_lock); > - if (dev->host_irq_disabled) { > - enable_irq(dev->host_irq); > - dev->host_irq_disabled = false; > + mutex_lock(&dev->intx_mask_lock); > + > + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { > + bool reassert = false; > + > + spin_lock_irq(&dev->intx_lock); > + /* > + * The guest IRQ may be shared so this ack can come from an > + * IRQ for another guest device. > + */ > + if (dev->host_irq_disabled) { > + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) > + enable_irq(dev->host_irq); > + else if (!pci_check_and_unmask_intx(dev->dev)) > + reassert = true; > + dev->host_irq_disabled = reassert; > + } > + spin_unlock_irq(&dev->intx_lock); > + > + if (reassert) > + kvm_set_irq(dev->kvm, dev->irq_source_id, > + dev->guest_irq, 1); > } > - spin_unlock(&dev->intx_lock); > + > + mutex_unlock(&dev->intx_mask_lock); > } > > static void deassign_guest_irq(struct kvm *kvm, > @@ -154,7 +213,13 @@ static void deassign_host_irq(struct kvm *kvm, > pci_disable_msix(assigned_dev->dev); > } else { > /* Deal with MSI and INTx */ > - disable_irq(assigned_dev->host_irq); > + if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + spin_lock_irq(&assigned_dev->intx_lock); > + pci_intx(assigned_dev->dev, false); > + spin_unlock_irq(&assigned_dev->intx_lock); > + synchronize_irq(assigned_dev->host_irq); > + } else > + disable_irq(assigned_dev->host_irq); > > free_irq(assigned_dev->host_irq, assigned_dev); > > @@ -235,15 +300,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) > static int assigned_device_enable_host_intx(struct kvm *kvm, > struct kvm_assigned_dev_kernel *dev) > { > + irq_handler_t irq_handler; > + unsigned long flags; > + > dev->host_irq = dev->dev->irq; > - /* Even though this is PCI, we don't want to use shared > - * interrupts. Sharing host devices with guest-assigned devices > - * on the same interrupt line is not a happy situation: there > - * are going to be long delays in accepting, acking, etc. > + > + /* > + * We can only share the IRQ line with other host devices if we are > + * able to disable the IRQ source at device-level - independently of > + * the guest driver. Otherwise host devices may suffer from unbounded > + * IRQ latencies when the guest keeps the line asserted. > */ > - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, > - IRQF_ONESHOT, dev->irq_name, dev)) > + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + irq_handler = kvm_assigned_dev_intx; > + flags = IRQF_SHARED; > + } else { > + irq_handler = NULL; > + flags = IRQF_ONESHOT; > + } > + if (request_threaded_irq(dev->host_irq, irq_handler, > + kvm_assigned_dev_thread_intx, flags, > + dev->irq_name, dev)) > return -EIO; > + > + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { > + spin_lock_irq(&dev->intx_lock); > + pci_intx(dev->dev, true); > + spin_unlock_irq(&dev->intx_lock); > + } > return 0; > } > > @@ -260,8 +344,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, > } > > dev->host_irq = dev->dev->irq; > - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, > - 0, dev->irq_name, dev)) { > + if (request_threaded_irq(dev->host_irq, NULL, > + kvm_assigned_dev_thread_msi, 0, > + dev->irq_name, dev)) { > pci_disable_msi(dev->dev); > return -EIO; > } > @@ -319,7 +404,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -331,7 +415,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -365,6 +448,7 @@ static int assign_host_irq(struct kvm *kvm, > default: > r = -EINVAL; > } > + dev->host_irq_disabled = false; > > if (!r) > dev->irq_requested_type |= host_irq_type; > @@ -466,6 +550,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, > { > int r = -ENODEV; > struct kvm_assigned_dev_kernel *match; > + unsigned long irq_type; > > mutex_lock(&kvm->lock); > > @@ -474,7 +559,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, > if (!match) > goto out; > > - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); > + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | > + KVM_DEV_IRQ_GUEST_MASK); > + r = kvm_deassign_irq(kvm, match, irq_type); > out: > mutex_unlock(&kvm->lock); > return r; > @@ -607,6 +694,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > if (!match->pci_saved_state) > printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", > __func__, dev_name(&dev->dev)); > + > + if (!pci_intx_mask_supported(dev)) > + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; > + > match->assigned_dev_id = assigned_dev->assigned_dev_id; > match->host_segnr = assigned_dev->segnr; > match->host_busnr = assigned_dev->busnr; > @@ -614,6 +705,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > match->flags = assigned_dev->flags; > match->dev = dev; > spin_lock_init(&match->intx_lock); > + mutex_init(&match->intx_mask_lock); > match->irq_source_id = -1; > match->kvm = kvm; > match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; > @@ -759,6 +851,56 @@ msix_entry_out: > } > #endif > > +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, > + struct kvm_assigned_pci_dev *assigned_dev) > +{ > + int r = 0; > + struct kvm_assigned_dev_kernel *match; > + > + mutex_lock(&kvm->lock); > + > + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + assigned_dev->assigned_dev_id); > + if (!match) { > + r = -ENODEV; > + goto out; > + } > + > + mutex_lock(&match->intx_mask_lock); > + > + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; > + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; > + > + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { > + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { > + kvm_set_irq(match->kvm, match->irq_source_id, > + match->guest_irq, 0); > + /* > + * Masking at hardware-level is performed on demand, > + * i.e. when an IRQ actually arrives at the host. > + */ > + } else { > + /* > + * Unmask the IRQ line. It may have been masked > + * meanwhile if we aren't using PCI 2.3 INTx masking > + * on the host side. > + */ > + spin_lock_irq(&match->intx_lock); > + if (match->host_irq_disabled) { > + enable_irq(match->host_irq); > + match->host_irq_disabled = false; > + } > + spin_unlock_irq(&match->intx_lock); > + } > + } > + > + mutex_unlock(&match->intx_mask_lock); > + > +out: > + mutex_unlock(&kvm->lock); > + return r; > +} > + > long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > unsigned long arg) > { > @@ -866,6 +1008,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > break; > } > #endif > + case KVM_ASSIGN_SET_INTX_MASK: { > + struct kvm_assigned_pci_dev assigned_dev; > + > + r = -EFAULT; > + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) > + goto out; > + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); > + break; > + } > default: > r = -ENOTTY; > break; > @@ -873,4 +1024,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > out: > return r; > } > - Ping? Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html