On Tue, Nov 02, 2010 at 04:49:20PM +0100, Jan Kiszka wrote: > PCI 2.3 allows to generically disable IRQ sources at device level. This > enables us to share IRQs of such devices between on the host side when > passing them to a guest. > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > --- > include/linux/kvm_host.h | 1 + > virt/kvm/assigned-dev.c | 194 ++++++++++++++++++++++++++++++++++++++++++---- > 2 files changed, 180 insertions(+), 15 deletions(-) > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 46120da..fdc2bd9 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel { > unsigned int entries_nr; > int host_irq; > bool host_irq_disabled; > + bool pci_2_3; > struct msix_entry *host_msix_entries; > int guest_irq; > struct msix_entry *guest_msix_entries; > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index ca402ed..91fe9c8 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel > return index; > } > > +/* > + * Verify that the device supports Interrupt Disable bit in command register, > + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly > + * in PCI 2.2. > + */ > +static bool pci_2_3_supported(struct pci_dev *pdev) > +{ > + bool supported = false; > + u16 orig, new; > + > + pci_block_user_cfg_access(pdev); > + pci_read_config_word(pdev, PCI_COMMAND, &orig); > + pci_write_config_word(pdev, PCI_COMMAND, > + orig ^ PCI_COMMAND_INTX_DISABLE); > + pci_read_config_word(pdev, PCI_COMMAND, &new); > + > + /* > + * There's no way to protect against > + * hardware bugs or detect them reliably, but as long as we know > + * what the value should be, let's go ahead and check it. > + */ > + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { > + dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: " > + "driver or HW bug?\n", orig, new); > + goto out; > + } > + if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) { > + dev_warn(&pdev->dev, "Device does not support " > + "disabling interrupts: unable to bind.\n"); > + goto out; > + } > + supported = true; > + > + /* Now restore the original value. */ > + pci_write_config_word(pdev, PCI_COMMAND, orig); > + > +out: > + pci_unblock_user_cfg_access(pdev); > + return supported; > +} > + > +static unsigned int > +pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status) > +{ > + u32 cmd_status_dword; > + u16 origcmd, newcmd; > + unsigned int status; > + > + /* > + * We do a single dword read to retrieve both command and status. > + * Document assumptions that make this possible. > + */ > + BUILD_BUG_ON(PCI_COMMAND % 4); > + BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS); > + > + pci_block_user_cfg_access(dev); > + > + /* > + * Read both command and status registers in a single 32-bit operation. > + * Note: we could cache the value for command and move the status read > + * out of the lock if there was a way to get notified of user changes > + * to command register through sysfs. Should be good for shared irqs. > + */ > + pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword); > + origcmd = cmd_status_dword; > + status = cmd_status_dword >> 16; > + > + if (check_status) { > + bool irq_pending = status & PCI_STATUS_INTERRUPT; > + > + /* > + * Check interrupt status register to see whether our device > + * triggered the interrupt (when masking) or the next IRQ is > + * already pending (when unmasking). > + */ > + if (!(mask == irq_pending)) Same as mask != irq_pending? > + goto done; > + } > + > + newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE; > + if (mask) > + newcmd |= PCI_COMMAND_INTX_DISABLE; > + if (newcmd != origcmd) > + pci_write_config_word(dev, PCI_COMMAND, newcmd); > + > +done: > + pci_unblock_user_cfg_access(dev); > + return status; > +} > + > +static void pci_2_3_irq_mask(struct pci_dev *dev) > +{ > + pci_2_3_set_irq_mask(dev, true, false); > +} > + > +static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev) > +{ > + return pci_2_3_set_irq_mask(dev, true, true); > +} > + > +static void pci_2_3_irq_unmask(struct pci_dev *dev) > +{ > + pci_2_3_set_irq_mask(dev, false, false); > +} > + > +static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev) > +{ > + return pci_2_3_set_irq_mask(dev, false, true); > +} > + IMO this is not a terribly good interface: all users check the pending bit (PCI_STATUS_INTERRUPT) which is what the function pci_2_3_set_irq_mask did anyway. I'd suggest returning irqreturn_t or bool and not unsigned int. > +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) > +{ > + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > + int ret = IRQ_WAKE_THREAD; > + > + spin_lock(&assigned_dev->intx_lock); > + if (assigned_dev->host_irq_disabled || > + !(pci_2_3_irq_check_and_mask(assigned_dev->dev) & > + PCI_STATUS_INTERRUPT)) > + ret = IRQ_NONE; > + else > + assigned_dev->host_irq_disabled = true; This is a bug I think. For pci 2.3 we should never track interrupt state in kvm IMO. For example, if userspace unmasks an interrupt through a config write, we will get an interrupt while host_irq_disabled is set. If we then fail to mask it, kaboom. > + spin_unlock(&assigned_dev->intx_lock); > + > + return ret; > +} > + > static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) > { > struct kvm_assigned_dev_kernel *assigned_dev = dev_id; > u32 vector; > int index; > > - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { > - spin_lock(&assigned_dev->intx_lock); > + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX && > + !assigned_dev->pci_2_3) { > + spin_lock_irq(&assigned_dev->intx_lock); > disable_irq_nosync(irq); > assigned_dev->host_irq_disabled = true; > - spin_unlock(&assigned_dev->intx_lock); > + spin_unlock_irq(&assigned_dev->intx_lock); > } > > if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { > @@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) > static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) > { > struct kvm_assigned_dev_kernel *dev; > + bool reassert = false; > > if (kian->gsi == -1) > return; > @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) > /* The guest irq may be shared so this ack may be > * from another device. > */ > - spin_lock(&dev->intx_lock); > + spin_lock_irq(&dev->intx_lock); > if (dev->host_irq_disabled) { > - enable_irq(dev->host_irq); > + if (dev->pci_2_3) { > + if (pci_2_3_irq_check_and_unmask(dev->dev) & > + PCI_STATUS_INTERRUPT) { > + reassert = true; > + goto out; > + } > + } else > + enable_irq(dev->host_irq); Or if (!dev->pci_2_3) enable_irq(dev->host_irq); else if (pci_2_3_irq_check_and_unmask(dev->dev) & PCI_STATUS_INTERRUPT) { reassert = true; goto out; } to reduce nesting. > dev->host_irq_disabled = false; > } > - spin_unlock(&dev->intx_lock); > +out: > + spin_unlock_irq(&dev->intx_lock); > + > + if (reassert) > + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1); Hmm, I think this still has more overhead than it needs to have. Instead of setting level to 0 and then back to 1, can't we just avoid set to 1 in the first place? This would need a different interface than pci_2_3_irq_check_and_unmask to avoid a race where interrupt is received while we are acking another one: block userspace access check pending bit if (!pending) set irq (0) clear pending block userspace access Would be worth it for high volume devices. > } > > static void deassign_guest_irq(struct kvm *kvm, > @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm, > pci_disable_msix(assigned_dev->dev); > } else { > /* Deal with MSI and INTx */ > - disable_irq(assigned_dev->host_irq); > + if (assigned_dev->pci_2_3) { > + pci_2_3_irq_mask(assigned_dev->dev); > + synchronize_irq(assigned_dev->host_irq); > + } else > + disable_irq(assigned_dev->host_irq); > > free_irq(assigned_dev->host_irq, (void *)assigned_dev); > > @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm, > > pci_reset_function(assigned_dev->dev); > > + /* > + * Unmask the IRQ at PCI level once the reset is done - the next user > + * may not expect the IRQ being masked. > + */ > + if (assigned_dev->pci_2_3) > + pci_2_3_irq_unmask(assigned_dev->dev); > + Doesn't pci_reset_function clear mask bit? It seems to ... > pci_release_regions(assigned_dev->dev); > pci_disable_device(assigned_dev->dev); > pci_dev_put(assigned_dev->dev); > @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) > static int assigned_device_enable_host_intx(struct kvm *kvm, > struct kvm_assigned_dev_kernel *dev) > { > + irq_handler_t irq_handler = NULL; > + unsigned long flags = 0; > + > dev->host_irq = dev->dev->irq; > - /* Even though this is PCI, we don't want to use shared > - * interrupts. Sharing host devices with guest-assigned devices > - * on the same interrupt line is not a happy situation: there > - * are going to be long delays in accepting, acking, etc. > + > + /* > + * We can only share the IRQ line with other host devices if we are > + * able to disable the IRQ source at device-level - independently of > + * the guest driver. Otherwise host devices may suffer from unbounded > + * IRQ latencies when the guest keeps the line asserted. > */ > - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, > - 0, dev->irq_name, (void *)dev)) > + dev->pci_2_3 = pci_2_3_supported(dev->dev); > + if (dev->pci_2_3) { > + irq_handler = kvm_assigned_dev_intr; > + flags = IRQF_SHARED; > + } I would prefer and else clause here instead of initializing variables at the top and overwriting here. Makes it easier to see which value is valid when. > + if (request_threaded_irq(dev->host_irq, irq_handler, > + kvm_assigned_dev_thread, flags, > + dev->irq_name, (void *)dev)) > return -EIO; > + > + if (dev->pci_2_3) > + pci_2_3_irq_unmask(dev->dev); > return 0; > } > > @@ -308,7 +473,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -320,7 +484,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, > { > dev->guest_irq = irq->guest_irq; > dev->ack_notifier.gsi = -1; > - dev->host_irq_disabled = false; > return 0; > } > #endif > @@ -354,6 +517,7 @@ static int assign_host_irq(struct kvm *kvm, > default: > r = -EINVAL; > } > + dev->host_irq_disabled = false; > > if (!r) > dev->irq_requested_type |= host_irq_type; > -- > 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html