Am 02.11.2010 18:41, Michael S. Tsirkin wrote: > On Tue, Nov 02, 2010 at 04:49:20PM +0100, Jan Kiszka wrote: >> PCI 2.3 allows to generically disable IRQ sources at device level. This >> enables us to share IRQs of such devices between on the host side when >> passing them to a guest. >> >> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> >> --- >> include/linux/kvm_host.h | 1 + >> virt/kvm/assigned-dev.c | 194 ++++++++++++++++++++++++++++++++++++++++++---- >> 2 files changed, 180 insertions(+), 15 deletions(-) >> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index 46120da..fdc2bd9 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel { >> unsigned int entries_nr; >> int host_irq; >> bool host_irq_disabled; >> + bool pci_2_3; >> struct msix_entry *host_msix_entries; >> int guest_irq; >> struct msix_entry *guest_msix_entries; >> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c >> index ca402ed..91fe9c8 100644 >> --- a/virt/kvm/assigned-dev.c >> +++ b/virt/kvm/assigned-dev.c >> @@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel >> return index; >> } >> >> +/* >> + * Verify that the device supports Interrupt Disable bit in command register, >> + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly >> + * in PCI 2.2. >> + */ >> +static bool pci_2_3_supported(struct pci_dev *pdev) >> +{ >> + bool supported = false; >> + u16 orig, new; >> + >> + pci_block_user_cfg_access(pdev); >> + pci_read_config_word(pdev, PCI_COMMAND, &orig); >> + pci_write_config_word(pdev, PCI_COMMAND, >> + orig ^ PCI_COMMAND_INTX_DISABLE); >> + pci_read_config_word(pdev, PCI_COMMAND, &new); >> + >> + /* >> + * There's no way to protect against >> + * hardware bugs or detect them reliably, but as long as we know >> + * what the value should be, let's go ahead and check it. >> + */ >> + if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) { >> + dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: " >> + "driver or HW bug?\n", orig, new); >> + goto out; >> + } >> + if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) { >> + dev_warn(&pdev->dev, "Device does not support " >> + "disabling interrupts: unable to bind.\n"); >> + goto out; >> + } >> + supported = true; >> + >> + /* Now restore the original value. */ >> + pci_write_config_word(pdev, PCI_COMMAND, orig); >> + >> +out: >> + pci_unblock_user_cfg_access(pdev); >> + return supported; >> +} >> + >> +static unsigned int >> +pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status) >> +{ >> + u32 cmd_status_dword; >> + u16 origcmd, newcmd; >> + unsigned int status; >> + >> + /* >> + * We do a single dword read to retrieve both command and status. >> + * Document assumptions that make this possible. >> + */ >> + BUILD_BUG_ON(PCI_COMMAND % 4); >> + BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS); >> + >> + pci_block_user_cfg_access(dev); >> + >> + /* >> + * Read both command and status registers in a single 32-bit operation. >> + * Note: we could cache the value for command and move the status read >> + * out of the lock if there was a way to get notified of user changes >> + * to command register through sysfs. Should be good for shared irqs. >> + */ >> + pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword); >> + origcmd = cmd_status_dword; >> + status = cmd_status_dword >> 16; >> + >> + if (check_status) { >> + bool irq_pending = status & PCI_STATUS_INTERRUPT; >> + >> + /* >> + * Check interrupt status register to see whether our device >> + * triggered the interrupt (when masking) or the next IRQ is >> + * already pending (when unmasking). >> + */ >> + if (!(mask == irq_pending)) > > Same as mask != irq_pending? Yes. Relict of various refactorings. > >> + goto done; >> + } >> + >> + newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE; >> + if (mask) >> + newcmd |= PCI_COMMAND_INTX_DISABLE; >> + if (newcmd != origcmd) >> + pci_write_config_word(dev, PCI_COMMAND, newcmd); >> + >> +done: >> + pci_unblock_user_cfg_access(dev); >> + return status; >> +} >> + >> +static void pci_2_3_irq_mask(struct pci_dev *dev) >> +{ >> + pci_2_3_set_irq_mask(dev, true, false); >> +} >> + >> +static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev) >> +{ >> + return pci_2_3_set_irq_mask(dev, true, true); >> +} >> + >> +static void pci_2_3_irq_unmask(struct pci_dev *dev) >> +{ >> + pci_2_3_set_irq_mask(dev, false, false); >> +} >> + >> +static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev) >> +{ >> + return pci_2_3_set_irq_mask(dev, false, true); >> +} >> + > > IMO this is not a terribly good interface: all users check the pending bit > (PCI_STATUS_INTERRUPT) which is what the function pci_2_3_set_irq_mask > did anyway. I'd suggest returning irqreturn_t or bool and not unsigned > int. Agreed. Originally, I thought there are more bits in the status word the caller may make use of. But there are in fact none. > > >> +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) >> +{ >> + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; >> + int ret = IRQ_WAKE_THREAD; >> + >> + spin_lock(&assigned_dev->intx_lock); >> + if (assigned_dev->host_irq_disabled || >> + !(pci_2_3_irq_check_and_mask(assigned_dev->dev) & >> + PCI_STATUS_INTERRUPT)) >> + ret = IRQ_NONE; >> + else >> + assigned_dev->host_irq_disabled = true; > > This is a bug I think. For pci 2.3 we should never track interrupt > state in kvm IMO. For example, if userspace unmasks an interrupt > through a config write, we will get an interrupt while host_irq_disabled > is set. If we then fail to mask it, kaboom. Good point. There is no way around evaluating the status word as long as user space can fiddle with INTX_DISABLE. > >> + spin_unlock(&assigned_dev->intx_lock); >> + >> + return ret; >> +} >> + >> static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) >> { >> struct kvm_assigned_dev_kernel *assigned_dev = dev_id; >> u32 vector; >> int index; >> >> - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { >> - spin_lock(&assigned_dev->intx_lock); >> + if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX && >> + !assigned_dev->pci_2_3) { >> + spin_lock_irq(&assigned_dev->intx_lock); >> disable_irq_nosync(irq); >> assigned_dev->host_irq_disabled = true; >> - spin_unlock(&assigned_dev->intx_lock); >> + spin_unlock_irq(&assigned_dev->intx_lock); >> } >> >> if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { >> @@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) >> static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) >> { >> struct kvm_assigned_dev_kernel *dev; >> + bool reassert = false; >> >> if (kian->gsi == -1) >> return; >> @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) >> /* The guest irq may be shared so this ack may be >> * from another device. >> */ >> - spin_lock(&dev->intx_lock); >> + spin_lock_irq(&dev->intx_lock); >> if (dev->host_irq_disabled) { >> - enable_irq(dev->host_irq); >> + if (dev->pci_2_3) { >> + if (pci_2_3_irq_check_and_unmask(dev->dev) & >> + PCI_STATUS_INTERRUPT) { >> + reassert = true; >> + goto out; >> + } >> + } else >> + enable_irq(dev->host_irq); > > Or > > if (!dev->pci_2_3) > enable_irq(dev->host_irq); > else if (pci_2_3_irq_check_and_unmask(dev->dev) & PCI_STATUS_INTERRUPT) { > reassert = true; > goto out; > } > > to reduce nesting. Yeah. > >> dev->host_irq_disabled = false; >> } >> - spin_unlock(&dev->intx_lock); >> +out: >> + spin_unlock_irq(&dev->intx_lock); >> + >> + if (reassert) >> + kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1); > > Hmm, I think this still has more overhead than it needs to have. > Instead of setting level to 0 and then back to 1, can't we just > avoid set to 1 in the first place? This would need a different > interface than pci_2_3_irq_check_and_unmask to avoid a race > where interrupt is received while we are acking another one: > > block userspace access > check pending bit > if (!pending) > set irq (0) > clear pending > block userspace access > > Would be worth it for high volume devices. The problem is that we can't reorder guest IRQ line clearing and host IRQ line enabling without taking a lock across host IRQ disable + guest IRQ raise - and that is now distributed across hard and threaded IRQ handlers and we don't want to hold and IRQ-safe lock during kvm_set_irq. > >> } >> >> static void deassign_guest_irq(struct kvm *kvm, >> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm, >> pci_disable_msix(assigned_dev->dev); >> } else { >> /* Deal with MSI and INTx */ >> - disable_irq(assigned_dev->host_irq); >> + if (assigned_dev->pci_2_3) { >> + pci_2_3_irq_mask(assigned_dev->dev); >> + synchronize_irq(assigned_dev->host_irq); >> + } else >> + disable_irq(assigned_dev->host_irq); >> >> free_irq(assigned_dev->host_irq, (void *)assigned_dev); >> >> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm, >> >> pci_reset_function(assigned_dev->dev); >> >> + /* >> + * Unmask the IRQ at PCI level once the reset is done - the next user >> + * may not expect the IRQ being masked. >> + */ >> + if (assigned_dev->pci_2_3) >> + pci_2_3_irq_unmask(assigned_dev->dev); >> + > > Doesn't pci_reset_function clear mask bit? It seems to ... I was left with non-functional devices for the host here if I was not doing this. Need to recheck, but I think it was required. > >> pci_release_regions(assigned_dev->dev); >> pci_disable_device(assigned_dev->dev); >> pci_dev_put(assigned_dev->dev); >> @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) >> static int assigned_device_enable_host_intx(struct kvm *kvm, >> struct kvm_assigned_dev_kernel *dev) >> { >> + irq_handler_t irq_handler = NULL; >> + unsigned long flags = 0; >> + >> dev->host_irq = dev->dev->irq; >> - /* Even though this is PCI, we don't want to use shared >> - * interrupts. Sharing host devices with guest-assigned devices >> - * on the same interrupt line is not a happy situation: there >> - * are going to be long delays in accepting, acking, etc. >> + >> + /* >> + * We can only share the IRQ line with other host devices if we are >> + * able to disable the IRQ source at device-level - independently of >> + * the guest driver. Otherwise host devices may suffer from unbounded >> + * IRQ latencies when the guest keeps the line asserted. >> */ >> - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, >> - 0, dev->irq_name, (void *)dev)) >> + dev->pci_2_3 = pci_2_3_supported(dev->dev); >> + if (dev->pci_2_3) { >> + irq_handler = kvm_assigned_dev_intr; >> + flags = IRQF_SHARED; >> + } > > I would prefer and else clause here instead of initializing > variables at the top and overwriting here. Makes it easier > to see which value is valid when. OK. Thanks, Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html