Re: [PATCH 3/3] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Mon, 1 Nov 2010 17:52:14 +0200

On Mon, Nov 01, 2010 at 04:41:08PM +0100, Jan Kiszka wrote:
> Am 01.11.2010 16:24, Michael S. Tsirkin wrote:
> > On Mon, Nov 01, 2010 at 03:08:37PM +0100, Jan Kiszka wrote:
> >> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> >>
> >> PCI 2.3 allows to generically disable IRQ sources at device level. This
> >> enables us to share IRQs of such devices between on the host side when
> >> passing them to a guest.
> >>
> >> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> >> ---
> >>  include/linux/kvm_host.h |    1 +
> >>  virt/kvm/assigned-dev.c  |  153 +++++++++++++++++++++++++++++++++++++++++----
> >>  2 files changed, 140 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> >> index df5497f..fcdc849 100644
> >> --- a/include/linux/kvm_host.h
> >> +++ b/include/linux/kvm_host.h
> >> @@ -473,6 +473,7 @@ struct kvm_assigned_dev_kernel {
> >>  	unsigned int entries_nr;
> >>  	int host_irq;
> >>  	bool host_irq_disabled;
> >> +	bool pci_2_3;
> >>  	struct msix_entry *host_msix_entries;
> >>  	int guest_irq;
> >>  	struct kvm_guest_msix_entry *guest_msix_entries;
> >> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> >> index d3ddfea..411643c 100644
> >> --- a/virt/kvm/assigned-dev.c
> >> +++ b/virt/kvm/assigned-dev.c
> >> @@ -55,10 +55,96 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
> >>  	return index;
> >>  }
> >>  
> >> +/*
> >> + * Verify that the device supports Interrupt Disable bit in command register,
> >> + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
> >> + * in PCI 2.2.
> >> + */
> >> +static bool pci_2_3_supported(struct pci_dev *pdev)
> >> +{
> >> +	u16 orig, new;
> >> +	bool supported = false;
> >> +
> >> +	pci_block_user_cfg_access(pdev);
> >> +	pci_read_config_word(pdev, PCI_COMMAND, &orig);
> >> +	pci_write_config_word(pdev, PCI_COMMAND,
> >> +			      orig ^ PCI_COMMAND_INTX_DISABLE);
> >> +	pci_read_config_word(pdev, PCI_COMMAND, &new);
> >> +	
> >> +	/*
> >> +	 * There's no way to protect against
> >> +	 * hardware bugs or detect them reliably, but as long as we know
> >> +	 * what the value should be, let's go ahead and check it.
> >> +	 */
> >> +	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
> >> +		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
> >> +			"driver or HW bug?\n", orig, new);
> >> +		goto out;
> >> +	}
> >> +	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
> >> +		dev_warn(&pdev->dev, "Device does not support "
> >> +			 "disabling interrupts: unable to bind.\n");
> >> +		goto out;
> >> +	}
> >> +	supported = true;
> >> +
> >> +	/* Now restore the original value. */
> >> +	pci_write_config_word(pdev, PCI_COMMAND, orig);
> >> +
> >> +out:
> >> +	pci_unblock_user_cfg_access(pdev);
> >> +	return supported;
> >> +}
> >> +
> >> +static void
> >> +pci_2_3_mask_irq(struct pci_dev *dev, int mask, unsigned int *irq_status)
> >> +{
> >> +	u32 cmd_status_dword;
> >> +	u16 origcmd, newcmd;
> >> +
> >> +	/*
> >> +	 * We do a single dword read to retrieve both command and status.
> >> +	 * Document assumptions that make this possible.
> >> +	 */
> >> +	BUILD_BUG_ON(PCI_COMMAND % 4);
> >> +	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
> >> +
> >> +	pci_block_user_cfg_access(dev);
> >> +
> >> +	/*
> >> +	 * Read both command and status registers in a single 32-bit operation.
> >> +	 * Note: we could cache the value for command and move the status read
> >> +	 * out of the lock if there was a way to get notified of user changes
> >> +	 * to command register through sysfs. Should be good for shared irqs.
> >> +	 */
> >> +	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
> >> +	origcmd = cmd_status_dword;
> >> +
> >> +	if (irq_status) {
> >> +		/*
> >> +		* Check interrupt status register to see whether our device triggered
> >> +		* the interrupt.
> >> +		*/
> >> +		*irq_status = (cmd_status_dword >> 16) & PCI_STATUS_INTERRUPT;
> >> +		if (*irq_status == 0)
> >> +			goto done;
> >> +	}
> >> +
> >> +	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
> >> +	if (mask)
> >> +		newcmd |= PCI_COMMAND_INTX_DISABLE;
> >> +	if (newcmd != origcmd)
> >> +		pci_write_config_word(dev, PCI_COMMAND, newcmd);
> >> +
> >> +done:
> >> +	pci_unblock_user_cfg_access(dev);
> >> +}
> >> +
> > 
> > Let's return irq_status instead of returning through a pointer?
> > Will save a branch and generally make the code a bit cleaner.
> 
> I'm open for a better API suggestion,

Maybe use separate functions for this.
pci_2_3_mask_irq
pci_2_3_unmask_irq
pci_2_3_disable_irq

Common code can go into subfunctions.

> but the current one goes like
> this: if irq_status is non-null, only mask the IRQ if the status bit
> indicates an interrupt. But we also have a user that wants to mask
> unconditionally.

Why do you ever want to do that?

> > 
> >>  static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
> >>  {
> >>  	struct kvm_assigned_dev_kernel *assigned_dev =
> >>  		(struct kvm_assigned_dev_kernel *) dev_id;
> >> +	int ret = IRQ_HANDLED;
> >>  	unsigned long flags;
> >>  
> >>  	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
> >> @@ -83,19 +169,34 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
> >>  				    guest_entries[i].vector, 1);
> >>  		}
> >>  	} else {
> >> -		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> >> -			    assigned_dev->guest_irq, 1);
> >> -
> >>  		if (assigned_dev->irq_requested_type &
> >>  				KVM_DEV_IRQ_GUEST_INTX) {
> >> -			disable_irq_nosync(irq);
> >> +			if (assigned_dev->pci_2_3) {
> >> +				unsigned int irq_status;
> >> +
> >> +				if (assigned_dev->host_irq_disabled) {
> >> +					ret = IRQ_NONE;
> >> +					goto out;
> >> +				}
> >> +
> >> +				pci_2_3_mask_irq(assigned_dev->dev, 1,
> >> +						 &irq_status);
> >> +				if (irq_status == 0) {
> >> +					ret = IRQ_NONE;
> >> +					goto out;
> >> +				}
> > 
> > 
> > This will be cleaner if pci_2_3_mask_irq returns irqreturn_t.
> > 
> >> +			} else
> >> +				disable_irq_nosync(irq);
> >>  			assigned_dev->host_irq_disabled = true;
> >>  		}
> >> +
> >> +		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> >> +			    assigned_dev->guest_irq, 1);
> >>  	}
> >>  
> >>  out:
> >>  	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
> >> -	return IRQ_HANDLED;
> >> +	return ret;
> >>  }
> >>  
> >>  /* Ack the irq line for an assigned device */
> >> @@ -117,7 +218,10 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
> >>  	 */
> >>  	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
> >>  	if (dev->host_irq_disabled) {
> >> -		enable_irq(dev->host_irq);
> >> +		if (dev->pci_2_3)
> >> +			pci_2_3_mask_irq(dev->dev, 0, NULL);
> >> +		else
> >> +			enable_irq(dev->host_irq);
> >>  		dev->host_irq_disabled = false;
> > 
> > So what happens here is that if interrupt is still pending
> > we will set level to 0, then get another interrupt from device
> > which will set it back to 1.  An obvious optimization is avoid
> > all this, check pending bit and just keep level at 1.
> 
> Isn't this an unrelated optimization, independent of this patch? But
> I'll think about it. What pending bit are you referring to?

The one in PCI_STATUS.

> > 
> >>  	}
> >>  	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
> >> @@ -166,7 +270,11 @@ static void deassign_host_irq(struct kvm *kvm,
> >>  		pci_disable_msix(assigned_dev->dev);
> >>  	} else {
> >>  		/* Deal with MSI and INTx */
> >> -		disable_irq(assigned_dev->host_irq);
> >> +		if (assigned_dev->pci_2_3) {
> >> +			pci_2_3_mask_irq(assigned_dev->dev, 1, NULL);
> >> +			synchronize_irq(assigned_dev->host_irq);
> >> +		} else
> >> +			disable_irq(assigned_dev->host_irq);
> >>  
> >>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
> >>  
> >> @@ -214,6 +322,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
> >>  
> >>  	pci_reset_function(assigned_dev->dev);
> >>  
> >> +	/*
> >> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
> >> +	 * may not expect the IRQ being masked.
> >> +	 */
> >> +	if (assigned_dev->pci_2_3)
> >> +		pci_2_3_mask_irq(assigned_dev->dev, 0, NULL);
> >> +
> >>  	pci_release_regions(assigned_dev->dev);
> >>  	pci_disable_device(assigned_dev->dev);
> >>  	pci_dev_put(assigned_dev->dev);
> >> @@ -239,15 +354,26 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
> >>  static int assigned_device_enable_host_intx(struct kvm *kvm,
> >>  					    struct kvm_assigned_dev_kernel *dev)
> >>  {
> >> +	unsigned long flags = 0;
> >> +
> >>  	dev->host_irq = dev->dev->irq;
> >> -	/* Even though this is PCI, we don't want to use shared
> >> -	 * interrupts. Sharing host devices with guest-assigned devices
> >> -	 * on the same interrupt line is not a happy situation: there
> >> -	 * are going to be long delays in accepting, acking, etc.
> >> +
> >> +	/*
> >> +	 * We can only share the IRQ line with other host devices if we are
> >> +	 * able to disable the IRQ source at device-level - independently of
> >> +	 * the guest driver. Otherwise host devices may suffer from unbounded
> >> +	 * IRQ latencies when the guest keeps the line asserted.
> >>  	 */
> >> +	dev->pci_2_3 = pci_2_3_supported(dev->dev);
> >> +	if (dev->pci_2_3)
> >> +		flags = IRQF_SHARED;
> >> +
> >>  	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
> >> -			0, "kvm_assigned_intx_device", (void *)dev))
> >> +			flags, "kvm_assigned_intx_device", (void *)dev))
> >>  		return -EIO;
> >> +
> >> +	if (dev->pci_2_3)
> >> +		pci_2_3_mask_irq(dev->dev, 0, NULL);
> >>  	return 0;
> >>  }
> >>  
> > 
> > Let's reverse the logic and try non-shared first, 2.3 is that fails?
> > This way we are backwards compatible ...
> 
> Compatible with what?

With the status quo :)

> I thought about trying non-shared IRQs first, but that would break host
> devices arriving later - including other VMs that want to pass a device
> sitting on the same IRQ line. It's better (from management POV) to have
> IRQF_SHARED available.

OTOH non-shared might be faster as we don't need to do
config writes/reads on data path ...  Add a knob for management
to control this?

> > 
> >> @@ -324,7 +450,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
> >>  {
> >>  	dev->guest_irq = irq->guest_irq;
> >>  	dev->ack_notifier.gsi = -1;
> >> -	dev->host_irq_disabled = false;
> >>  	return 0;
> >>  }
> >>  #endif
> >> @@ -336,7 +461,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
> >>  {
> >>  	dev->guest_irq = irq->guest_irq;
> >>  	dev->ack_notifier.gsi = -1;
> >> -	dev->host_irq_disabled = false;
> >>  	return 0;
> >>  }
> >>  #endif
> >> @@ -367,6 +491,7 @@ static int assign_host_irq(struct kvm *kvm,
> >>  	default:
> >>  		r = -EINVAL;
> >>  	}
> >> +	dev->host_irq_disabled = false;
> >>  
> >>  	if (!r)
> >>  		dev->irq_requested_type |= host_irq_type;
> >> -- 
> >> 1.7.1
> 
> Thanks for the comment,
> Jan
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html