Re: [PATCH 3/3] KVM: Allow host IRQ sharing for passed-through PCI 2.3 devices

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 01.11.2010 16:24, Michael S. Tsirkin wrote:
> On Mon, Nov 01, 2010 at 03:08:37PM +0100, Jan Kiszka wrote:
>> From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
>>
>> PCI 2.3 allows to generically disable IRQ sources at device level. This
>> enables us to share IRQs of such devices between on the host side when
>> passing them to a guest.
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
>> ---
>>  include/linux/kvm_host.h |    1 +
>>  virt/kvm/assigned-dev.c  |  153 +++++++++++++++++++++++++++++++++++++++++----
>>  2 files changed, 140 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index df5497f..fcdc849 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -473,6 +473,7 @@ struct kvm_assigned_dev_kernel {
>>  	unsigned int entries_nr;
>>  	int host_irq;
>>  	bool host_irq_disabled;
>> +	bool pci_2_3;
>>  	struct msix_entry *host_msix_entries;
>>  	int guest_irq;
>>  	struct kvm_guest_msix_entry *guest_msix_entries;
>> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
>> index d3ddfea..411643c 100644
>> --- a/virt/kvm/assigned-dev.c
>> +++ b/virt/kvm/assigned-dev.c
>> @@ -55,10 +55,96 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>>  	return index;
>>  }
>>  
>> +/*
>> + * Verify that the device supports Interrupt Disable bit in command register,
>> + * per PCI 2.3, by flipping this bit and reading it back: this bit was readonly
>> + * in PCI 2.2.
>> + */
>> +static bool pci_2_3_supported(struct pci_dev *pdev)
>> +{
>> +	u16 orig, new;
>> +	bool supported = false;
>> +
>> +	pci_block_user_cfg_access(pdev);
>> +	pci_read_config_word(pdev, PCI_COMMAND, &orig);
>> +	pci_write_config_word(pdev, PCI_COMMAND,
>> +			      orig ^ PCI_COMMAND_INTX_DISABLE);
>> +	pci_read_config_word(pdev, PCI_COMMAND, &new);
>> +	
>> +	/*
>> +	 * There's no way to protect against
>> +	 * hardware bugs or detect them reliably, but as long as we know
>> +	 * what the value should be, let's go ahead and check it.
>> +	 */
>> +	if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
>> +		dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
>> +			"driver or HW bug?\n", orig, new);
>> +		goto out;
>> +	}
>> +	if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
>> +		dev_warn(&pdev->dev, "Device does not support "
>> +			 "disabling interrupts: unable to bind.\n");
>> +		goto out;
>> +	}
>> +	supported = true;
>> +
>> +	/* Now restore the original value. */
>> +	pci_write_config_word(pdev, PCI_COMMAND, orig);
>> +
>> +out:
>> +	pci_unblock_user_cfg_access(pdev);
>> +	return supported;
>> +}
>> +
>> +static void
>> +pci_2_3_mask_irq(struct pci_dev *dev, int mask, unsigned int *irq_status)
>> +{
>> +	u32 cmd_status_dword;
>> +	u16 origcmd, newcmd;
>> +
>> +	/*
>> +	 * We do a single dword read to retrieve both command and status.
>> +	 * Document assumptions that make this possible.
>> +	 */
>> +	BUILD_BUG_ON(PCI_COMMAND % 4);
>> +	BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
>> +
>> +	pci_block_user_cfg_access(dev);
>> +
>> +	/*
>> +	 * Read both command and status registers in a single 32-bit operation.
>> +	 * Note: we could cache the value for command and move the status read
>> +	 * out of the lock if there was a way to get notified of user changes
>> +	 * to command register through sysfs. Should be good for shared irqs.
>> +	 */
>> +	pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
>> +	origcmd = cmd_status_dword;
>> +
>> +	if (irq_status) {
>> +		/*
>> +		* Check interrupt status register to see whether our device triggered
>> +		* the interrupt.
>> +		*/
>> +		*irq_status = (cmd_status_dword >> 16) & PCI_STATUS_INTERRUPT;
>> +		if (*irq_status == 0)
>> +			goto done;
>> +	}
>> +
>> +	newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
>> +	if (mask)
>> +		newcmd |= PCI_COMMAND_INTX_DISABLE;
>> +	if (newcmd != origcmd)
>> +		pci_write_config_word(dev, PCI_COMMAND, newcmd);
>> +
>> +done:
>> +	pci_unblock_user_cfg_access(dev);
>> +}
>> +
> 
> Let's return irq_status instead of returning through a pointer?
> Will save a branch and generally make the code a bit cleaner.

I'm open for a better API suggestion, but the current one goes like
this: if irq_status is non-null, only mask the IRQ if the status bit
indicates an interrupt. But we also have a user that wants to mask
unconditionally.

> 
>>  static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
>>  {
>>  	struct kvm_assigned_dev_kernel *assigned_dev =
>>  		(struct kvm_assigned_dev_kernel *) dev_id;
>> +	int ret = IRQ_HANDLED;
>>  	unsigned long flags;
>>  
>>  	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
>> @@ -83,19 +169,34 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
>>  				    guest_entries[i].vector, 1);
>>  		}
>>  	} else {
>> -		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> -			    assigned_dev->guest_irq, 1);
>> -
>>  		if (assigned_dev->irq_requested_type &
>>  				KVM_DEV_IRQ_GUEST_INTX) {
>> -			disable_irq_nosync(irq);
>> +			if (assigned_dev->pci_2_3) {
>> +				unsigned int irq_status;
>> +
>> +				if (assigned_dev->host_irq_disabled) {
>> +					ret = IRQ_NONE;
>> +					goto out;
>> +				}
>> +
>> +				pci_2_3_mask_irq(assigned_dev->dev, 1,
>> +						 &irq_status);
>> +				if (irq_status == 0) {
>> +					ret = IRQ_NONE;
>> +					goto out;
>> +				}
> 
> 
> This will be cleaner if pci_2_3_mask_irq returns irqreturn_t.
> 
>> +			} else
>> +				disable_irq_nosync(irq);
>>  			assigned_dev->host_irq_disabled = true;
>>  		}
>> +
>> +		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
>> +			    assigned_dev->guest_irq, 1);
>>  	}
>>  
>>  out:
>>  	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
>> -	return IRQ_HANDLED;
>> +	return ret;
>>  }
>>  
>>  /* Ack the irq line for an assigned device */
>> @@ -117,7 +218,10 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>>  	 */
>>  	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
>>  	if (dev->host_irq_disabled) {
>> -		enable_irq(dev->host_irq);
>> +		if (dev->pci_2_3)
>> +			pci_2_3_mask_irq(dev->dev, 0, NULL);
>> +		else
>> +			enable_irq(dev->host_irq);
>>  		dev->host_irq_disabled = false;
> 
> So what happens here is that if interrupt is still pending
> we will set level to 0, then get another interrupt from device
> which will set it back to 1.  An obvious optimization is avoid
> all this, check pending bit and just keep level at 1.

Isn't this an unrelated optimization, independent of this patch? But
I'll think about it. What pending bit are you referring to?

> 
>>  	}
>>  	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
>> @@ -166,7 +270,11 @@ static void deassign_host_irq(struct kvm *kvm,
>>  		pci_disable_msix(assigned_dev->dev);
>>  	} else {
>>  		/* Deal with MSI and INTx */
>> -		disable_irq(assigned_dev->host_irq);
>> +		if (assigned_dev->pci_2_3) {
>> +			pci_2_3_mask_irq(assigned_dev->dev, 1, NULL);
>> +			synchronize_irq(assigned_dev->host_irq);
>> +		} else
>> +			disable_irq(assigned_dev->host_irq);
>>  
>>  		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
>>  
>> @@ -214,6 +322,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>>  
>>  	pci_reset_function(assigned_dev->dev);
>>  
>> +	/*
>> +	 * Unmask the IRQ at PCI level once the reset is done - the next user
>> +	 * may not expect the IRQ being masked.
>> +	 */
>> +	if (assigned_dev->pci_2_3)
>> +		pci_2_3_mask_irq(assigned_dev->dev, 0, NULL);
>> +
>>  	pci_release_regions(assigned_dev->dev);
>>  	pci_disable_device(assigned_dev->dev);
>>  	pci_dev_put(assigned_dev->dev);
>> @@ -239,15 +354,26 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>>  static int assigned_device_enable_host_intx(struct kvm *kvm,
>>  					    struct kvm_assigned_dev_kernel *dev)
>>  {
>> +	unsigned long flags = 0;
>> +
>>  	dev->host_irq = dev->dev->irq;
>> -	/* Even though this is PCI, we don't want to use shared
>> -	 * interrupts. Sharing host devices with guest-assigned devices
>> -	 * on the same interrupt line is not a happy situation: there
>> -	 * are going to be long delays in accepting, acking, etc.
>> +
>> +	/*
>> +	 * We can only share the IRQ line with other host devices if we are
>> +	 * able to disable the IRQ source at device-level - independently of
>> +	 * the guest driver. Otherwise host devices may suffer from unbounded
>> +	 * IRQ latencies when the guest keeps the line asserted.
>>  	 */
>> +	dev->pci_2_3 = pci_2_3_supported(dev->dev);
>> +	if (dev->pci_2_3)
>> +		flags = IRQF_SHARED;
>> +
>>  	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
>> -			0, "kvm_assigned_intx_device", (void *)dev))
>> +			flags, "kvm_assigned_intx_device", (void *)dev))
>>  		return -EIO;
>> +
>> +	if (dev->pci_2_3)
>> +		pci_2_3_mask_irq(dev->dev, 0, NULL);
>>  	return 0;
>>  }
>>  
> 
> Let's reverse the logic and try non-shared first, 2.3 is that fails?
> This way we are backwards compatible ...

Compatible with what?

I thought about trying non-shared IRQs first, but that would break host
devices arriving later - including other VMs that want to pass a device
sitting on the same IRQ line. It's better (from management POV) to have
IRQF_SHARED available.

> 
>> @@ -324,7 +450,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
>>  {
>>  	dev->guest_irq = irq->guest_irq;
>>  	dev->ack_notifier.gsi = -1;
>> -	dev->host_irq_disabled = false;
>>  	return 0;
>>  }
>>  #endif
>> @@ -336,7 +461,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
>>  {
>>  	dev->guest_irq = irq->guest_irq;
>>  	dev->ack_notifier.gsi = -1;
>> -	dev->host_irq_disabled = false;
>>  	return 0;
>>  }
>>  #endif
>> @@ -367,6 +491,7 @@ static int assign_host_irq(struct kvm *kvm,
>>  	default:
>>  		r = -EINVAL;
>>  	}
>> +	dev->host_irq_disabled = false;
>>  
>>  	if (!r)
>>  		dev->irq_requested_type |= host_irq_type;
>> -- 
>> 1.7.1

Thanks for the comment,
Jan

Attachment: signature.asc
Description: OpenPGP digital signature


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux