Re: [PATCH] KVM: Allow host IRQ sharing for assigned PCI 2.3 devices

On Mon, Jan 09, 2012 at 03:03:00PM +0100, Jan Kiszka wrote:
> PCI 2.3 allows IRQ sources to be generically disabled at the device
> level. This enables us to share the legacy IRQs of such devices with
> other host devices when passing them to a guest.
> 
> The new IRQ sharing feature introduced here is optional; user space has
> to request it explicitly. Moreover, user space can inform us about its
> view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the
> interrupt and signaling it if the guest masked it via the virtualized
> PCI config space.
> 
> Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx>
> ---
> 
> This applies to kvm/master after merging
> 
>   PCI: Rework config space blocking services
>   PCI: Introduce INTx check & mask API
> 
> from current linux-next/master. I suppose those two will make it into
> 3.3.
> 
> To recall the history of it: I tried hard to implement an adaptive
> solution that automatically picks the fastest masking technique whenever
> possible. However, the changes required to the IRQ core subsystem and
> the logic of the device assignment code became so complex and partly
> ugly that I gave up on this. It's simply not worth the pain given that
> legacy PCI interrupts are rarely raised by performance-critical devices
> at rates high enough (kHz...) for the difference to be measurable.
> 
>  Documentation/virtual/kvm/api.txt |   27 +++++
>  arch/x86/kvm/x86.c                |    1 +
>  include/linux/kvm.h               |    6 +
>  include/linux/kvm_host.h          |    2 +
>  virt/kvm/assigned-dev.c           |  208 +++++++++++++++++++++++++++++++-----
>  5 files changed, 215 insertions(+), 29 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index e1d94bf..670015a 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -1159,6 +1159,14 @@ following flags are specified:
>  
>  /* Depends on KVM_CAP_IOMMU */
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> +/* The following two depend on KVM_CAP_PCI_2_3 */
> +#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
> +
> +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
> +via the PCI-2.3-compliant device-level mask, thus enabling IRQ sharing with
> +other assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
> +guest's view of the INTx mask; see KVM_ASSIGN_SET_INTX_MASK for details.
>  
>  The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
>  isolation of the device.  Usages not specifying this flag are deprecated.
> @@ -1399,6 +1407,25 @@ The following flags are defined:
>  If datamatch flag is set, the event will be signaled only if the written value
>  to the registered address is equal to datamatch in struct kvm_ioeventfd.
>  
> +4.59 KVM_ASSIGN_SET_INTX_MASK
> +
> +Capability: KVM_CAP_PCI_2_3
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_assigned_pci_dev (in)
> +Returns: 0 on success, -1 on error
> +
> +Informs the kernel about the guest's view of the INTx mask.

A wild idea: since this is the guest's view of its IRQ,
can this be specified per guest IRQ+id then?
That might be useful to support MSI-X mask bit emulation.

> As long as the
> +guest masks the legacy INTx, the kernel will refrain from unmasking it at
> +hardware level and will not assert the guest's IRQ line. User space is still
> +responsible for applying this state to the assigned device's real config space.

Can this be made more explicit? You mean writing the INTx disable
bit in the PCI command register, right?

> +To avoid that the kernel overwrites the state user space wants to set,
> +KVM_ASSIGN_SET_INTX_MASK has to be called prior to updating the config space.

This looks like a strange requirement; could you explain how
it helps avoid races? It also raises the question of what should
be done when writing a bit unrelated to masking.

> +
> +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
> +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
> +evaluated.
> +
>  4.62 KVM_CREATE_SPAPR_TCE
>  
>  Capability: KVM_CAP_SPAPR_TCE
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1171def..9381806 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2057,6 +2057,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
>  	case KVM_CAP_GET_TSC_KHZ:
> +	case KVM_CAP_PCI_2_3:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 68e67e5..da5f7b7 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -556,6 +556,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_RMA	65
>  #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
>  #define KVM_CAP_PPC_PAPR 68
> +#define KVM_CAP_PCI_2_3 69
>  #define KVM_CAP_S390_GMAP 71
>  #define KVM_CAP_TSC_DEADLINE_TIMER 72
>  
> @@ -697,6 +698,9 @@ struct kvm_clock_data {
>  /* Available with KVM_CAP_TSC_CONTROL */
>  #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
>  #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
> +/* Available with KVM_CAP_PCI_2_3 */
> +#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
> +				       struct kvm_assigned_pci_dev)
>  
>  /*
>   * ioctls for vcpu fds
> @@ -765,6 +769,8 @@ struct kvm_clock_data {
>  #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>  
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
> +#define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
> +#define KVM_DEV_ASSIGN_MASK_INTX	(1 << 2)
>  
>  struct kvm_assigned_pci_dev {
>  	__u32 assigned_dev_id;
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 900c763..07461bd 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel {
>  	unsigned int entries_nr;
>  	int host_irq;
>  	bool host_irq_disabled;
> +	bool pci_2_3;
>  	struct msix_entry *host_msix_entries;
>  	int guest_irq;
>  	struct msix_entry *guest_msix_entries;
> @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel {
>  	struct pci_dev *dev;
>  	struct kvm *kvm;
>  	spinlock_t intx_lock;
> +	struct mutex intx_mask_lock;

What exactly does this lock protect?
I see it used sometimes with intx_lock and sometimes without.

>  	char irq_name[32];
>  	struct pci_saved_state *pci_saved_state;
>  };
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index 758e3b3..b35aba9 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -57,22 +57,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
>  	return index;
>  }
>  
> -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
> +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
>  {
>  	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +	int ret;
>  
> -	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
> -		spin_lock(&assigned_dev->intx_lock);
> +	spin_lock(&assigned_dev->intx_lock);
> +	if (pci_check_and_mask_intx(assigned_dev->dev)) {
> +		assigned_dev->host_irq_disabled = true;
> +		ret = IRQ_WAKE_THREAD;
> +	} else
> +		ret = IRQ_NONE;
> +	spin_unlock(&assigned_dev->intx_lock);
> +
> +	return ret;
> +}
> +
> +static void
> +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
> +				 int vector)
> +{
> +	if (unlikely(assigned_dev->irq_requested_type &
> +		     KVM_DEV_IRQ_GUEST_INTX)) {
> +		mutex_lock(&assigned_dev->intx_mask_lock);
> +		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
> +			kvm_set_irq(assigned_dev->kvm,
> +				    assigned_dev->irq_source_id, vector, 1);
> +		mutex_unlock(&assigned_dev->intx_mask_lock);
> +	} else
> +		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> +			    vector, 1);
> +}
> +
> +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
> +{
> +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> +	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
> +		spin_lock_irq(&assigned_dev->intx_lock);
>  		disable_irq_nosync(irq);
>  		assigned_dev->host_irq_disabled = true;
> -		spin_unlock(&assigned_dev->intx_lock);
> +		spin_unlock_irq(&assigned_dev->intx_lock);
>  	}
>  
> -	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> -		    assigned_dev->guest_irq, 1);
> +	kvm_assigned_dev_raise_guest_irq(assigned_dev,
> +					 assigned_dev->guest_irq);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +#ifdef __KVM_HAVE_MSI
> +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
> +{
> +	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +
> +	kvm_assigned_dev_raise_guest_irq(assigned_dev,
> +					 assigned_dev->guest_irq);
>  
>  	return IRQ_HANDLED;
>  }
> +#endif
>  
>  #ifdef __KVM_HAVE_MSIX
>  static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
> @@ -83,8 +127,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
>  
>  	if (index >= 0) {
>  		vector = assigned_dev->guest_msix_entries[index].vector;
> -		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
> -			    vector, 1);
> +		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
>  	}
>  
>  	return IRQ_HANDLED;
> @@ -100,15 +143,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>  
>  	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
>  
> -	/* The guest irq may be shared so this ack may be
> -	 * from another device.
> -	 */
> -	spin_lock(&dev->intx_lock);
> -	if (dev->host_irq_disabled) {
> -		enable_irq(dev->host_irq);
> -		dev->host_irq_disabled = false;
> +	mutex_lock(&dev->intx_mask_lock);
> +
> +	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
> +		bool reassert = false;
> +
> +		spin_lock_irq(&dev->intx_lock);
> +		/*
> +		 * The guest IRQ may be shared so this ack can come from an
> +		 * IRQ for another guest device.
> +		 */
> +		if (dev->host_irq_disabled) {
> +			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
> +				enable_irq(dev->host_irq);
> +			else if (!pci_check_and_unmask_intx(dev->dev))
> +				reassert = true;
> +			dev->host_irq_disabled = reassert;
> +		}
> +		spin_unlock_irq(&dev->intx_lock);
> +
> +		if (reassert)
> +			kvm_set_irq(dev->kvm, dev->irq_source_id,
> +				    dev->guest_irq, 1);
>  	}
> -	spin_unlock(&dev->intx_lock);
> +
> +	mutex_unlock(&dev->intx_mask_lock);
>  }
>  
>  static void deassign_guest_irq(struct kvm *kvm,
> @@ -156,7 +215,13 @@ static void deassign_host_irq(struct kvm *kvm,
>  		pci_disable_msix(assigned_dev->dev);
>  	} else {
>  		/* Deal with MSI and INTx */
> -		disable_irq(assigned_dev->host_irq);
> +		if (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> +			spin_lock_irq(&assigned_dev->intx_lock);

Interesting that the new mutex is not used here.

> +			pci_intx(assigned_dev->dev, false);

Interesting.
This will leave the device with interrupts masked.
Do we reset it later?
Maybe we want a comment explaining why it's done.



> +			spin_unlock_irq(&assigned_dev->intx_lock);
> +			synchronize_irq(assigned_dev->host_irq);
> +		} else
> +			disable_irq(assigned_dev->host_irq);
>  
>  		free_irq(assigned_dev->host_irq, assigned_dev);
>  
> @@ -237,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>  static int assigned_device_enable_host_intx(struct kvm *kvm,
>  					    struct kvm_assigned_dev_kernel *dev)
>  {
> +	irq_handler_t irq_handler;
> +	unsigned long flags;
> +
>  	dev->host_irq = dev->dev->irq;
> -	/* Even though this is PCI, we don't want to use shared
> -	 * interrupts. Sharing host devices with guest-assigned devices
> -	 * on the same interrupt line is not a happy situation: there
> -	 * are going to be long delays in accepting, acking, etc.
> +
> +	/*
> +	 * We can only share the IRQ line with other host devices if we are
> +	 * able to disable the IRQ source at device-level - independently of
> +	 * the guest driver. Otherwise host devices may suffer from unbounded
> +	 * IRQ latencies when the guest keeps the line asserted.
>  	 */
> -	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> -				 IRQF_ONESHOT, dev->irq_name, dev))
> +	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> +		irq_handler = kvm_assigned_dev_intx;
> +		flags = IRQF_SHARED;
> +	} else {
> +		irq_handler = NULL;
> +		flags = IRQF_ONESHOT;
> +	}
> +	if (request_threaded_irq(dev->host_irq, irq_handler,
> +				 kvm_assigned_dev_thread_intx, flags,
> +				 dev->irq_name, dev))
>  		return -EIO;
> +
> +	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
> +		spin_lock_irq(&dev->intx_lock);
> +		pci_intx(dev->dev, true);
> +		spin_unlock_irq(&dev->intx_lock);
> +	}
>  	return 0;
>  }
>  
> @@ -262,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
>  	}
>  
>  	dev->host_irq = dev->dev->irq;
> -	if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> -				 0, dev->irq_name, dev)) {
> +	if (request_threaded_irq(dev->host_irq, NULL,
> +				 kvm_assigned_dev_thread_msi, 0,
> +				 dev->irq_name, dev)) {
>  		pci_disable_msi(dev->dev);
>  		return -EIO;
>  	}
> @@ -321,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
>  {
>  	dev->guest_irq = irq->guest_irq;
>  	dev->ack_notifier.gsi = -1;
> -	dev->host_irq_disabled = false;
>  	return 0;
>  }
>  #endif
> @@ -333,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
>  {
>  	dev->guest_irq = irq->guest_irq;
>  	dev->ack_notifier.gsi = -1;
> -	dev->host_irq_disabled = false;
>  	return 0;
>  }
>  #endif
> @@ -367,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm,
>  	default:
>  		r = -EINVAL;
>  	}
> +	dev->host_irq_disabled = false;
>  
>  	if (!r)
>  		dev->irq_requested_type |= host_irq_type;
> @@ -468,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
>  {
>  	int r = -ENODEV;
>  	struct kvm_assigned_dev_kernel *match;
> +	unsigned long irq_type;
>  
>  	mutex_lock(&kvm->lock);
>  
> @@ -476,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
>  	if (!match)
>  		goto out;
>  
> -	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
> +	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
> +					  KVM_DEV_IRQ_GUEST_MASK);
> +	r = kvm_deassign_irq(kvm, match, irq_type);
>  out:
>  	mutex_unlock(&kvm->lock);
>  	return r;
> @@ -609,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>  	if (!match->pci_saved_state)
>  		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
>  		       __func__, dev_name(&dev->dev));
> +
> +	if (!pci_intx_mask_supported(dev))
> +		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
> +
>  	match->assigned_dev_id = assigned_dev->assigned_dev_id;
>  	match->host_segnr = assigned_dev->segnr;
>  	match->host_busnr = assigned_dev->busnr;
> @@ -616,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>  	match->flags = assigned_dev->flags;
>  	match->dev = dev;
>  	spin_lock_init(&match->intx_lock);
> +	mutex_init(&match->intx_mask_lock);
>  	match->irq_source_id = -1;
>  	match->kvm = kvm;
>  	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
> @@ -761,6 +853,56 @@ msix_entry_out:
>  }
>  #endif
>  
> +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
> +		struct kvm_assigned_pci_dev *assigned_dev)
> +{
> +	int r = 0;
> +	struct kvm_assigned_dev_kernel *match;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev->assigned_dev_id);
> +	if (!match) {
> +		r = -ENODEV;
> +		goto out;
> +	}
> +
> +	mutex_lock(&match->intx_mask_lock);
> +
> +	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
> +	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
> +
> +	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
> +		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
> +			kvm_set_irq(match->kvm, match->irq_source_id,
> +				    match->guest_irq, 0);
> +			/*
> +			 * Masking at hardware-level is performed on demand,
> +			 * i.e. when an IRQ actually arrives at the host.
> +			 */
> +		} else {
> +			/*
> +			 * Unmask the IRQ line. It may have been masked
> +			 * meanwhile if we aren't using PCI 2.3 INTx masking
> +			 * on the host side.
> +			 */
> +			spin_lock_irq(&match->intx_lock);
> +			if (match->host_irq_disabled) {
> +				enable_irq(match->host_irq);
> +				match->host_irq_disabled = false;
> +			}
> +			spin_unlock_irq(&match->intx_lock);
> +		}
> +	}
> +
> +	mutex_unlock(&match->intx_mask_lock);
> +
> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> +
>  long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
>  				  unsigned long arg)
>  {
> @@ -868,6 +1010,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
>  		break;
>  	}
>  #endif
> +	case KVM_ASSIGN_SET_INTX_MASK: {
> +		struct kvm_assigned_pci_dev assigned_dev;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
> +			goto out;
> +		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
> +		break;
> +	}
>  	default:
>  		r = -ENOTTY;
>  		break;
> @@ -875,4 +1026,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
>  out:
>  	return r;
>  }
> -
> -- 
> 1.7.3.4