Re: [PATCH 5/5] KVM: assigned dev: MSI-X mask support

"Michael S. Tsirkin" <mst@xxxxxxxxxx> · Thu, 4 Nov 2010 12:43:22 +0200

On Thu, Nov 04, 2010 at 02:15:21PM +0800, Sheng Yang wrote:
> This patch enable per-vector mask for assigned devices using MSI-X.
> 
> This patch provided two new APIs: one is for guest to specific device's MSI-X
> table address in MMIO, the other is for userspace to get information about mask
> bit.
> 
> All the mask bit operation are kept in kernel, in order to accelerate.
> Userspace shouldn't access the device MMIO directly for the information,
> instead it should uses provided API to do so.
> 
> Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> ---
>  Documentation/kvm/api.txt |   46 +++++++
>  arch/x86/kvm/x86.c        |    1 +
>  include/linux/kvm.h       |   21 +++-
>  include/linux/kvm_host.h  |    3 +
>  virt/kvm/assigned-dev.c   |  285 ++++++++++++++++++++++++++++++++++++++++++++-
>  5 files changed, 354 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
> index b336266..69b993c 100644
> --- a/Documentation/kvm/api.txt
> +++ b/Documentation/kvm/api.txt
> @@ -1085,6 +1085,52 @@ of 4 instructions that make up a hypercall.
>  If any additional field gets added to this structure later on, a bit for that
>  additional piece of information will be set in the flags bitmap.
>  
> +4.47 KVM_ASSIGN_REG_MSIX_MMIO
> +
> +Capability: KVM_CAP_DEVICE_MSIX_MASK
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_assigned_msix_mmio (in)
> +Returns: 0 on success, !0 on error
> +
> +struct kvm_assigned_msix_mmio {
> +	/* Assigned device's ID */
> +	__u32 assigned_dev_id;
> +	/* Must be 0 */
> +	__u32 flags;
> +	/* MSI-X table MMIO address */
> +	__u64 base_addr;
> +	/* Must be 0, reserved for future use */
> +	__u64 reserved;

We also want length I think.

> +};
> +
> +This ioctl would enable in-kernel MSI-X emulation, which would handle MSI-X
> +mask bit in the kernel.

What happens on repeated calls when it's already enabled?
How does one disable after it's enabled?

> +

This is very specialized for assigned devices.  I would like an
interface not tied to assigned devices explicitly
(even if the implementation at the moment only works
for assigned devices, I can patch that in later). E.g. vhost-net would
benefit from such an extension too.  Maybe tie this to a special
GSI and this GSI can be an assigned device or not?

> +4.48 KVM_ASSIGN_GET_MSIX_ENTRY
> +
> +Capability: KVM_CAP_DEVICE_MSIX_MASK
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_assigned_msix_entry (in and out)
> +Returns: 0 on success, !0 on error
> +
> +struct kvm_assigned_msix_entry {
> +	/* Assigned device's ID */
> +	__u32 assigned_dev_id;
> +	/* Ignored */
> +	__u32 gsi;
> +	/* The index of entry in the MSI-X table */
> +	__u16 entry;
> +	/* Querying flags and returning status */
> +	__u16 flags;
> +	/* Must be 0 */
> +	__u16 padding[2];
> +};
> +
> +This ioctl would allow userspace to get the status of one specific MSI-X
> +entry. Currently we support mask bit status querying.
> +
>  5. The kvm_run structure
>  
>  Application code obtains a pointer to the kvm_run structure by
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index f3f86b2..8fd5121 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1926,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_DEBUGREGS:
>  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
>  	case KVM_CAP_XSAVE:
> +	case KVM_CAP_DEVICE_MSIX_MASK:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 919ae53..eafafb1 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -540,6 +540,9 @@ struct kvm_ppc_pvinfo {
>  #endif
>  #define KVM_CAP_PPC_GET_PVINFO 57
>  #define KVM_CAP_PPC_IRQ_LEVEL 58
> +#ifdef __KVM_HAVE_MSIX
> +#define KVM_CAP_DEVICE_MSIX_MASK 59
> +#endif
>  

We seem to have these HAVE macros all over.
Avi, what's the idea? Let's drop them?

>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -671,6 +674,10 @@ struct kvm_clock_data {
>  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
>  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
>  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> +#define KVM_ASSIGN_GET_MSIX_ENTRY _IOWR(KVMIO,  0x7d, \
> +					struct kvm_assigned_msix_entry)
> +#define KVM_ASSIGN_REG_MSIX_MMIO  _IOW(KVMIO,  0x7e, \
> +					struct kvm_assigned_msix_mmio)
>  /* Available with KVM_CAP_PIT_STATE2 */
>  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
>  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> @@ -787,11 +794,23 @@ struct kvm_assigned_msix_nr {
>  };
>  
>  #define KVM_MAX_MSIX_PER_DEV		256
> +
> +#define KVM_MSIX_FLAG_MASK		(1 << 0)
> +#define KVM_MSIX_FLAG_QUERY_MASK	(1 << 15)
> +
>  struct kvm_assigned_msix_entry {
>  	__u32 assigned_dev_id;
>  	__u32 gsi;
>  	__u16 entry; /* The index of entry in the MSI-X table */
> -	__u16 padding[3];
> +	__u16 flags;
> +	__u16 padding[2];
> +};
> +
> +struct kvm_assigned_msix_mmio {
> +	__u32 assigned_dev_id;
> +	__u32 flags;
> +	__u64 base_addr;
> +	__u64 reserved;
>  };
>  
>  #endif /* __LINUX_KVM_H */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index e2ecbac..5ac4db9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -464,6 +464,9 @@ struct kvm_assigned_dev_kernel {
>  	struct pci_dev *dev;
>  	struct kvm *kvm;
>  	spinlock_t assigned_dev_lock;
> +	DECLARE_BITMAP(msix_mask_bitmap, KVM_MAX_MSIX_PER_DEV);
> +	gpa_t msix_mmio_base;
> +	struct kvm_io_device msix_mmio_dev;
>  };
>  
>  struct kvm_irq_mask_notifier {
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index 7c98928..9398c27 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -232,6 +232,14 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +#ifdef __KVM_HAVE_MSIX
> +	if (assigned_dev->msix_mmio_base) {
> +		mutex_lock(&kvm->slots_lock);
> +		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				&assigned_dev->msix_mmio_dev);
> +		mutex_unlock(&kvm->slots_lock);
> +	}
> +#endif
>  	pci_reset_function(assigned_dev->dev);
>  
>  	pci_release_regions(assigned_dev->dev);
> @@ -504,7 +512,7 @@ out:
>  static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>  				      struct kvm_assigned_pci_dev *assigned_dev)
>  {
> -	int r = 0, idx;
> +	int r = 0, idx, i;
>  	struct kvm_assigned_dev_kernel *match;
>  	struct pci_dev *dev;
>  
> @@ -563,6 +571,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
>  
>  	list_add(&match->list, &kvm->arch.assigned_dev_head);
>  
> +	/* The state after reset of MSI-X table is all masked */
> +	for (i = 0; i < KVM_MAX_MSIX_PER_DEV; i++)
> +		set_bit(i, match->msix_mask_bitmap);
> +
>  	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
>  		if (!kvm->arch.iommu_domain) {
>  			r = kvm_iommu_map_guest(kvm);
> @@ -666,6 +678,43 @@ msix_nr_out:
>  	return r;
>  }
>  
> +static void update_msix_mask(struct kvm_assigned_dev_kernel *adev,
> +			     int idx, bool new_mask_flag)
> +{
> +	int irq;
> +	bool old_mask_flag, need_flush = false;
> +
> +	spin_lock_irq(&adev->assigned_dev_lock);
> +
> +	if (!adev->dev->msix_enabled ||
> +	    !(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX))
> +		goto out;
> +
> +	old_mask_flag = test_bit(adev->guest_msix_entries[idx].entry,
> +			adev->msix_mask_bitmap);
> +	if (old_mask_flag == new_mask_flag)
> +		goto out;
> +
> +	irq = adev->host_msix_entries[idx].vector;
> +	BUG_ON(irq == 0);
> +
> +	if (new_mask_flag) {
> +		set_bit(adev->guest_msix_entries[idx].entry,
> +				adev->msix_mask_bitmap);
> +		disable_irq_nosync(irq);
> +		need_flush = true;
> +	} else {
> +		clear_bit(adev->guest_msix_entries[idx].entry,
> +				adev->msix_mask_bitmap);
> +		enable_irq(irq);
> +	}
> +out:
> +	spin_unlock_irq(&adev->assigned_dev_lock);
> +
> +	if (need_flush)
> +		flush_work(&adev->interrupt_work);
> +}
> +
>  static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
>  				       struct kvm_assigned_msix_entry *entry)
>  {
> @@ -700,6 +749,210 @@ msix_entry_out:
>  
>  	return r;
>  }
> +
> +static int kvm_vm_ioctl_get_msix_entry(struct kvm *kvm,
> +				       struct kvm_assigned_msix_entry *entry)
> +{
> +	int r = 0;
> +	struct kvm_assigned_dev_kernel *adev;
> +
> +	mutex_lock(&kvm->lock);
> +
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      entry->assigned_dev_id);
> +
> +	if (!adev) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (entry->entry >= KVM_MAX_MSIX_PER_DEV) {
> +		r = -ENOSPC;
> +		goto out;
> +	}
> +
> +	if (entry->flags & KVM_MSIX_FLAG_QUERY_MASK) {
> +		if (test_bit(entry->entry, adev->msix_mask_bitmap))
> +			entry->flags |= KVM_MSIX_FLAG_MASK;
> +		else
> +			entry->flags &= ~KVM_MSIX_FLAG_MASK;
> +	}
> +
> +out:
> +	mutex_unlock(&kvm->lock);
> +
> +	return r;
> +}
> +
> +static bool msix_mmio_in_range(struct kvm_assigned_dev_kernel *adev,
> +			      gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +
> +	BUG_ON(adev->msix_mmio_base == 0);

Below I see
	adev->msix_mmio_base = msix_mmio->base_addr;
which comes from userspace. BUG_ON is an unfriendly way to
tell user about a bug in qemu.

Anyway, I don't think we should special-case 0 gpa.
It's up to the user where to base the table.
Use a separate flag to signal that table was initialized.

> +	start = adev->msix_mmio_base;
> +	end = adev->msix_mmio_base + PCI_MSIX_ENTRY_SIZE * KVM_MAX_MSIX_PER_DEV;

This seems wrong. What if the device has a smaller msix table?
Can't we simply pass in base and length?

> +	if (addr >= start && addr + len <= end)

interesting wrap-around cases come to mind. Likely not a real
problem but maybe better use addr + len - 1 all over
to make it robust.

> +		return true;
> +
> +	return false;
> +}
> +
> +static int msix_get_enabled_idx(struct kvm_assigned_dev_kernel *adev,
> +			      gpa_t addr, int len)
> +{
> +	int i;
> +	gpa_t start, end;
> +
> +	for (i = 0; i < adev->entries_nr; i++) {
> +		start = adev->msix_mmio_base +
> +			adev->guest_msix_entries[i].entry * PCI_MSIX_ENTRY_SIZE;
> +		end = start + PCI_MSIX_ENTRY_SIZE;
> +		if (addr >= start && addr + len <= end) {
> +			return i;
> +		}
> +	}

That's a scary thing to do on each access.
Can we build an interface that does not force us
to scan the whole list each time?

> +
> +	return -EINVAL;
> +}
> +
> +static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +			  void *val)
> +{
> +	struct kvm_assigned_dev_kernel *adev =
> +			container_of(this, struct kvm_assigned_dev_kernel,
> +				     msix_mmio_dev);
> +	int idx, r = 0;
> +	u32 entry[4];
> +	struct kvm_kernel_irq_routing_entry e;
> +
> +	mutex_lock(&adev->kvm->lock);
> +	if (!msix_mmio_in_range(adev, addr, len)) {
> +		r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || len != 4)
> +		goto out;
> +
> +	idx = msix_get_enabled_idx(adev, addr, len);
> +	if (idx < 0) {
> +		idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
> +		if ((addr % PCI_MSIX_ENTRY_SIZE) ==
> +				PCI_MSIX_ENTRY_VECTOR_CTRL)
> +			*(unsigned long *)val =
> +				test_bit(idx, adev->msix_mask_bitmap) ?
> +				PCI_MSIX_ENTRY_CTRL_MASKBIT : 0;

I suspect this is wrong for big endian platforms: PCI returns
all values in little endian format.

> +		else
> +			r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +
> +	r = kvm_get_irq_routing_entry(adev->kvm,
> +			adev->guest_msix_entries[idx].vector, &e);
> +	if (r || e.type != KVM_IRQ_ROUTING_MSI) {
> +		printk(KERN_WARNING "KVM: Wrong MSI-X routing entry! "
> +			"idx %d, addr 0x%llx, len %d\n", idx, addr, len);

Can a malicious userspace trigger this intentionally?

> +		r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	entry[0] = e.msi.address_lo;
> +	entry[1] = e.msi.address_hi;
> +	entry[2] = e.msi.data;
> +	entry[3] = test_bit(adev->guest_msix_entries[idx].entry,
> +			adev->msix_mask_bitmap);

I suspect this is wrong for big endian platforms: PCI returns
all values in little endian format.

> +	memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / 4

4 is sizeof *entry?

], len);
> +
> +out:
> +	mutex_unlock(&adev->kvm->lock);
> +	return r;
> +}
> +
> +static int msix_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
> +			   const void *val)
> +{
> +	struct kvm_assigned_dev_kernel *adev =
> +			container_of(this, struct kvm_assigned_dev_kernel,
> +				     msix_mmio_dev);
> +	int idx, r = 0;
> +	unsigned long new_val = *(unsigned long *)val;

I suspect this is wrong for big endian platforms: PCI returns
all values in little endian format.

> +
> +	mutex_lock(&adev->kvm->lock);
> +	if (!msix_mmio_in_range(adev, addr, len)) {
> +		r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || len != 4)
> +		goto out;
> +
> +	idx = msix_get_enabled_idx(adev, addr, len);
> +	if (idx < 0) {
> +		idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
> +		if (((addr % PCI_MSIX_ENTRY_SIZE) ==
> +				PCI_MSIX_ENTRY_VECTOR_CTRL)) {
> +			if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT)
> +				goto out;
> +			if (new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT)
> +				set_bit(idx, adev->msix_mask_bitmap);
> +			else
> +				clear_bit(idx, adev->msix_mask_bitmap);
> +		} else
> +			/* Userspace would handle other MMIO writing */
> +			r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if (addr % PCI_MSIX_ENTRY_SIZE != PCI_MSIX_ENTRY_VECTOR_CTRL) {
> +		r = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT)
> +		goto out;
> +	update_msix_mask(adev, idx, !!(new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> +out:
> +	mutex_unlock(&adev->kvm->lock);
> +
> +	return r;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_ops = {
> +	.read     = msix_mmio_read,
> +	.write    = msix_mmio_write,
> +};
> +
> +static int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_msix_mmio *msix_mmio)
> +{
> +	int r = 0;
> +	struct kvm_assigned_dev_kernel *adev;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      msix_mmio->assigned_dev_id);
> +	if (!adev) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (msix_mmio->base_addr == 0) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	mutex_lock(&kvm->slots_lock);
> +	if (adev->msix_mmio_base == 0) {

What does this do? Handles case we are already registered?
Also ! adev->msix_mmio_base

> +		kvm_iodevice_init(&adev->msix_mmio_dev, &msix_mmio_ops);
> +		r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				&adev->msix_mmio_dev);
> +		if (r)
> +			goto out2;
> +	}
> +
> +	adev->msix_mmio_base = msix_mmio->base_addr;
> +out2:
> +	mutex_unlock(&kvm->slots_lock);
> +out:
> +	mutex_unlock(&kvm->lock);
> +
> +	return r;
> +}
>  #endif
>  
>  long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
> @@ -812,6 +1065,36 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
>  			goto out;
>  		break;
>  	}
> +	case KVM_ASSIGN_GET_MSIX_ENTRY: {
> +		struct kvm_assigned_msix_entry entry;
> +		r = -EFAULT;
> +		if (copy_from_user(&entry, argp, sizeof entry))
> +			goto out;
> +		r = kvm_vm_ioctl_get_msix_entry(kvm, &entry);
> +		if (r)
> +			goto out;
> +		r = -EFAULT;
> +		if (copy_to_user(argp, &entry, sizeof entry))
> +			goto out;
> +		r = 0;
> +		break;
> +	}
> +	case KVM_ASSIGN_REG_MSIX_MMIO: {
> +		struct kvm_assigned_msix_mmio msix_mmio;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&msix_mmio, argp, sizeof(msix_mmio)))
> +			goto out;
> +
> +		r = -EINVAL;
> +		if (msix_mmio.flags != 0 || msix_mmio.reserved != 0)
> +			goto out;
> +
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &msix_mmio);
> +		if (r)
> +			goto out;
> +		break;
> +	}
>  #endif
>  	}
>  out:
> -- 
> 1.7.0.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html