Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
> Then we can support mask bit operations for assigned devices.
> 
> Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> ---
>  arch/x86/kvm/Makefile    |    2 +-
>  arch/x86/kvm/x86.c       |    8 +-
>  include/linux/kvm.h      |   21 ++++
>  include/linux/kvm_host.h |   25 ++++
>  virt/kvm/assigned-dev.c  |   44 +++++++
>  virt/kvm/kvm_main.c      |   38 ++++++-
>  virt/kvm/msix_mmio.c     |  284 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h     |   25 ++++
>  8 files changed, 440 insertions(+), 7 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..7444dcd 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}

Should check whether the host irq is registered as MSI-X type.
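
Something along these lines would do, I think (just a sketch; it assumes
the existing KVM_DEV_IRQ_HOST_MSIX flag and the irq_requested_type field
are the right things to test), placed before the entry loop so we bail
out early for a device whose host irq is not MSI-X:

	if (!(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX))
		goto out;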

> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b1b6cbb..b7807c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +
>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..dc6e8ca
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,284 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@xxxxxxxxx>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);
> +	return -EFAULT;
> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xf;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	u32 *ctrl_pos;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;
> +
> +	offset = addr & 0xF;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* Do not allow writing to other fields when the entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;

Instead of copying to/from userspace (which is subject to swapin and
unexpected values), you could include the guest-written value in the
kvm_run structure, along with the address. Qemu-kvm would use that to
synchronize its copy of the table on a KVM_EXIT_MSIX_ROUTING_UPDATE exit.
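
For example, something like this could be added to the kvm_run union (a
rough sketch of the idea, not an existing ABI; the field names are made
up):

	/* hypothetical member of the kvm_run union */
	struct {
		__u64 addr;	/* guest physical address of the access */
		__u64 data;	/* value written by the guest */
		__u32 len;	/* access size in bytes */
		__u32 pad;
	} msix_routing;

The in-kernel handler would fill this in and exit to userspace with
exit_reason set to KVM_EXIT_MSIX_ROUTING_UPDATE; qemu-kvm would then
apply the write to its own copy of the table before resuming the guest.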

> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
> +	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
> +		ret = -ENOTSYNC;
> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);

Then you rely on the kernel copy of the values to enable/disable the host irq.

> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* All reserved currently */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (!access_ok(VERIFY_WRITE, mmio_user->base_va,
> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +	}
> +
> +	mmio_dev->mmio_nr++;
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = 0, i, j;
> +	bool found = 0;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);

It's valid for mmio_nr to be equal to KVM_MSIX_MMIO_MAX.
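
If a sanity check is wanted here at all, presumably it should be
something like:

	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);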

> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			found = true;
> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	if (!found)
> +		r = -EINVAL;
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}

This is not consistent with registration, where there are no checks on
the validity of the assigned device id. So why is it necessary here?

There is a lock ordering problem BTW: 

- read/write handlers: dev->lock -> kvm->lock
- vm_ioctl_deassign_device -> kvm_free_msix_mmio: kvm->lock -> dev->lock
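
That is, the two paths can deadlock against each other, roughly:

	CPU0 (MMIO write handler)           CPU1 (deassign ioctl)
	mutex_lock(&mmio_dev->lock)         mutex_lock(&kvm->lock)
	  update_msix_mask_bit()              kvm_free_assigned_device()
	    mutex_lock(&kvm->lock) <- blocks    kvm_free_msix_mmio()
	                                          mutex_lock(&mmio_dev->lock) <- blocks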

> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +
> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}
