Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel

Sheng Yang <sheng@xxxxxxxxxxxxxxx> · Tue, 1 Mar 2011 14:10:37 +0800

On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote:
> On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> > 
> > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> 
> A general question: we implement mmio read and write
> operation here, but seem to do nothing
> about ordering. In particular, pci mmio reads are normally
> assumed to flush all writes out to the device
> and all device writes in to the CPU.
> 
> How exactly does it work here?

I don't understand your problem... We emulate all operation here, where is 
"ordering" issue?

And Michael, thanks for you detail comments, but could you give comments all at 
once? Now I've tried my best to address comments, but still feeling endless 
comments are coming. And I would leave Intel this Friday, I want to get it done 
before I leave.
> 
> > ---
> > 
> >  arch/x86/include/asm/kvm_host.h |    1 +
> >  arch/x86/kvm/Makefile           |    2 +-
> >  arch/x86/kvm/mmu.c              |    2 +
> >  arch/x86/kvm/x86.c              |   40 ++++-
> >  include/linux/kvm.h             |   28 ++++
> >  include/linux/kvm_host.h        |   36 +++++
> >  virt/kvm/assigned-dev.c         |   44 ++++++
> >  virt/kvm/kvm_main.c             |   38 +++++-
> >  virt/kvm/msix_mmio.c            |  301
> >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h           
> >  |   25 ++++
> >  10 files changed, 504 insertions(+), 13 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h
> > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -635,6 +635,7 @@ enum emulation_result {
> > 
> >  	EMULATE_DONE,       /* no further processing */
> >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > 
> > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > 
> >  };
> >  
> >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index f15501f..3a0d851 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > 
> >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> >  
> >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > 
> > -				assigned-dev.o)
> > +				assigned-dev.o msix_mmio.o)
> > 
> >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> >  async_pf.o)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 9cafbb4..912dca4 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > cr2, u32 error_code,
> > 
> >  	case EMULATE_DO_MMIO:
> >  		++vcpu->stat.mmio_exits;
> >  		/* fall through */
> > 
> > +	case EMULATE_USERSPACE_EXIT:
> > +		/* fall through */
> > 
> >  	case EMULATE_FAIL:
> >  		return 0;
> >  	
> >  	default:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 21b84e2..87308eb 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 
> >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> >  	case KVM_CAP_XSAVE:
> > 
> >  	case KVM_CAP_ASYNC_PF:
> > +	case KVM_CAP_MSIX_MMIO:
> >  		r = 1;
> >  		break;
> >  	
> >  	case KVM_CAP_COALESCED_MMIO:
> > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > long addr,
> > 
> >  {
> >  
> >  	gpa_t                 gpa;
> >  	struct kvm_io_ext_data ext_data;
> > 
> > +	int r;
> > 
> >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > 
> > @@ -3824,18 +3826,32 @@ static int
> > emulator_write_emulated_onepage(unsigned long addr,
> > 
> >  mmio:
> >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > 
> > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > 
> >  	/*
> >  	
> >  	 * Is this MMIO handled locally?
> >  	 */
> > 
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > +	if (!r)
> > 
> >  		return X86EMUL_CONTINUE;
> > 
> > -	vcpu->mmio_needed = 1;
> > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > +	if (r == -ENOTSYNC) {
> 
> Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over
> please?

How about let Avi/Marcelo decide?
> 
> > +		vcpu->userspace_exit_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > +		vcpu->run->msix_routing.dev_id =
> > +			ext_data.msix_routing.dev_id;
> > +		vcpu->run->msix_routing.type =
> > +			ext_data.msix_routing.type;
> > +		vcpu->run->msix_routing.entry_idx =
> > +			ext_data.msix_routing.entry_idx;
> > +		vcpu->run->msix_routing.flags =
> > +			ext_data.msix_routing.flags;
> > +	} else  {
> > +		vcpu->mmio_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > +	}
> > 
> >  	return X86EMUL_CONTINUE;
> >  
> >  }
> > 
> > @@ -4469,6 +4485,8 @@ done:
> >  		r = EMULATE_DO_MMIO;
> >  	
> >  	} else if (r == EMULATION_RESTART)
> >  	
> >  		goto restart;
> > 
> > +	else if (vcpu->userspace_exit_needed)
> > +		r = EMULATE_USERSPACE_EXIT;
> > 
> >  	else
> >  	
> >  		r = EMULATE_DONE;
> > 
> > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > *vcpu, struct kvm_run *kvm_run)
> > 
> >  		}
> >  	
> >  	}
> > 
> > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > +			vcpu->userspace_exit_needed) {
> > 
> >  		if (vcpu->mmio_needed) {
> >  		
> >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> >  			vcpu->mmio_read_completed = 1;
> >  			vcpu->mmio_needed = 0;
> >  		
> >  		}
> > 
> > +		if (vcpu->userspace_exit_needed) {
> > +			vcpu->userspace_exit_needed = 0;
> > +			r = 0;
> > +			goto out;
> > +		}
> > 
> >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > 
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index ea2dc1a..4393e4e 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > 
> >  #define KVM_EXIT_NMI              16
> >  #define KVM_EXIT_INTERNAL_ERROR   17
> >  #define KVM_EXIT_OSI              18
> > 
> > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > 
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > 
> > @@ -264,6 +265,13 @@ struct kvm_run {
> > 
> >  		struct {
> >  		
> >  			__u64 gprs[32];
> >  		
> >  		} osi;
> > 
> > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > +		struct {
> > +			__u32 dev_id;
> > +			__u16 type;
> > +			__u16 entry_idx;
> > +			__u64 flags;
> > +		} msix_routing;
> > 
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	
> >  	};
> > 
> > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > 
> >  #define KVM_CAP_PPC_GET_PVINFO 57
> >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> >  #define KVM_CAP_ASYNC_PF 59
> > 
> > +#define KVM_CAP_MSIX_MMIO 60
> > 
> >  #ifdef KVM_CAP_IRQ_ROUTING
> > 
> > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > 
> >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > 
> > +/* Available with KVM_CAP_MSIX_MMIO */
> > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct
> > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
> > 0x7e, struct kvm_msix_mmio_user)
> > 
> >  /* Available with KVM_CAP_PIT_STATE2 */
> >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> >  struct kvm_pit_state2)
> > 
> > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > 
> >  	__u16 padding[3];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;
> > +	__u16 max_entries_nr;
> > +	__u64 base_addr;
> > +	__u64 base_va;
> > +	__u64 flags;
> > +	__u64 reserved[4];
> > +};
> > +
> > 
> >  #endif /* __LINUX_KVM_H */
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index a32c53e..d6c05e3 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -68,8 +68,17 @@ enum kvm_bus {
> > 
> >  	KVM_NR_BUSES
> >  
> >  };
> > 
> > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > 
> >  struct kvm_io_ext_data {
> >  
> >  	int type;   /* Extended data type */
> 
> type seems unused for now. Do we need it?
> I'd guess exit type should usually be sufficient.

I want to get this structure self-explained. 
> 
> > +	union {
> > +		struct {
> > +			u32 dev_id;	/* Device ID */
> > +			u16 type;	/* Device type */
> > +			u16 entry_idx;  /* Accessed MSI-X entry index */
> > +			u64 flags;
> > +		} msix_routing;
> 
> So this is only for when we exit on msix routing?

Yes for now.
> 
> > +	};
> > 
> >  };
> >  
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > 
> > @@ -165,6 +174,8 @@ struct kvm_vcpu {
> > 
> >  	} async_pf;
> >  
> >  #endif
> > 
> > +	int userspace_exit_needed;
> > +
> > 
> >  	struct kvm_vcpu_arch arch;
> >  
> >  };
> > 
> > @@ -238,6 +249,27 @@ struct kvm_memslots {
> > 
> >  					KVM_PRIVATE_MEM_SLOTS];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_MAX    32
> > +
> > +struct kvm_msix_mmio {
> > +	u32 dev_id;
> > +	u16 type;
> > +	u16 max_entries_nr;
> > +	u64 flags;
> > +	gpa_t table_base_addr;
> > +	hva_t table_base_va;
> 
> void __user* would make for less casting in code.

OK.
> 
> Or even
> 
> 	struct kvm_msix_entry {
> 		__le32 addr_lo;
> 		__le32 addr_hi;
> 		__le32 data;
> 		__le32 ctrl;
> 	};
> 
> 	struct kvm_msix_entry __user *table;
> 
> and then we can do table[entry].ctrl
> and so on.

It's a userspace address, I think use it in this way maybe misleading. Personally 
I want to keep the current way.
> 
> > +	gpa_t pba_base_addr;
> > +	hva_t pba_base_va;
> > +};
> > +
> > +struct kvm_msix_mmio_dev {
> > +	struct kvm *kvm;
> > +	struct kvm_io_device table_dev;
> > +	int mmio_nr;
> > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > +	struct mutex lock;
> > +};
> > +
> > 
> >  struct kvm {
> >  
> >  	spinlock_t mmu_lock;
> >  	raw_spinlock_t requests_lock;
> > 
> > @@ -286,6 +318,7 @@ struct kvm {
> > 
> >  	long mmu_notifier_count;
> >  
> >  #endif
> >  
> >  	long tlbs_dirty;
> > 
> > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > 
> >  };
> >  
> >  /* The guest did something we don't support. */
> > 
> > @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > 
> >  int kvm_request_irq_source_id(struct kvm *kvm);
> >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +			int assigned_dev_id, int entry, bool mask);
> > +
> > 
> >  /* For vcpu->arch.iommu_flags */
> >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..d1598a6 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> > 
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> >  list_head *head,
> >  
> >  						      int assigned_dev_id)
> > 
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > 
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  
> >  }
> > 
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > 
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  
> >  {
> >  
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > 
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> > 
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> > 
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  
> >  }
> > 
> > +/* The caller should hold kvm->lock */
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> 
> This is wrong, EFAULT should only be returned on
> bad/misaligned address.
> 
> Can we check this during setup instead?
> And then this check can go away or become
> BUG_ON, and the function be void.

I think BUG_ON is OK.
> 
> > +
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> > +out:
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index a61f90e..f211e49 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> > 
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > 
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > 
> >  	struct mm_struct *mm = kvm->mm;
> >  	
> >  	kvm_arch_sync_events(kvm);
> > 
> > +	kvm_unregister_msix_mmio_dev(kvm);
> > 
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > 
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > 
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  
> >  #endif
> > 
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > 
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >  		return r;
> >  	
> >  	}
> >  
> >  #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> 
> Need to fix error handling below as well?
> Better do it with chained gotos if yes.

Let's make it another separate patch. 
> 
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  	
> >  		kvm_put_kvm(kvm);
> > 
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > *bus)
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  
> >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> >  
> >  {
> > 
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> > 
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > 
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> > 
> >  			return 0;
> > 
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> > 
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..b2a5f86
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,301 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@xxxxxxxxx>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio, +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> 
> Does something else even happen?
> If not - BUG_ON.

Not for now, the next would be your vfio? Would use BUG_ON for now.
> 
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > 
> > +	return -EFAULT;
> 
> EINVAL or something

OK
> 
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > int len, +				void *val)
> > +{
> > +	/*TODO: Add big endian support */
> 
> likely can be removed.
> 
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > +		goto out;
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> 
> cast above is not needed.
> 
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val,
> > +				struct kvm_io_ext_data *ext_data)
> > +{
> > +	/*TODO: Add big endian support */
> 
> comment above can go I think.
> 
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	void __user *entry_base;
> > +	__le32 __user *ctrl_pos;
> > +	__le32 old_ctrl, new_ctrl;
> > +
> > +	mutex_lock(&mmio_dev->kvm->lock);
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > +		goto out;
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE);
> > +	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* Don't allow writing to other fields when entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> 
> This should be le32_to_cpu(old_ctrl)
> I think sparse shall warn about this.

Sadly it didn't. 
> 
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> 
> A cast above is no longer needed
> 
> > +		goto out;
> > +
> > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > +	ext_data->msix_routing.type = mmio->type;
> > +	ext_data->msix_routing.entry_idx = entry;
> > +	ext_data->msix_routing.flags = 0;
> 
> Is this strictly necessary?
> Can we fill the data in vcpu->run->msix_routing
> directly instead?
> Also, on good path we don't need to fill this structure
> as there's no exit. Let's only do it then?

Where could you get the "entry_idx"?
> 
> > +
> > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > +		ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if (old_ctrl == new_ctrl) {
> > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > +			ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> 
> So if update_msix_mask_bit returns EFAULT above this will
> trigger msix exit? Why?

This means kernel didn't handle the mask bit, so we need let userspace handle it - 
for enable the vectors, or others.

> 
> > +	if (r)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	mutex_unlock(&mmio_dev->kvm->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> 
> what does the loop above do? Pls add a comment.

Find the existed MMIO, I think code is clear here.
> 
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> 
> -ENOSPC here as well?

OK
> 
> > +		goto out;
> > +	}
> > +	/* All reserved currently */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +#ifndef CONFIG_64BIT
> > +	if (mmio_user->base_va >= 0xffffffff ||
> > +	    mmio_user->base_addr >= 0xffffffff) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +#endif
> 
> A way to check this without ifdef (long is as pointer in length):
> 	if ((unsigned long)mmio_user->base_va !=
> 		mmio_user->base_va)
> 
> and for base_va it should be EFAULT I think as with any illegal
> virtual address.

OK.
> 
> > +
> > +	/* Check alignment and accessibility */
> > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> 
> One thing to watch for here is wrap around. AFAIK
> access_ok does not check for it, you must do it yourself.

I am curious that if so, every access_ok should go with a wrap around checking?

> 
> > +		r = -EINVAL;
> 
> EFAULT is usually better for an access error.

OK.
> 
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +		mmio_dev->mmio_nr++;
> > +	}
> > +
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> 
> Pls add a comment documenting what this does.

You mean code is too confusing here? I don't think so.
> 
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = -EINVAL, i, j;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			r = 0;
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> 
> So you pass in kvm_msix_mmio but only 2 fields are
> valid? This is very confusing. Please just pass the
> two values as function params to kvm_free_msix_mmio.

OK.
> 
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > new file mode 100644
> > index 0000000..01b6587
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.h
> > @@ -0,0 +1,25 @@
> > +#ifndef __KVM_MSIX_MMIO_H__
> > +#define __KVM_MSIX_MMIO_H__
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@xxxxxxxxx>
> > + */
> > +
> > +#include <linux/pci.h>
> 
> I don't see where you use this include.
> OTOH you do need to declare struct kvm_msix_mmio_user and
> struct kvm here.

Yes.

--
regards
Yang, Sheng

> 
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio_user); +
> > +#endif
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html