Re: [PATCH 6/7] KVM: assigned dev: MSI-X mask support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Nov 12, 2010 at 06:54:01PM +0800, Sheng Yang wrote:
> On Friday 12 November 2010 18:47:29 Michael S. Tsirkin wrote:
> > On Fri, Nov 12, 2010 at 06:13:48PM +0800, Sheng Yang wrote:
> > > On Friday 12 November 2010 17:53:13 Michael S. Tsirkin wrote:
> > > > On Thu, Nov 11, 2010 at 03:46:59PM +0800, Sheng Yang wrote:
> > > > > This patch enable per-vector mask for assigned devices using MSI-X.
> > > > > 
> > > > > This patch provided two new APIs: one is for guest to specific
> > > > > device's MSI-X table address in MMIO, the other is for userspace to
> > > > > get information about mask bit.
> > > > > 
> > > > > All the mask bit operation are kept in kernel, in order to
> > > > > accelerate. Userspace shouldn't access the device MMIO directly for
> > > > > the information, instead it should use the provided API to do so.
> > > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx>
> > > > > ---
> > > > > 
> > > > >  arch/x86/kvm/x86.c       |    1 +
> > > > >  include/linux/kvm.h      |   32 +++++
> > > > >  include/linux/kvm_host.h |    5 +
> > > > >  virt/kvm/assigned-dev.c  |  316
> > > > >  +++++++++++++++++++++++++++++++++++++++++++++- 4 files 
> changed,
> > > 
> > > 353
> > > 
> > > > >  insertions(+), 1 deletions(-)
> > > > > 
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index f3f86b2..847f1e1 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -1926,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > > 
> > > > >  	case KVM_CAP_DEBUGREGS:
> > > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > > > 
> > > > >  	case KVM_CAP_XSAVE:
> > > > > +	case KVM_CAP_MSIX_MASK:
> > > > >  		r = 1;
> > > > >  		break;
> > > > >  	
> > > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > > index 919ae53..32cd244 100644
> > > > > --- a/include/linux/kvm.h
> > > > > +++ b/include/linux/kvm.h
> > > > > @@ -540,6 +540,9 @@ struct kvm_ppc_pvinfo {
> > > > > 
> > > > >  #endif
> > > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > > > 
> > > > > +#ifdef __KVM_HAVE_MSIX
> > > > > +#define KVM_CAP_MSIX_MASK 59
> > > > > +#endif
> > > > > 
> > > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > > 
> > > > > @@ -671,6 +674,9 @@ struct kvm_clock_data {
> > > > > 
> > > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > > > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO,
> > > > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK
> > > > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > > 
> > > > > +/* Available with KVM_CAP_MSIX_MASK */
> > > > > +#define KVM_GET_MSIX_ENTRY        _IOWR(KVMIO,  0x7d, struct
> > > > > kvm_msix_entry) +#define KVM_UPDATE_MSIX_MMIO      _IOW(KVMIO,  0x7e,
> > > > > struct kvm_msix_mmio)
> > > > > 
> > > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > > > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > > > >  struct kvm_pit_state2)
> > > > > 
> > > > > @@ -794,4 +800,30 @@ struct kvm_assigned_msix_entry {
> > > > > 
> > > > >  	__u16 padding[3];
> > > > >  
> > > > >  };
> > > > > 
> > > > > +#define KVM_MSIX_TYPE_ASSIGNED_DEV	1
> > > > > +
> > > > > +#define KVM_MSIX_FLAG_MASKBIT		(1 << 0)
> > > > > +#define KVM_MSIX_FLAG_QUERY_MASKBIT	(1 << 0)
> > > > > +
> > > > > +struct kvm_msix_entry {
> > > > > +	__u32 id;
> > > > > +	__u32 type;
> > > > > +	__u32 entry; /* The index of entry in the MSI-X table */
> > > > > +	__u32 flags;
> > > > > +	__u32 query_flags;
> > > > > +	__u32 reserved[5];
> > > > > +};
> > > > > +
> > > > > +#define KVM_MSIX_MMIO_FLAG_REGISTER	(1 << 0)
> > > > > +#define KVM_MSIX_MMIO_FLAG_UNREGISTER	(1 << 1)
> > > > > +
> > > > > +struct kvm_msix_mmio {
> > > > > +	__u32 id;
> > > > > +	__u32 type;
> > > > > +	__u64 base_addr;
> > > > > +	__u32 max_entries_nr;
> > > > > +	__u32 flags;
> > > > > +	__u32 reserved[6];
> > > > > +};
> > > > > +
> > > > > 
> > > > >  #endif /* __LINUX_KVM_H */
> > > > > 
> > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > index 4b31539..9d49074 100644
> > > > > --- a/include/linux/kvm_host.h
> > > > > +++ b/include/linux/kvm_host.h
> > > > > @@ -445,6 +445,7 @@ struct kvm_guest_msix_entry {
> > > > > 
> > > > >  };
> > > > >  
> > > > >  #define KVM_ASSIGNED_ENABLED_IOMMU		(1 << 0)
> > > > > 
> > > > > +#define KVM_ASSIGNED_ENABLED_MSIX_MMIO		(1 << 1)
> > > > > 
> > > > >  struct kvm_assigned_dev_kernel {
> > > > >  
> > > > >  	struct kvm_irq_ack_notifier ack_notifier;
> > > > >  	struct work_struct interrupt_work;
> > > > > 
> > > > > @@ -465,6 +466,10 @@ struct kvm_assigned_dev_kernel {
> > > > > 
> > > > >  	struct pci_dev *dev;
> > > > >  	struct kvm *kvm;
> > > > >  	spinlock_t assigned_dev_lock;
> > > > > 
> > > > > +	DECLARE_BITMAP(msix_mask_bitmap, KVM_MAX_MSIX_PER_DEV);
> > > > > +	gpa_t msix_mmio_base;
> > > > > +	struct kvm_io_device msix_mmio_dev;
> > > > > +	int msix_max_entries_nr;
> > > > > 
> > > > >  };
> > > > >  
> > > > >  struct kvm_irq_mask_notifier {
> > > > > 
> > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > > index 5c6b96d..3010d7d 100644
> > > > > --- a/virt/kvm/assigned-dev.c
> > > > > +++ b/virt/kvm/assigned-dev.c
> > > > > @@ -226,12 +226,27 @@ static void kvm_free_assigned_irq(struct kvm
> > > > > *kvm,
> > > > > 
> > > > >  	kvm_deassign_irq(kvm, assigned_dev,
> > > > >  	assigned_dev->irq_requested_type);
> > > > >  
> > > > >  }
> > > > > 
> > > > > +static void unregister_msix_mmio(struct kvm *kvm,
> > > > > +				 struct kvm_assigned_dev_kernel *adev)
> > > > > +{
> > > > > +	if (adev->flags & KVM_ASSIGNED_ENABLED_MSIX_MMIO) {
> > > > > +		mutex_lock(&kvm->slots_lock);
> > > > > +		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > > +				&adev->msix_mmio_dev);
> > > > > +		mutex_unlock(&kvm->slots_lock);
> > > > > +		adev->flags &= ~KVM_ASSIGNED_ENABLED_MSIX_MMIO;
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > 
> > > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > > >  
> > > > >  				     struct kvm_assigned_dev_kernel
> > > > >  				     *assigned_dev)
> > > > >  
> > > > >  {
> > > > >  
> > > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > > 
> > > > > +#ifdef __KVM_HAVE_MSIX
> > > > > +	unregister_msix_mmio(kvm, assigned_dev);
> > > > > +#endif
> > > > > 
> > > > >  	pci_reset_function(assigned_dev->dev);
> > > > >  	
> > > > >  	pci_release_regions(assigned_dev->dev);
> > > > > 
> > > > > @@ -504,7 +519,7 @@ out:
> > > > >  static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
> > > > >  
> > > > >  				      struct kvm_assigned_pci_dev *assigned_dev)
> > > > >  
> > > > >  {
> > > > > 
> > > > > -	int r = 0, idx;
> > > > > +	int r = 0, idx, i;
> > > > > 
> > > > >  	struct kvm_assigned_dev_kernel *match;
> > > > >  	struct pci_dev *dev;
> > > > > 
> > > > > @@ -564,6 +579,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm
> > > > > *kvm,
> > > > > 
> > > > >  	list_add(&match->list, &kvm->arch.assigned_dev_head);
> > > > > 
> > > > > +	/* The state after reset of MSI-X table is all masked */
> > > > > +	for (i = 0; i < KVM_MAX_MSIX_PER_DEV; i++)
> > > > > +		set_bit(i, match->msix_mask_bitmap);
> > > > > +
> > > > > 
> > > > >  	if (assigned_dev->flags & KVM_ASSIGNED_ENABLED_IOMMU) {
> > > > >  	
> > > > >  		if (!kvm->arch.iommu_domain) {
> > > > >  		
> > > > >  			r = kvm_iommu_map_guest(kvm);
> > > > > 
> > > > > @@ -667,6 +686,43 @@ msix_nr_out:
> > > > >  	return r;
> > > > >  
> > > > >  }
> > > > > 
> > > > > +static void update_msix_mask(struct kvm_assigned_dev_kernel *adev,
> > > > > +			     int idx, bool new_mask_flag)
> > > > > +{
> > > > > +	int irq;
> > > > > +	bool old_mask_flag, need_flush = false;
> > > > > +
> > > > > +	spin_lock_irq(&adev->assigned_dev_lock);
> > > > > +
> > > > > +	if (!adev->dev->msix_enabled ||
> > > > > +	    !(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX))
> > > > > +		goto out;
> > > > > +
> > > > > +	old_mask_flag = test_bit(adev->guest_msix_entries[idx].entry,
> > > > > +			adev->msix_mask_bitmap);
> > > > > +	if (old_mask_flag == new_mask_flag)
> > > > > +		goto out;
> > > > > +
> > > > > +	irq = adev->host_msix_entries[idx].vector;
> > > > > +	BUG_ON(irq == 0);
> > > > > +
> > > > > +	if (new_mask_flag) {
> > > > > +		set_bit(adev->guest_msix_entries[idx].entry,
> > > > > +				adev->msix_mask_bitmap);
> > > > > +		disable_irq_nosync(irq);
> > > > > +		need_flush = true;
> > > > > +	} else {
> > > > > +		clear_bit(adev->guest_msix_entries[idx].entry,
> > > > > +				adev->msix_mask_bitmap);
> > > > > +		enable_irq(irq);
> > > > > +	}
> > > > > +out:
> > > > > +	spin_unlock_irq(&adev->assigned_dev_lock);
> > > > > +
> > > > > +	if (need_flush)
> > > > > +		flush_work(&adev->interrupt_work);
> > > > > +}
> > > > > +
> > > > > 
> > > > >  static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
> > > > >  
> > > > >  				       struct kvm_assigned_msix_entry *entry)
> > > > >  
> > > > >  {
> > > > > 
> > > > > @@ -701,6 +757,233 @@ msix_entry_out:
> > > > >  	return r;
> > > > >  
> > > > >  }
> > > > > 
> > > > > +
> > > > > +static int kvm_vm_ioctl_get_msix_entry(struct kvm *kvm,
> > > > > +				       struct kvm_msix_entry *entry)
> > > > > +{
> > > > > +	int r = 0;
> > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > +
> > > > > +	if (entry->type != KVM_MSIX_TYPE_ASSIGNED_DEV)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	if (!entry->query_flags)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	mutex_lock(&kvm->lock);
> > > > > +
> > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > +				      entry->id);
> > > > > +
> > > > > +	if (!adev) {
> > > > > +		r = -EINVAL;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	if (entry->entry >= adev->msix_max_entries_nr) {
> > > > > +		r = -ENOSPC;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	if (entry->query_flags & KVM_MSIX_FLAG_QUERY_MASKBIT) {
> > > > > +		if (test_bit(entry->entry, adev->msix_mask_bitmap))
> > > > > +			entry->flags |= KVM_MSIX_FLAG_MASKBIT;
> > > > > +		else
> > > > > +			entry->flags &= ~KVM_MSIX_FLAG_MASKBIT;
> > > > > +	}
> > > > > +
> > > > > +out:
> > > > > +	mutex_unlock(&kvm->lock);
> > > > > +
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +static bool msix_mmio_in_range(struct kvm_assigned_dev_kernel *adev,
> > > > > +			      gpa_t addr, int len)
> > > > > +{
> > > > > +	gpa_t start, end;
> > > > > +
> > > > > +	BUG_ON(adev->msix_mmio_base == 0);
> > > > 
> > > > I thought we wanted to use a separate flag for this?
> > > 
> > > O, flag added, but forgot to update this one.
> > > 
> > > > > +	start = adev->msix_mmio_base;
> > > > > +	end = adev->msix_mmio_base + PCI_MSIX_ENTRY_SIZE *
> > > > > +		adev->msix_max_entries_nr;
> > > > > +	if (addr >= start && addr + len <= end)
> > > > > +		return true;
> > > > > +
> > > > > +	return false;
> > > > > +}
> > > > > +
> > > > > +static int msix_get_enabled_idx(struct kvm_assigned_dev_kernel
> > > > > *adev, +			      gpa_t addr, int len)
> > > > > +{
> > > > > +	int i, index = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
> > > > > +
> > > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > > +		if (adev->guest_msix_entries[i].entry == index)
> > > > > +			return i;
> > > > > +
> > > > > +	return -EINVAL;
> > > > > +}
> > > > > +
> > > > 
> > > > Hmm, we still have a linear scan on each write.  Is the point to detect
> > > > which entries need to be handled in kernel? Maybe another bitmap for
> > > > this?  Or just handle whole entry write in kernel as well, then we
> > > > won't need this at all.
> > > 
> > > Still don't like handling the whole entry writing. In fact here we have
> > > two questions:
> > > 1. If the entry already enabled
> > > 2. If it is, then what's it sequence number.
> > > 
> > > Bitmap can solve question 1, but we still need O(n) scan for the second.
> > > So I think leave it like this should be fine.
> > 
> > The spec quoted above implies we must handle all entry writes:
> > a single write can update mask and data.
> 
> Anyway it's the work to be done by QEmu, if we want to cover the situation that 
> single write can update mask and data - that's surely QWORD, which hasn't been 
> supported yet. 

Must pass the transactions to qemu then :)

> We can back to them if there is someone really did it in that way. But for all 
> hypervisors using QEmu, I think we haven't seen such kind of behavior yet.

I would rather stick to the spec than go figure out what do BSD/Sun/Mac do,
or will do.

> So we 
> can leave it later.
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > > > +static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > > > > int len, +			  void *val)
> > > > > +{
> > > > > +	struct kvm_assigned_dev_kernel *adev =
> > > > > +			container_of(this, struct kvm_assigned_dev_kernel,
> > > > > +				     msix_mmio_dev);
> > > > > +	int idx, r = 0;
> > > > > +	u32 entry[4];
> > > > > +	struct kvm_kernel_irq_routing_entry e;
> > > > > +
> > > > > +	mutex_lock(&adev->kvm->lock);
> > > > > +	if (!msix_mmio_in_range(adev, addr, len)) {
> > > > > +		r = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || len != 4)
> > > > > +		goto out;
> > > > > +
> > > > > +	idx = msix_get_enabled_idx(adev, addr, len);
> > > > > +	if (idx < 0) {
> > > > > +		idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE;
> > > > > +		if ((addr % PCI_MSIX_ENTRY_SIZE) ==
> > > > > +				PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > +			*(unsigned long *)val =
> > > > > +				test_bit(idx, adev->msix_mask_bitmap) ?
> > > > > +				PCI_MSIX_ENTRY_CTRL_MASKBIT : 0;
> > > > > +		else
> > > > > +			r = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +
> > > > > +	r = kvm_get_irq_routing_entry(adev->kvm,
> > > > > +			adev->guest_msix_entries[idx].vector, &e);
> > > > > +	if (r || e.type != KVM_IRQ_ROUTING_MSI) {
> > > > > +		r = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	entry[0] = e.msi.address_lo;
> > > > > +	entry[1] = e.msi.address_hi;
> > > > > +	entry[2] = e.msi.data;
> > > > > +	entry[3] = test_bit(adev->guest_msix_entries[idx].entry,
> > > > > +			adev->msix_mask_bitmap);
> > > > > +	memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / sizeof *entry],
> > > > > len);
> > > > 
> > > > It seems weird to pass writes to userspace but do reads
> > > > in kernel.
> > > > Either we should support entry read and write or neither.
> > > 
> > > Entry read is for speed up, because kernel would write to the mask bit
> > > then try to read from it for flushing.

What I see linux doing is reading the 1st entry, always.
So this means we must handle all table reads in kernel,
we can't pass any to userspace. Anyway, only new kernels do this,
presumably they touch msix table rarely so it should not matter
for performance?

> Don't think reading matter much
> > > here, as long as userspace still own routing table.

Why insist on using the routing table then?  Since we handle all of the
entry for reads, let's do it for writes too?
The only justification for the split was that supposedly registers were
always programmed separately. But now we see that spec does not mandate
this.

> > > > > +
> > > > > +out:
> > > > > +	mutex_unlock(&adev->kvm->lock);
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +static int msix_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > > > int len, +			   const void *val)
> > > > > +{
> > > > > 
> > > > > +	struct kvm_assigned_dev_kernel *adev =
> > > > > +			container_of(this, struct kvm_assigned_dev_kernel,
> > > > > +				     msix_mmio_dev);
> > > > > +	int idx, r = 0;
> > > > > +	unsigned long new_val = *(unsigned long *)val;
> > > > 
> > > > Move this to after you did length and alignment checks,
> > > > it might not be safe here.
> > > > 
> > > > > +
> > > > > +	mutex_lock(&adev->kvm->lock);
> > > > > +	if (!msix_mmio_in_range(adev, addr, len)) {
> > > > > +		r = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr & 0x3) || len != 4)
> > > > 
> > > > The lenght could be 8:
> > > > 
> > > > For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > > > aligned full DWORD or aligned full QWORD transactions; otherwise, the
> > > > result is undefined.
> > > > 
> > > > and
> > > > 
> > > > Software is permitted to fill in MSI-X Table entry DWORD fields
> > > > individually with DWORD writes, or software in certain cases is
> > > > permitted to fill in appropriate pairs of DWORDs with a single QWORD
> > > > write. Specifically, software is always permitted to fill in the
> > > > Message Address and Message Upper Address fields with a single QWORD
> > > > write. If a given entry is currently masked (via its Mask bit or the
> > > > Function Mask bit), software is permitted to fill in the Message Data
> > > > and Vector Control fields with a single QWORD write, taking advantage
> > > > of the fact the Message Data field is guaranteed to become visible to
> > > > hardware no later than the Vector Control field. However, if software
> > > > wishes to mask a currently unmasked entry (without setting the
> > > > Function Mask bit), software must set the entry's Mask bit using a
> > > > DWORD write to the Vector Control field, since performing a QWORD
> > > > write to the Message Data and Vector Control fields might result in
> > > > the Message Data field being modified before the Mask bit in the
> > > > Vector Control field becomes set.
> > > 
> > > Haven't seen any device use QWORD accessing. Also QEmu can't handle QWORD
> > > MMIO as well.

That's a userspace bug.  Not sure whether this justifies copyng the bug
in kernel.

> > >  So I guess we can leave it later.

Maybe what we can do is handle the mask in kernel +
pass to userspace to handle the address/vector update?

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux