On Wednesday 17 November 2010 03:45:22 Marcelo Tosatti wrote: > On Mon, Nov 15, 2010 at 05:15:32PM +0800, Sheng Yang wrote: > > This patch enable per-vector mask for assigned devices using MSI-X. > > > > This patch provided two new APIs: one is for guest to specific device's > > MSI-X table address in MMIO, the other is for userspace to get > > information about mask bit. > > > > All the mask bit operation are kept in kernel, in order to accelerate. > > Userspace shouldn't access the device MMIO directly for the information, > > instead it should uses provided API to do so. > > > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx> > > --- > > > > arch/x86/kvm/x86.c | 1 + > > include/linux/kvm.h | 32 +++++ > > include/linux/kvm_host.h | 5 + > > virt/kvm/assigned-dev.c | 318 > > +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 355 > > insertions(+), 1 deletions(-) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index fc29223..37602e2 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > case KVM_CAP_XSAVE: > > > > case KVM_CAP_ASYNC_PF: > > + case KVM_CAP_MSIX_MASK: > > r = 1; > > break; > > > > case KVM_CAP_COALESCED_MMIO: > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > index ea2dc1a..b3e5ffe 100644 > > --- a/include/linux/kvm.h > > +++ b/include/linux/kvm.h > > @@ -541,6 +541,9 @@ struct kvm_ppc_pvinfo { > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > #define KVM_CAP_ASYNC_PF 59 > > > > +#ifdef __KVM_HAVE_MSIX > > +#define KVM_CAP_MSIX_MASK 60 > > +#endif > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > @@ -672,6 +675,9 @@ struct kvm_clock_data { > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > 
> > +/* Available with KVM_CAP_MSIX_MASK */ > > +#define KVM_GET_MSIX_ENTRY _IOWR(KVMIO, 0x7d, struct > > kvm_msix_entry) +#define KVM_UPDATE_MSIX_MMIO _IOW(KVMIO, 0x7e, > > struct kvm_msix_mmio) > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > struct kvm_pit_state2) > > > > @@ -795,4 +801,30 @@ struct kvm_assigned_msix_entry { > > > > __u16 padding[3]; > > > > }; > > > > +#define KVM_MSIX_TYPE_ASSIGNED_DEV 1 > > + > > +#define KVM_MSIX_FLAG_MASKBIT (1 << 0) > > +#define KVM_MSIX_FLAG_QUERY_MASKBIT (1 << 0) > > + > > +struct kvm_msix_entry { > > + __u32 id; > > + __u32 type; > > Is type really necessary? Will it ever differ from > KVM_MSIX_TYPE_ASSIGNED_DEV? This was a suggestion from Michael. He wants the interface to be reusable by emulated/PV devices, so I added the type field here. > > > + __u32 entry; /* The index of entry in the MSI-X table */ > > + __u32 flags; > > + __u32 query_flags; > > + __u32 reserved[5]; > > +}; > > + > > +#define KVM_MSIX_MMIO_FLAG_REGISTER (1 << 0) > > +#define KVM_MSIX_MMIO_FLAG_UNREGISTER (1 << 1) > > + > > +struct kvm_msix_mmio { > > + __u32 id; > > + __u32 type; > > + __u64 base_addr; > > + __u32 max_entries_nr; > > + __u32 flags; > > + __u32 reserved[6]; > > +}; > > + > > > > #endif /* __LINUX_KVM_H */ > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index f09db87..57a437a 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -501,6 +501,7 @@ struct kvm_guest_msix_entry { > > > > }; > > > > #define KVM_ASSIGNED_ENABLED_IOMMU (1 << 0) > > > > +#define KVM_ASSIGNED_ENABLED_MSIX_MMIO (1 << 1) > > > > struct kvm_assigned_dev_kernel { > > > > struct kvm_irq_ack_notifier ack_notifier; > > struct work_struct interrupt_work; > > > > @@ -521,6 +522,10 @@ struct kvm_assigned_dev_kernel { > > > > struct pci_dev *dev; > > struct kvm *kvm; > > spinlock_t assigned_dev_lock; > > > > + 
DECLARE_BITMAP(msix_mask_bitmap, KVM_MAX_MSIX_PER_DEV); > > + gpa_t msix_mmio_base; > > + struct kvm_io_device msix_mmio_dev; > > + int msix_max_entries_nr; > > > > }; > > > > struct kvm_irq_mask_notifier { > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > index 5c6b96d..76a1f12 100644 > > --- a/virt/kvm/assigned-dev.c > > +++ b/virt/kvm/assigned-dev.c > > @@ -226,12 +226,27 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > } > > > > +static void unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_assigned_dev_kernel *adev) > > +{ > > + if (adev->flags & KVM_ASSIGNED_ENABLED_MSIX_MMIO) { > > + mutex_lock(&kvm->slots_lock); > > + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > + &adev->msix_mmio_dev); > > + mutex_unlock(&kvm->slots_lock); > > + adev->flags &= ~KVM_ASSIGNED_ENABLED_MSIX_MMIO; > > + } > > +} > > + > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > struct kvm_assigned_dev_kernel > > *assigned_dev) > > > > { > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > +#ifdef __KVM_HAVE_MSIX > > + unregister_msix_mmio(kvm, assigned_dev); > > +#endif > > > > pci_reset_function(assigned_dev->dev); > > > > pci_release_regions(assigned_dev->dev); > > > > @@ -504,7 +519,7 @@ out: > > static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > > > > struct kvm_assigned_pci_dev *assigned_dev) > > > > { > > > > - int r = 0, idx; > > + int r = 0, idx, i; > > > > struct kvm_assigned_dev_kernel *match; > > struct pci_dev *dev; > > > > @@ -564,6 +579,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm > > *kvm, > > > > list_add(&match->list, &kvm->arch.assigned_dev_head); > > > > + /* The state after reset of MSI-X table is all masked */ > > + for (i = 0; i < KVM_MAX_MSIX_PER_DEV; i++) > > + set_bit(i, match->msix_mask_bitmap); > > + > > > > if (assigned_dev->flags & KVM_ASSIGNED_ENABLED_IOMMU) { > > > > if 
(!kvm->arch.iommu_domain) { > > > > r = kvm_iommu_map_guest(kvm); > > > > @@ -667,6 +686,43 @@ msix_nr_out: > > return r; > > > > } > > > > +static void update_msix_mask(struct kvm_assigned_dev_kernel *adev, > > + int idx, bool new_mask_flag) > > +{ > > + int irq; > > + bool old_mask_flag, need_flush = false; > > + > > + spin_lock_irq(&adev->assigned_dev_lock); > > + > > + if (!adev->dev->msix_enabled || > > + !(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX)) > > + goto out; > > + > > + old_mask_flag = test_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + if (old_mask_flag == new_mask_flag) > > + goto out; > > + > > + irq = adev->host_msix_entries[idx].vector; > > + BUG_ON(irq == 0); > > + > > + if (new_mask_flag) { > > + set_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + disable_irq_nosync(irq); > > + need_flush = true; > > + } else { > > + clear_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + enable_irq(irq); > > + } > > +out: > > + spin_unlock_irq(&adev->assigned_dev_lock); > > + > > + if (need_flush) > > + flush_work(&adev->interrupt_work); > > +} > > + > > > > static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, > > > > struct kvm_assigned_msix_entry *entry) > > > > { > > > > @@ -701,6 +757,235 @@ msix_entry_out: > > return r; > > > > } > > > > + > > +static int kvm_vm_ioctl_get_msix_entry(struct kvm *kvm, > > + struct kvm_msix_entry *entry) > > +{ > > + int r = 0; > > + struct kvm_assigned_dev_kernel *adev; > > + > > + if (entry->type != KVM_MSIX_TYPE_ASSIGNED_DEV) > > + return -EINVAL; > > + > > + if (!entry->query_flags) > > + return -EINVAL; > > + > > + mutex_lock(&kvm->lock); > > + > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > + entry->id); > > + > > + if (!adev) { > > + r = -EINVAL; > > + goto out; > > + } > > + > > + if (entry->entry >= adev->msix_max_entries_nr) { > > + r = -ENOSPC; > > + goto out; > > + } > > + > > + if 
(entry->query_flags & KVM_MSIX_FLAG_QUERY_MASKBIT) { > > + if (test_bit(entry->entry, adev->msix_mask_bitmap)) > > + entry->flags |= KVM_MSIX_FLAG_MASKBIT; > > + else > > + entry->flags &= ~KVM_MSIX_FLAG_MASKBIT; > > + } > > + > > +out: > > + mutex_unlock(&kvm->lock); > > + > > + return r; > > +} > > + > > +static bool msix_mmio_in_range(struct kvm_assigned_dev_kernel *adev, > > + gpa_t addr, int len) > > +{ > > + gpa_t start, end; > > + > > + BUG_ON(!(adev->flags & KVM_ASSIGNED_ENABLED_MSIX_MMIO)); > > + start = adev->msix_mmio_base; > > + end = adev->msix_mmio_base + PCI_MSIX_ENTRY_SIZE * > > + adev->msix_max_entries_nr; > > + if (addr >= start && addr + len <= end) > > + return true; > > + > > + return false; > > +} > > + > > +static int msix_get_enabled_idx(struct kvm_assigned_dev_kernel *adev, > > + gpa_t addr, int len) > > +{ > > + int i, index = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > > + > > + for (i = 0; i < adev->entries_nr; i++) > > + if (adev->guest_msix_entries[i].entry == index) > > + return i; > > + > > + return -EINVAL; > > +} > > + > > +static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr, int > > len, + void *val) > > +{ > > + struct kvm_assigned_dev_kernel *adev = > > + container_of(this, struct kvm_assigned_dev_kernel, > > + msix_mmio_dev); > > + int idx, r = 0; > > + u32 entry[4]; > > + struct kvm_kernel_irq_routing_entry e; > > + > > + /* TODO: Get big-endian machine work */ > > + mutex_lock(&adev->kvm->lock); > > + if (!msix_mmio_in_range(adev, addr, len)) { > > + r = -EOPNOTSUPP; > > + goto out; > > + } > > The unregister path does: > > mutex_lock(kvm->lock) > kvm_io_bus_unregister_dev() > synchronize_srcu() > > If an instance of msix_mmio_read/msix_mmio_write is waiting on > kvm->lock, synchronize_srcu will never complete. 
> > You should use a separate lock for the in range check (and keep in mind > that reads/writes can trigger after kvm_io_bus_register_dev, so all > state accessible in the r/w handlers should be complete by that time). Good point! I will update it. > > > + if ((addr & 0x3) || len != 4) > > + goto out; > > + > > + idx = msix_get_enabled_idx(adev, addr, len); > > + if (idx < 0) { > > + idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > > + if ((addr % PCI_MSIX_ENTRY_SIZE) == > > + PCI_MSIX_ENTRY_VECTOR_CTRL) > > + *(unsigned long *)val = > > + test_bit(idx, adev->msix_mask_bitmap) ? > > + PCI_MSIX_ENTRY_CTRL_MASKBIT : 0; > > + else > > + r = -EOPNOTSUPP; > > + goto out; > > + } > > + > > + r = kvm_get_irq_routing_entry(adev->kvm, > > + adev->guest_msix_entries[idx].vector, &e); > > + if (r || e.type != KVM_IRQ_ROUTING_MSI) { > > + r = -EOPNOTSUPP; > > + goto out; > > + } > > + entry[0] = e.msi.address_lo; > > + entry[1] = e.msi.address_hi; > > + entry[2] = e.msi.data; > > + entry[3] = test_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / sizeof *entry], len); > > Division by zero? I don't quite understand. Do you mean sizeof *entry or PCI_MSIX_ENTRY_SIZE? Both of them should be positive integers, I think... Maybe I should use sizeof(u32) here? -- regards Yang, Sheng -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html