On Thu, Nov 04, 2010 at 02:15:21PM +0800, Sheng Yang wrote: > This patch enable per-vector mask for assigned devices using MSI-X. > > This patch provided two new APIs: one is for guest to specific device's MSI-X > table address in MMIO, the other is for userspace to get information about mask > bit. > > All the mask bit operation are kept in kernel, in order to accelerate. > Userspace shouldn't access the device MMIO directly for the information, > instead it should uses provided API to do so. > > Signed-off-by: Sheng Yang <sheng@xxxxxxxxxxxxxxx> > --- > Documentation/kvm/api.txt | 46 +++++++ > arch/x86/kvm/x86.c | 1 + > include/linux/kvm.h | 21 +++- > include/linux/kvm_host.h | 3 + > virt/kvm/assigned-dev.c | 285 ++++++++++++++++++++++++++++++++++++++++++++- > 5 files changed, 354 insertions(+), 2 deletions(-) > > diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt > index b336266..69b993c 100644 > --- a/Documentation/kvm/api.txt > +++ b/Documentation/kvm/api.txt > @@ -1085,6 +1085,52 @@ of 4 instructions that make up a hypercall. > If any additional field gets added to this structure later on, a bit for that > additional piece of information will be set in the flags bitmap. > > +4.47 KVM_ASSIGN_REG_MSIX_MMIO > + > +Capability: KVM_CAP_DEVICE_MSIX_MASK > +Architectures: x86 > +Type: vm ioctl > +Parameters: struct kvm_assigned_msix_mmio (in) > +Returns: 0 on success, !0 on error > + > +struct kvm_assigned_msix_mmio { > + /* Assigned device's ID */ > + __u32 assigned_dev_id; > + /* Must be 0 */ > + __u32 flags; > + /* MSI-X table MMIO address */ > + __u64 base_addr; > + /* Must be 0, reserved for future use */ > + __u64 reserved; We also want length I think. > +}; > + > +This ioctl would enable in-kernel MSI-X emulation, which would handle MSI-X > +mask bit in the kernel. What happens on repeated calls when it's already enabled? How does one disable after it's enabled? > + This is very specialized for assigned devices. 
I would like an interface not tied to assigned devices explicitly (even if the implementation at the moment only works for assigned devices, I can patch that in later). E.g. vhost-net would benefit from such an extension too. Maybe tie this to a special GSI and this GSI can be an assigned device or not? > +4.48 KVM_ASSIGN_GET_MSIX_ENTRY > + > +Capability: KVM_CAP_DEVICE_MSIX_MASK > +Architectures: x86 > +Type: vm ioctl > +Parameters: struct kvm_assigned_msix_entry (in and out) > +Returns: 0 on success, !0 on error > + > +struct kvm_assigned_msix_entry { > + /* Assigned device's ID */ > + __u32 assigned_dev_id; > + /* Ignored */ > + __u32 gsi; > + /* The index of entry in the MSI-X table */ > + __u16 entry; > + /* Querying flags and returning status */ > + __u16 flags; > + /* Must be 0 */ > + __u16 padding[2]; > +}; > + > +This ioctl would allow userspace to get the status of one specific MSI-X > +entry. Currently we support mask bit status querying. > + > 5. The kvm_run structure > > Application code obtains a pointer to the kvm_run structure by > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index f3f86b2..8fd5121 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1926,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_DEBUGREGS: > case KVM_CAP_X86_ROBUST_SINGLESTEP: > case KVM_CAP_XSAVE: > + case KVM_CAP_DEVICE_MSIX_MASK: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index 919ae53..eafafb1 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -540,6 +540,9 @@ struct kvm_ppc_pvinfo { > #endif > #define KVM_CAP_PPC_GET_PVINFO 57 > #define KVM_CAP_PPC_IRQ_LEVEL 58 > +#ifdef __KVM_HAVE_MSIX > +#define KVM_CAP_DEVICE_MSIX_MASK 59 > +#endif > We seem to have these HAVE macros all over. Avi, what's the idea? Let's drop them? 
> #ifdef KVM_CAP_IRQ_ROUTING > > @@ -671,6 +674,10 @@ struct kvm_clock_data { > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) > #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) > #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) > +#define KVM_ASSIGN_GET_MSIX_ENTRY _IOWR(KVMIO, 0x7d, \ > + struct kvm_assigned_msix_entry) > +#define KVM_ASSIGN_REG_MSIX_MMIO _IOW(KVMIO, 0x7e, \ > + struct kvm_assigned_msix_mmio) > /* Available with KVM_CAP_PIT_STATE2 */ > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) > #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) > @@ -787,11 +794,23 @@ struct kvm_assigned_msix_nr { > }; > > #define KVM_MAX_MSIX_PER_DEV 256 > + > +#define KVM_MSIX_FLAG_MASK (1 << 0) > +#define KVM_MSIX_FLAG_QUERY_MASK (1 << 15) > + > struct kvm_assigned_msix_entry { > __u32 assigned_dev_id; > __u32 gsi; > __u16 entry; /* The index of entry in the MSI-X table */ > - __u16 padding[3]; > + __u16 flags; > + __u16 padding[2]; > +}; > + > +struct kvm_assigned_msix_mmio { > + __u32 assigned_dev_id; > + __u32 flags; > + __u64 base_addr; > + __u64 reserved; > }; > > #endif /* __LINUX_KVM_H */ > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index e2ecbac..5ac4db9 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -464,6 +464,9 @@ struct kvm_assigned_dev_kernel { > struct pci_dev *dev; > struct kvm *kvm; > spinlock_t assigned_dev_lock; > + DECLARE_BITMAP(msix_mask_bitmap, KVM_MAX_MSIX_PER_DEV); > + gpa_t msix_mmio_base; > + struct kvm_io_device msix_mmio_dev; > }; > > struct kvm_irq_mask_notifier { > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index 7c98928..9398c27 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -232,6 +232,14 @@ static void kvm_free_assigned_device(struct kvm *kvm, > { > kvm_free_assigned_irq(kvm, assigned_dev); > > +#ifdef __KVM_HAVE_MSIX > + if (assigned_dev->msix_mmio_base) { 
> + mutex_lock(&kvm->slots_lock); > + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > + &assigned_dev->msix_mmio_dev); > + mutex_unlock(&kvm->slots_lock); > + } > +#endif > pci_reset_function(assigned_dev->dev); > > pci_release_regions(assigned_dev->dev); > @@ -504,7 +512,7 @@ out: > static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > struct kvm_assigned_pci_dev *assigned_dev) > { > - int r = 0, idx; > + int r = 0, idx, i; > struct kvm_assigned_dev_kernel *match; > struct pci_dev *dev; > > @@ -563,6 +571,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > > list_add(&match->list, &kvm->arch.assigned_dev_head); > > + /* The state after reset of MSI-X table is all masked */ > + for (i = 0; i < KVM_MAX_MSIX_PER_DEV; i++) > + set_bit(i, match->msix_mask_bitmap); > + > if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { > if (!kvm->arch.iommu_domain) { > r = kvm_iommu_map_guest(kvm); > @@ -666,6 +678,43 @@ msix_nr_out: > return r; > } > > +static void update_msix_mask(struct kvm_assigned_dev_kernel *adev, > + int idx, bool new_mask_flag) > +{ > + int irq; > + bool old_mask_flag, need_flush = false; > + > + spin_lock_irq(&adev->assigned_dev_lock); > + > + if (!adev->dev->msix_enabled || > + !(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX)) > + goto out; > + > + old_mask_flag = test_bit(adev->guest_msix_entries[idx].entry, > + adev->msix_mask_bitmap); > + if (old_mask_flag == new_mask_flag) > + goto out; > + > + irq = adev->host_msix_entries[idx].vector; > + BUG_ON(irq == 0); > + > + if (new_mask_flag) { > + set_bit(adev->guest_msix_entries[idx].entry, > + adev->msix_mask_bitmap); > + disable_irq_nosync(irq); > + need_flush = true; > + } else { > + clear_bit(adev->guest_msix_entries[idx].entry, > + adev->msix_mask_bitmap); > + enable_irq(irq); > + } > +out: > + spin_unlock_irq(&adev->assigned_dev_lock); > + > + if (need_flush) > + flush_work(&adev->interrupt_work); > +} > + > static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, > struct 
kvm_assigned_msix_entry *entry) > { > @@ -700,6 +749,210 @@ msix_entry_out: > > return r; > } > + > +static int kvm_vm_ioctl_get_msix_entry(struct kvm *kvm, > + struct kvm_assigned_msix_entry *entry) > +{ > + int r = 0; > + struct kvm_assigned_dev_kernel *adev; > + > + mutex_lock(&kvm->lock); > + > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + entry->assigned_dev_id); > + > + if (!adev) { > + r = -EINVAL; > + goto out; > + } > + > + if (entry->entry >= KVM_MAX_MSIX_PER_DEV) { > + r = -ENOSPC; > + goto out; > + } > + > + if (entry->flags & KVM_MSIX_FLAG_QUERY_MASK) { > + if (test_bit(entry->entry, adev->msix_mask_bitmap)) > + entry->flags |= KVM_MSIX_FLAG_MASK; > + else > + entry->flags &= ~KVM_MSIX_FLAG_MASK; > + } > + > +out: > + mutex_unlock(&kvm->lock); > + > + return r; > +} > + > +static bool msix_mmio_in_range(struct kvm_assigned_dev_kernel *adev, > + gpa_t addr, int len) > +{ > + gpa_t start, end; > + > + BUG_ON(adev->msix_mmio_base == 0); Below I see adev->msix_mmio_base = msix_mmio->base_addr; which comes from userspace. BUG_ON is an unfriendly way to tell the user about a bug in qemu. Anyway, I don't think we should special-case 0 gpa. It's up to the user where to base the table. Use a separate flag to signal that the table was initialized. > + start = adev->msix_mmio_base; > + end = adev->msix_mmio_base + PCI_MSIX_ENTRY_SIZE * KVM_MAX_MSIX_PER_DEV; This seems wrong. What if the device has a smaller MSI-X table? Can't we simply pass in base and length? > + if (addr >= start && addr + len <= end) Interesting wrap-around cases come to mind. Likely not a real problem, but it may be better to use addr + len - 1 throughout to make it robust. 
> + return true; > + > + return false; > +} > + > +static int msix_get_enabled_idx(struct kvm_assigned_dev_kernel *adev, > + gpa_t addr, int len) > +{ > + int i; > + gpa_t start, end; > + > + for (i = 0; i < adev->entries_nr; i++) { > + start = adev->msix_mmio_base + > + adev->guest_msix_entries[i].entry * PCI_MSIX_ENTRY_SIZE; > + end = start + PCI_MSIX_ENTRY_SIZE; > + if (addr >= start && addr + len <= end) { > + return i; > + } > + } That's a scary thing to do on each access. Can we build an interface that does not force us to scan the whole list each time? > + > + return -EINVAL; > +} > + > +static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, > + void *val) > +{ > + struct kvm_assigned_dev_kernel *adev = > + container_of(this, struct kvm_assigned_dev_kernel, > + msix_mmio_dev); > + int idx, r = 0; > + u32 entry[4]; > + struct kvm_kernel_irq_routing_entry e; > + > + mutex_lock(&adev->kvm->lock); > + if (!msix_mmio_in_range(adev, addr, len)) { > + r = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || len != 4) > + goto out; > + > + idx = msix_get_enabled_idx(adev, addr, len); > + if (idx < 0) { > + idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > + if ((addr % PCI_MSIX_ENTRY_SIZE) == > + PCI_MSIX_ENTRY_VECTOR_CTRL) > + *(unsigned long *)val = > + test_bit(idx, adev->msix_mask_bitmap) ? > + PCI_MSIX_ENTRY_CTRL_MASKBIT : 0; I suspect this is wrong for big endian platforms: PCI returns all values in little endian format. > + else > + r = -EOPNOTSUPP; > + goto out; > + } > + > + r = kvm_get_irq_routing_entry(adev->kvm, > + adev->guest_msix_entries[idx].vector, &e); > + if (r || e.type != KVM_IRQ_ROUTING_MSI) { > + printk(KERN_WARNING "KVM: Wrong MSI-X routing entry! " > + "idx %d, addr 0x%llx, len %d\n", idx, addr, len); Can a malicious userspace trigger this intentionally? 
> + r = -EOPNOTSUPP; > + goto out; > + } > + entry[0] = e.msi.address_lo; > + entry[1] = e.msi.address_hi; > + entry[2] = e.msi.data; > + entry[3] = test_bit(adev->guest_msix_entries[idx].entry, > + adev->msix_mask_bitmap); I suspect this is wrong for big endian platforms: PCI returns all values in little endian format. > + memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / 4 4 is sizeof *entry? ], len); > + > +out: > + mutex_unlock(&adev->kvm->lock); > + return r; > +} > + > +static int msix_mmio_write(struct kvm_io_device *this, gpa_t addr, int len, > + const void *val) > +{ > + struct kvm_assigned_dev_kernel *adev = > + container_of(this, struct kvm_assigned_dev_kernel, > + msix_mmio_dev); > + int idx, r = 0; > + unsigned long new_val = *(unsigned long *)val; I suspect this is wrong for big endian platforms: PCI returns all values in little endian format. > + > + mutex_lock(&adev->kvm->lock); > + if (!msix_mmio_in_range(adev, addr, len)) { > + r = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || len != 4) > + goto out; > + > + idx = msix_get_enabled_idx(adev, addr, len); > + if (idx < 0) { > + idx = (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > + if (((addr % PCI_MSIX_ENTRY_SIZE) == > + PCI_MSIX_ENTRY_VECTOR_CTRL)) { > + if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT) > + goto out; > + if (new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT) > + set_bit(idx, adev->msix_mask_bitmap); > + else > + clear_bit(idx, adev->msix_mask_bitmap); > + } else > + /* Userspace would handle other MMIO writing */ > + r = -EOPNOTSUPP; > + goto out; > + } > + if (addr % PCI_MSIX_ENTRY_SIZE != PCI_MSIX_ENTRY_VECTOR_CTRL) { > + r = -EOPNOTSUPP; > + goto out; > + } > + if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT) > + goto out; > + update_msix_mask(adev, idx, !!(new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > +out: > + mutex_unlock(&adev->kvm->lock); > + > + return r; > +} > + > +static const struct kvm_io_device_ops msix_mmio_ops = { > + .read = msix_mmio_read, > + .write = msix_mmio_write, > 
+}; > + > +static int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_assigned_msix_mmio *msix_mmio) > +{ > + int r = 0; > + struct kvm_assigned_dev_kernel *adev; > + > + mutex_lock(&kvm->lock); > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + msix_mmio->assigned_dev_id); > + if (!adev) { > + r = -EINVAL; > + goto out; > + } > + if (msix_mmio->base_addr == 0) { > + r = -EINVAL; > + goto out; > + } > + > + mutex_lock(&kvm->slots_lock); > + if (adev->msix_mmio_base == 0) { What does this do? Does it handle the case where we are already registered? Also, this is better written as !adev->msix_mmio_base. > + kvm_iodevice_init(&adev->msix_mmio_dev, &msix_mmio_ops); > + r = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > + &adev->msix_mmio_dev); > + if (r) > + goto out2; > + } > + > + adev->msix_mmio_base = msix_mmio->base_addr; > +out2: > + mutex_unlock(&kvm->slots_lock); > +out: > + mutex_unlock(&kvm->lock); > + > + return r; > +} > #endif > > long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > @@ -812,6 +1065,36 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > goto out; > break; > } > + case KVM_ASSIGN_GET_MSIX_ENTRY: { > + struct kvm_assigned_msix_entry entry; > + r = -EFAULT; > + if (copy_from_user(&entry, argp, sizeof entry)) > + goto out; > + r = kvm_vm_ioctl_get_msix_entry(kvm, &entry); > + if (r) > + goto out; > + r = -EFAULT; > + if (copy_to_user(argp, &entry, sizeof entry)) > + goto out; > + r = 0; > + break; > + } > + case KVM_ASSIGN_REG_MSIX_MMIO: { > + struct kvm_assigned_msix_mmio msix_mmio; > + > + r = -EFAULT; > + if (copy_from_user(&msix_mmio, argp, sizeof(msix_mmio))) > + goto out; > + > + r = -EINVAL; > + if (msix_mmio.flags != 0 || msix_mmio.reserved != 0) > + goto out; > + > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &msix_mmio); > + if (r) > + goto out; > + break; > + } > #endif > } > out: > -- > 1.7.0.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx 
More majordomo info at http://vger.kernel.org/majordomo-info.html