On 2012-06-16 18:34, Alex Williamson wrote: > I'm looking for opinions on this approach. For vfio device assignment > we minimally need a way to get EOIs from the in-kernel irqchip out to > userspace. Getting that out via an eventfd would allow us to bounce > all level interrupts out to userspace, where we would de-assert the > device interrupt in qemu and unmask the physical device. Ideally we > could deassert the interrupt in KVM, which allows us to send the EOI > directly to vfio. To do that, we need to use a new IRQ source ID so > the guest sees the logical OR of qemu requested state and external > device state. This allows external devices to share interrupts with > emulated devices, just like KVM assignment. That means the means we > also need to use a new source ID when injecting the interrupt via > irqfd. > > Rather than creating a source ID allocation interface, extending irqfd > to support a user supplied source ID, and creating another new > interface to get the EOI out, I think it works out better to bundle > these all together as just a level irqfd interface. This way we don't > allow users to create unbalanced states where a level interrupt is > asserted, but has no way of being deasserted. I believe the below > does this, but needs testing and validation with an implementation in > qemu. > > Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx> > --- > > arch/x86/kvm/x86.c | 1 + > include/linux/kvm.h | 6 +++ > include/linux/kvm_host.h | 4 +- > virt/kvm/eventfd.c | 90 ++++++++++++++++++++++++++++++++++++++-------- > virt/kvm/kvm_main.c | 2 + > 5 files changed, 84 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index a01a424..80bed07 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -2148,6 +2148,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_GET_TSC_KHZ: > case KVM_CAP_PCI_2_3: > case KVM_CAP_KVMCLOCK_CTRL: > + case KVM_CAP_IRQFD_LEVEL: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index 2ce09aa..cca49a1 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -618,6 +618,7 @@ struct kvm_ppc_smmu_info { > #define KVM_CAP_PPC_GET_SMMU_INFO 78 > #define KVM_CAP_S390_COW 79 > #define KVM_CAP_PPC_ALLOC_HTAB 80 > +#define KVM_CAP_IRQFD_LEVEL 81 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -683,12 +684,15 @@ struct kvm_xen_hvm_config { > #endif > > #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) > +/* Available with KVM_CAP_IRQFD_LEVEL */ > +#define KVM_IRQFD_FLAG_LEVEL (1 << 1) > > struct kvm_irqfd { > __u32 fd; > __u32 gsi; > __u32 flags; > - __u8 pad[20]; > + __u32 eoi_fd; > + __u8 pad[16]; > }; > > struct kvm_clock_data { > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 27ac8a4..ae3b426 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -824,7 +824,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} > #ifdef CONFIG_HAVE_KVM_EVENTFD > > void kvm_eventfd_init(struct kvm *kvm); > -int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); > +int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); > void kvm_irqfd_release(struct kvm *kvm); > void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); > int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); > @@ -833,7 +833,7 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); > > static inline void kvm_eventfd_init(struct kvm *kvm) {} > > -static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) > { > return -EINVAL; > } > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c > index f59c1e8..89a6fa9 100644 > --- a/virt/kvm/eventfd.c > +++ b/virt/kvm/eventfd.c > @@ -49,9 +49,13 @@ struct _irqfd { > wait_queue_t wait; > /* Update side is protected by irqfds.lock */ > struct kvm_kernel_irq_routing_entry __rcu *irq_entry; > - /* Used for level IRQ fast-path */ > + /* Used for IRQ fast-path */ > int gsi; > struct work_struct inject; > + /* Used for level EOI path */ > + int irq_source_id; > + struct eventfd_ctx *eoi_eventfd; > + struct kvm_irq_ack_notifier notifier; > /* Used for setup/shutdown */ > struct eventfd_ctx *eventfd; > struct list_head list; > @@ -62,7 +66,7 @@ struct _irqfd { > static struct workqueue_struct *irqfd_cleanup_wq; > > static void > -irqfd_inject(struct work_struct *work) > +irqfd_inject_edge(struct work_struct *work) > { > struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); > struct kvm *kvm = irqfd->kvm; > @@ -71,6 +75,23 @@ irqfd_inject(struct work_struct *work) > kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); > } > > +static void > +irqfd_inject_level(struct work_struct *work) > +{ > + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); > + > + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 1); > +} > + > +static void > +irqfd_ack_level(struct kvm_irq_ack_notifier *notifier) > +{ > + struct _irqfd *irqfd = container_of(notifier, struct _irqfd, notifier); > + > + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0); > + eventfd_signal(irqfd->eoi_eventfd, 1); > +} > + > /* > * Race-free decouple logic (ordering is critical) > */ > @@ -96,6 +117,14 @@ irqfd_shutdown(struct work_struct *work) > * It is now safe to release the object's resources > */ > eventfd_ctx_put(irqfd->eventfd); > + > + if (irqfd->eoi_eventfd) { > + kvm_unregister_irq_ack_notifier(irqfd->kvm, &irqfd->notifier); > + eventfd_ctx_put(irqfd->eoi_eventfd); > + kvm_set_irq(irqfd->kvm, irqfd->irq_source_id, irqfd->gsi, 0); > + kvm_free_irq_source_id(irqfd->kvm, irqfd->irq_source_id); > + } > + > kfree(irqfd); > } > > @@ -198,13 +227,13 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, > } > > static int > -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) > +kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) > { > struct kvm_irq_routing_table *irq_rt; > struct _irqfd *irqfd, *tmp; > struct file *file = NULL; > - struct eventfd_ctx *eventfd = NULL; > - int ret; > + struct eventfd_ctx *eventfd = NULL, *eoi_eventfd = NULL; > + int ret, irq_source_id = -1; > unsigned int events; > > irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL); > @@ -212,12 +241,35 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) > return -ENOMEM; > > irqfd->kvm = kvm; > - irqfd->gsi = gsi; > + irqfd->gsi = args->gsi; > INIT_LIST_HEAD(&irqfd->list); > - INIT_WORK(&irqfd->inject, irqfd_inject); > + > + if (args->flags & KVM_IRQFD_FLAG_LEVEL) { > + irq_source_id = kvm_request_irq_source_id(kvm); > + if (irq_source_id < 0) { > + ret = irq_source_id; > + goto fail; > + } > + > + eoi_eventfd = eventfd_ctx_fdget(args->eoi_fd); > + if (IS_ERR(eoi_eventfd)) { > + ret = PTR_ERR(eoi_eventfd); > + goto fail; > + } > + > + irqfd->irq_source_id = irq_source_id; > + irqfd->eoi_eventfd = eoi_eventfd; > + irqfd->notifier.gsi = args->gsi; > + irqfd->notifier.irq_acked = irqfd_ack_level; > + kvm_register_irq_ack_notifier(kvm, &irqfd->notifier); > + > + INIT_WORK(&irqfd->inject, irqfd_inject_level); > + } else > + INIT_WORK(&irqfd->inject, irqfd_inject_edge); > + > INIT_WORK(&irqfd->shutdown, irqfd_shutdown); > > - file = eventfd_fget(fd); > + file = eventfd_fget(args->fd); > if (IS_ERR(file)) { > ret = PTR_ERR(file); > goto fail; > @@ -282,6 +334,14 @@ fail: > if (!IS_ERR(file)) > fput(file); > > + if (eoi_eventfd && !IS_ERR(eoi_eventfd)) { > + kvm_unregister_irq_ack_notifier(kvm, &irqfd->notifier); > + eventfd_ctx_put(eoi_eventfd); > + } > + > + if (irq_source_id >= 0) > + kvm_free_irq_source_id(kvm, irq_source_id); > + > kfree(irqfd); > return ret; > } > @@ -298,19 +358,19 @@ kvm_eventfd_init(struct kvm *kvm) > * shutdown any irqfd's that match fd+gsi > */ > static int > -kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) > +kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) > { > struct _irqfd *irqfd, *tmp; > struct eventfd_ctx *eventfd; > > - eventfd = eventfd_ctx_fdget(fd); > + eventfd = eventfd_ctx_fdget(args->fd); > if (IS_ERR(eventfd)) > return PTR_ERR(eventfd); > > spin_lock_irq(&kvm->irqfds.lock); > > list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { > - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { > + if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { > /* > * This rcu_assign_pointer is needed for when > * another thread calls kvm_irq_routing_update before > @@ -338,12 +398,12 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) > } > > int > -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) > +kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) > { > - if (flags & KVM_IRQFD_FLAG_DEASSIGN) > - return kvm_irqfd_deassign(kvm, fd, gsi); > + if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) > + return kvm_irqfd_deassign(kvm, args); > > - return kvm_irqfd_assign(kvm, fd, gsi); > + return kvm_irqfd_assign(kvm, args); > } > > /* > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index 02cb440..b4ad14cc 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -2059,7 +2059,7 @@ static long kvm_vm_ioctl(struct file *filp, > r = -EFAULT; > if (copy_from_user(&data, argp, sizeof data)) > goto out; > - r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); > + r = kvm_irqfd(kvm, &data); > break; > } > case KVM_IOEVENTFD: { > Looks reasonable to me. Jan
Attachment:
signature.asc
Description: OpenPGP digital signature