Re: [PATCH v8 5/6] kvm: KVM_IRQ_ACKFD

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Aug 10, 2012 at 04:37:48PM -0600, Alex Williamson wrote:
> Enable a mechanism for IRQ ACKs to be exposed through an eventfd.  The
> user can specify the GSI and optionally an IRQ source ID and have the
> provided eventfd trigger whenever the irqchip resamples it's inputs,
> for instance on EOI.
> 
> Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> ---
> 
>  Documentation/virtual/kvm/api.txt |   18 ++
>  arch/x86/kvm/x86.c                |    2 
>  include/linux/kvm.h               |   16 ++
>  include/linux/kvm_host.h          |   13 ++
>  virt/kvm/eventfd.c                |  285 +++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c               |   10 +
>  6 files changed, 344 insertions(+)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 17cd599..77b4837 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2011,6 +2011,24 @@ of IRQ source IDs.  These IDs are also shared with KVM internal users
>  (ex. KVM assigned devices, PIT, shared user ID), therefore not all IDs
>  may be allocated through this interface.
>  
> +4.78 KVM_IRQ_ACKFD
> +
> +Capability: KVM_CAP_IRQ_ACKFD
> +Architectures: x86
> +Type: vm ioctl
> +Parameters: struct kvm_irq_ackfd (in)
> +Returns: 0 on success, -errno on error
> +
> +Allows userspace notification of IRQ ACKs, or resampling of irqchip
> +inputs, often associated with an EOI.  User provided kvm_irq_ackfd.fd
> +and kvm_irq_ackfd.gsi are required and result in an eventfd trigger
> +whenever the GSI is acknowledged.  When KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_FD
> +is available, KVM_IRQ_ACKFD supports the KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID
> +flag which indicates kvm_irqfd.irq_source_id is provided.  With this,
> +the eventfd is only triggered when the specified IRQ source ID is
> +pending.  On deassign, fd, gsi, and irq_source_id (if provided on assign)
> +must be provided.
> +
>

This seems to imply that ACKFD can be used with edge
GSIs, but this does not actually work: with source ID
because of the filtering, and without because of PV EOI.
It seems that what we are really tracking is
level change 1->0. For edge no level -> no notification?

Or does 'resampling of irqchip inputs' imply level somehow?

>  5. The kvm_run structure
>  ------------------------
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 19680ed..3d98e59 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2176,6 +2176,8 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_KVMCLOCK_CTRL:
>  	case KVM_CAP_IRQFD_IRQ_SOURCE_ID:
>  	case KVM_CAP_IRQFD_ASSERT_ONLY:
> +	case KVM_CAP_IRQ_ACKFD:
> +	case KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 19b1235..0f53bd5 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -621,6 +621,8 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_NR_IRQ_SOURCE_ID 81
>  #define KVM_CAP_IRQFD_IRQ_SOURCE_ID 82
>  #define KVM_CAP_IRQFD_ASSERT_ONLY 83
> +#define KVM_CAP_IRQ_ACKFD 84
> +#define KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID 85
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -707,6 +709,18 @@ struct kvm_irq_source_id {
>  	__u8 pad[24];
>  };
>  
> +#define KVM_IRQ_ACKFD_FLAG_DEASSIGN (1 << 0)
> +/* Available with KVM_CAP_IRQ_ACKFD_IRQ_SOURCE_ID */
> +#define KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID (1 << 1)
> +
> +struct kvm_irq_ackfd {
> +	__u32 flags;
> +	__u32 fd;
> +	__u32 gsi;
> +	__u32 irq_source_id;
> +	__u8 pad[16];
> +};
> +
>  struct kvm_clock_data {
>  	__u64 clock;
>  	__u32 flags;
> @@ -849,6 +863,8 @@ struct kvm_s390_ucas_mapping {
>  #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
>  /* Available with KVM_CAP_IRQ_SOURCE_ID */
>  #define KVM_IRQ_SOURCE_ID         _IOWR(KVMIO, 0xa8, struct kvm_irq_source_id)
> +/* Available with KVM_CAP_IRQ_ACKFD */
> +#define KVM_IRQ_ACKFD             _IOW(KVMIO,  0xa9, struct kvm_irq_ackfd)
>  
>  /*
>   * ioctls for vcpu fds
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ea6d7a1..cdc55c2 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -285,6 +285,10 @@ struct kvm {
>  		struct list_head  items;
>  	} irqfds;
>  	struct list_head ioeventfds;
> +	struct {
> +		spinlock_t        lock;
> +		struct list_head  items;
> +	} irq_ackfds;
>  #endif
>  	struct kvm_vm_stat stat;
>  	struct kvm_arch arch;
> @@ -831,6 +835,8 @@ int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
>  void kvm_irqfd_release(struct kvm *kvm);
>  void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *);
>  int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
> +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args);
> +void kvm_irq_ackfd_release(struct kvm *kvm);
>  
>  #else
>  
> @@ -843,6 +849,13 @@ static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
>  
>  static inline void kvm_irqfd_release(struct kvm *kvm) {}
>  
> +static inline int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> +	return -EINVAL;
> +}
> +
> +static inline void kvm_irq_ackfd_release(struct kvm *kvm) {}
> +
>  #ifdef CONFIG_HAVE_KVM_IRQCHIP
>  static inline void kvm_irq_routing_update(struct kvm *kvm,
>  					  struct kvm_irq_routing_table *irq_rt)
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index df41038..ff5c784 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -300,6 +300,8 @@ kvm_eventfd_init(struct kvm *kvm)
>  	spin_lock_init(&kvm->irqfds.lock);
>  	INIT_LIST_HEAD(&kvm->irqfds.items);
>  	INIT_LIST_HEAD(&kvm->ioeventfds);
> +	spin_lock_init(&kvm->irq_ackfds.lock);
> +	INIT_LIST_HEAD(&kvm->irq_ackfds.items);
>  }
>  
>  /*
> @@ -668,3 +670,286 @@ kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  
>  	return kvm_assign_ioeventfd(kvm, args);
>  }
> +
> +/*
> + * --------------------------------------------------------------------
> + * irq_ackfd: Expose IRQ ACKs to userspace via eventfd
> + *
> + * userspace can register to recieve eventfd notification for ACKs of
> + * interrupt lines.
> + * --------------------------------------------------------------------
> + */
> +
> +struct _irq_ackfd {
> +	struct kvm *kvm;
> +	struct eventfd_ctx *eventfd; /* signaled on ack */
> +	struct kvm_irq_ack_notifier notifier;
> +	/* Setup/shutdown */
> +	wait_queue_t wait;
> +	poll_table pt;
> +	struct work_struct shutdown;
> +	struct list_head list;
> +};
> +
> +/* Called under irq_ackfds.lock */
> +static void irq_ackfd_shutdown(struct work_struct *work)
> +{
> +	struct _irq_ackfd *ackfd = container_of(work, struct _irq_ackfd,
> +						shutdown);
> +	struct kvm *kvm = ackfd->kvm;
> +	u64 cnt;
> +
> +	/*
> +	 * Stop ACK signaling
> +	 */
> +	kvm_unregister_irq_ack_notifier(kvm, &ackfd->notifier);
> +
> +	/*
> +	 * Synchronize with the wait-queue and unhook ourselves to prevent
> +	 * further events.
> +	 */
> +	eventfd_ctx_remove_wait_queue(ackfd->eventfd, &ackfd->wait, &cnt);
> +
> +	/*
> +	 * Release resources
> +	 */
> +	eventfd_ctx_put(ackfd->eventfd);
> +	kfree(ackfd);
> +}
> +
> +/* assumes kvm->irq_ackfds.lock is held */
> +static bool irq_ackfd_is_active(struct _irq_ackfd *ackfd)
> +{
> +	return list_empty(&ackfd->list) ? false : true;
> +}
> +
> +/*
> + * Mark the irq_ackfd as inactive and schedule it for removal
> + *
> + * assumes kvm->irq_ackfds.lock is held
> + */
> +static void irq_ackfd_deactivate(struct _irq_ackfd *ackfd)
> +{
> +	BUG_ON(!irq_ackfd_is_active(ackfd));
> +
> +	list_del_init(&ackfd->list);
> +
> +	/* Borrow irqfd's workqueue, we don't need our own. */
> +	queue_work(irqfd_cleanup_wq, &ackfd->shutdown);
> +}
> +
> +/*
> + * Called with wqh->lock held and interrupts disabled
> + */
> +static int irq_ackfd_wakeup(wait_queue_t *wait, unsigned mode,
> +			    int sync, void *key)
> +{
> +	unsigned long flags = (unsigned long)key;
> +
> +	if (unlikely(flags & POLLHUP)) {
> +		/* The eventfd is closing, detach from KVM */
> +		struct _irq_ackfd *ackfd;
> +		unsigned long flags;
> +
> +		ackfd = container_of(wait, struct _irq_ackfd, wait);
> +
> +		spin_lock_irqsave(&ackfd->kvm->irq_ackfds.lock, flags);
> +
> +		/*
> +		 * We must check if someone deactivated the irq_ackfd before
> +		 * we could acquire the irq_ackfds.lock since the item is
> +		 * deactivated from the KVM side before it is unhooked from
> +		 * the wait-queue.  If it is already deactivated, we can
> +		 * simply return knowing the other side will cleanup for us.
> +		 * We cannot race against the irq_ackfd going away since the
> +		 * other side is required to acquire wqh->lock, which we hold
> +		 */
> +		if (irq_ackfd_is_active(ackfd))
> +				irq_ackfd_deactivate(ackfd);
> +
> +		spin_unlock_irqrestore(&ackfd->kvm->irq_ackfds.lock, flags);
> +	}
> +
> +	return 0;
> +}
> +
> +static void irq_ackfd_ptable_queue_proc(struct file *file,
> +					wait_queue_head_t *wqh,
> +					poll_table *pt)
> +{
> +	struct _irq_ackfd *ackfd = container_of(pt, struct _irq_ackfd, pt);
> +	add_wait_queue(wqh, &ackfd->wait);
> +}
> +
> +/*
> + * This function is called as the kvm VM fd is being released. Shutdown all
> + * irq_ackfds that still remain open
> + */
> +void kvm_irq_ackfd_release(struct kvm *kvm)
> +{
> +	struct _irq_ackfd *tmp, *ackfd;
> +
> +	spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> +	list_for_each_entry_safe(ackfd, tmp, &kvm->irq_ackfds.items, list)
> +		irq_ackfd_deactivate(ackfd);
> +
> +	spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> +	flush_workqueue(irqfd_cleanup_wq);
> +}
> +
> +static void irq_ackfd_acked(struct kvm_irq_ack_notifier *notifier)
> +{
> +	struct _irq_ackfd *ackfd;
> +
> +	ackfd = container_of(notifier, struct _irq_ackfd, notifier);
> +
> +	eventfd_signal(ackfd->eventfd, 1);
> +}
> +
> +static int kvm_assign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> +	struct file *file = NULL;
> +	struct eventfd_ctx *eventfd = NULL;
> +	struct _irq_ackfd *ackfd = NULL, *tmp;
> +	int ret;
> +	u64 cnt;
> +
> +	file = eventfd_fget(args->fd);
> +	if (IS_ERR(file)) {
> +		ret = PTR_ERR(file);
> +		goto fail;
> +	}
> +
> +	eventfd = eventfd_ctx_fileget(file);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto fail;
> +	}
> +
> +	ackfd = kzalloc(sizeof(*ackfd), GFP_KERNEL);
> +	if (!ackfd) {
> +		ret = -ENOMEM;
> +		goto fail;
> +	}
> +
> +	INIT_LIST_HEAD(&ackfd->list);
> +	INIT_WORK(&ackfd->shutdown, irq_ackfd_shutdown);
> +	ackfd->kvm = kvm;
> +	ackfd->eventfd = eventfd;
> +	ackfd->notifier.gsi = args->gsi;
> +	if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID)
> +		ackfd->notifier.irq_source_id = args->irq_source_id;
> +	else
> +		ackfd->notifier.irq_source_id = -1;
> +	ackfd->notifier.irq_acked = irq_ackfd_acked;
> +
> +	/*
> +	 * Install our own custom wake-up handling so we are notified via
> +	 * a callback whenever someone releases the underlying eventfd
> +	 */
> +	init_waitqueue_func_entry(&ackfd->wait, irq_ackfd_wakeup);
> +	init_poll_funcptr(&ackfd->pt, irq_ackfd_ptable_queue_proc);
> +
> +	/*
> +	 * Install the wait queue function to allow cleanup when the
> +	 * eventfd is closed by the user.  This grabs the wqh lock, so
> +	 * we do it out of spinlock, holding the file reference ensures
> +	 * we won't see a POLLHUP until setup is complete.
> +	 */
> +	file->f_op->poll(file, &ackfd->pt);
> +
> +	spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> +	/* Check for duplicates.  */
> +	list_for_each_entry(tmp, &kvm->irq_ackfds.items, list) {
> +		if (ackfd->eventfd == tmp->eventfd &&
> +		    ackfd->notifier.gsi == tmp->notifier.gsi &&
> +		    ackfd->notifier.irq_source_id ==
> +		    tmp->notifier.irq_source_id) {
> +			spin_unlock_irq(&kvm->irq_ackfds.lock);
> +			ret = -EBUSY;
> +			goto fail_unqueue;
> +		}
> +	}
> +

So source ID is not validated at all, this means there are
4G ways for the same eventfd to be registered,
drinking up unlimited kernel memory by means of kzalloc above.

> +	list_add_tail(&ackfd->list, &kvm->irq_ackfds.items);
> +
> +	spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> +	/* This can sleep, so register after spinlock.  */
> +	kvm_register_irq_ack_notifier(kvm, &ackfd->notifier);
> +
> +	fput(file); /* Enable POLLHUP */
> +
> +	return 0;
> +
> +fail_unqueue:
> +	eventfd_ctx_remove_wait_queue(eventfd, &ackfd->wait, &cnt);
> +fail:
> +	if (eventfd && !IS_ERR(eventfd))
> +		eventfd_ctx_put(eventfd);
> +
> +	if (file && !IS_ERR(file))
> +		fput(file);
> +
> +	kfree(ackfd);
> +
> +	return ret;
> +}
> +
> +static int kvm_deassign_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> +	struct eventfd_ctx *eventfd = NULL;
> +	struct _irq_ackfd *ackfd;
> +	int irq_source_id = -1, ret = -ENOENT;
> +
> +	eventfd = eventfd_ctx_fdget(args->fd);
> +	if (IS_ERR(eventfd)) {
> +		ret = PTR_ERR(eventfd);
> +		goto fail;
> +	}
> +
> +	if (args->flags & KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID)
> +		irq_source_id = args->irq_source_id;
> +
> +	spin_lock_irq(&kvm->irq_ackfds.lock);
> +
> +	list_for_each_entry(ackfd, &kvm->irq_ackfds.items, list) {
> +		if (ackfd->eventfd == eventfd &&
> +		    ackfd->notifier.gsi == args->gsi &&
> +		    ackfd->notifier.irq_source_id == irq_source_id) {
> +			irq_ackfd_deactivate(ackfd);
> +			ret = 0;
> +			break;
> +		}
> +	}
> +
> +	spin_unlock_irq(&kvm->irq_ackfds.lock);
> +
> +fail:
> +	if (eventfd && !IS_ERR(eventfd))
> +		eventfd_ctx_put(eventfd);
> +
> +	/*
> +	 * Block until we know all outstanding shutdown jobs have completed
> +	 * so that we guarantee there will not be any more ACKs signaled on
> +	 * this eventfd once this deassign function returns.
> +	 */
> +	flush_workqueue(irqfd_cleanup_wq);
> +
> +	return ret;
> +}
> +
> +int kvm_irq_ackfd(struct kvm *kvm, struct kvm_irq_ackfd *args)
> +{
> +	if (args->flags & ~(KVM_IRQ_ACKFD_FLAG_DEASSIGN |
> +			    KVM_IRQ_ACKFD_FLAG_IRQ_SOURCE_ID))
> +		return -EINVAL;
> +
> +	if (args->flags & KVM_IRQ_ACKFD_FLAG_DEASSIGN)
> +		return kvm_deassign_irq_ackfd(kvm, args);
> +
> +	return kvm_assign_irq_ackfd(kvm, args);
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e120cb3..bbe3a20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -619,6 +619,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>  	struct kvm *kvm = filp->private_data;
>  
>  	kvm_irqfd_release(kvm);
> +	kvm_irq_ackfd_release(kvm);
>  
>  	kvm_put_kvm(kvm);
>  	return 0;
> @@ -2109,6 +2110,15 @@ static long kvm_vm_ioctl(struct file *filp,
>  		break;
>  	}
>  #endif
> +	case KVM_IRQ_ACKFD: {
> +		struct kvm_irq_ackfd data;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&data, argp, sizeof(data)))
> +			goto out;
> +		r = kvm_irq_ackfd(kvm, &data);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux