Re: [KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jun 02, 2009 at 11:15:38AM -0400, Gregory Haskins wrote:
> Assigning an irqfd object to a kvm object creates a relationship that we
> currently manage by having the kvm oject acquire/hold a file* reference to
> the underlying eventfd.  The lifetime of these objects is properly maintained
> by decoupling the two objects whenever the irqfd is closed or kvm is closed,
> whichever comes first.
> 
> However, the irqfd "close" method is less than ideal since it requires two
> system calls to complete (one for ioctl(kvmfd, IRQFD_DEASSIGN), the other for
> close(eventfd)).  This dual-call approach was utilized because there was no
> notification mechanism on the eventfd side at the time irqfd was implemented.
> 
> Recently, Davide proposed a patch to send a POLLHUP wakeup whenever an
> eventfd is about to close.  So we eliminate the IRQFD_DEASSIGN ioctl (*)
> vector in favor of sensing the desassign automatically when the fd is closed.
> The resulting code is slightly more complex as a result since we need to
> allow either side to sever the relationship independently.  We utilize SRCU
> to guarantee stable concurrent access to the KVM pointer without adding
> additional atomic operations in the fast path.
> 
> At minimum, this design should be acked by both Davide and Paul (cc'd).
> 
> (*) The irqfd patch does not exist in any released tree, so the understanding
> is that we can alter the irqfd specific ABI without taking the normal
> precautions, such as CAP bits.

A few questions and suggestions interspersed below.

							Thanx, Paul

> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> CC: Davide Libenzi <davidel@xxxxxxxxxxxxxxx>
> CC: Michael S. Tsirkin <mst@xxxxxxxxxx>
> CC: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
> ---
> 
>  include/linux/kvm.h |    2 -
>  virt/kvm/eventfd.c  |  177 +++++++++++++++++++++++----------------------------
>  virt/kvm/kvm_main.c |    3 +
>  3 files changed, 81 insertions(+), 101 deletions(-)
> 
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 632a856..29b62cc 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -482,8 +482,6 @@ struct kvm_x86_mce {
>  };
>  #endif
> 
> -#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> -
>  struct kvm_irqfd {
>  	__u32 fd;
>  	__u32 gsi;
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index f3f2ea1..6ed62e2 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -37,26 +37,63 @@
>   * --------------------------------------------------------------------
>   */
>  struct _irqfd {
> +	struct mutex              lock;
> +	struct srcu_struct        srcu;
>  	struct kvm               *kvm;
>  	int                       gsi;
> -	struct file              *file;
>  	struct list_head          list;
>  	poll_table                pt;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
> -	struct work_struct        work;
> +	struct work_struct        inject;
>  };
> 
>  static void
>  irqfd_inject(struct work_struct *work)
>  {
> -	struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
> -	struct kvm *kvm = irqfd->kvm;
> +	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> +	struct kvm *kvm;
> +	int idx;
> +
> +	idx = srcu_read_lock(&irqfd->srcu);
> +
> +	kvm = rcu_dereference(irqfd->kvm);
> +	if (kvm) {
> +		mutex_lock(&kvm->lock);
> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> +		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +		mutex_unlock(&kvm->lock);
> +	}
> +
> +	srcu_read_unlock(&irqfd->srcu, idx);
> +}
> +
> +static void
> +irqfd_disconnect(struct _irqfd *irqfd)
> +{
> +	struct kvm *kvm;
> +
> +	mutex_lock(&irqfd->lock);
> +
> +	kvm = rcu_dereference(irqfd->kvm);
> +	rcu_assign_pointer(irqfd->kvm, NULL);
> +
> +	mutex_unlock(&irqfd->lock);
> +
> +	if (!kvm)
> +		return;
> 
>  	mutex_lock(&kvm->lock);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> -	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> +	list_del(&irqfd->list);
>  	mutex_unlock(&kvm->lock);
> +
> +	/*
> +	 * It is important to not drop the kvm reference until the next grace
> +	 * period because there might be lockless references in flight up
> +	 * until then
> +	 */

The lockless references are all harmless even if carried out after the
kvm structure has been removed?  Or does there need to be a ->deleted
field that allows the lockless references to ignore a kvm structure that
has already been deleted?

On the other hand, if it really is somehow OK for kvm_set_irq() to be
called on an already-deleted kvm structure, then this code would be OK
as is.

> +	synchronize_srcu(&irqfd->srcu);
> +	kvm_put_kvm(kvm);
>  }
> 
>  static int
> @@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
>  {
>  	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
> 
> -	/*
> -	 * The wake_up is called with interrupts disabled.  Therefore we need
> -	 * to defer the IRQ injection until later since we need to acquire the
> -	 * kvm->lock to do so.
> -	 */
> -	schedule_work(&irqfd->work);
> +	switch ((unsigned long)key) {
> +	case POLLIN:
> +		/*
> +		 * The POLLIN wake_up is called with interrupts disabled.
> +		 * Therefore we need to defer the IRQ injection until later
> +		 * since we need to acquire the kvm->lock to do so.
> +		 */
> +		schedule_work(&irqfd->inject);
> +		break;
> +	case POLLHUP:
> +		/*
> +		 * The POLLHUP is called unlocked, so it theoretically should
> +		 * be safe to remove ourselves from the wqh
> +		 */
> +		remove_wait_queue(irqfd->wqh, &irqfd->wait);
> +		flush_work(&irqfd->inject);
> +		irqfd_disconnect(irqfd);

Good.  The fact that irqfd_disconnect() does a synchronize_srcu()
prevents any readers from trying to do an SRCU operation on an already
cleaned-up srcu_struct, so this does look safe to me.

> +		cleanup_srcu_struct(&irqfd->srcu);
> +		kfree(irqfd);
> +		break;
> +	}
> 
>  	return 0;
>  }
> @@ -84,8 +137,8 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
>  	add_wait_queue(wqh, &irqfd->wait);
>  }
> 
> -static int
> -kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
> +int
> +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  {
>  	struct _irqfd *irqfd;
>  	struct file *file = NULL;
> @@ -95,10 +148,12 @@ kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
>  	if (!irqfd)
>  		return -ENOMEM;
> 
> +	mutex_init(&irqfd->lock);
> +	init_srcu_struct(&irqfd->srcu);
>  	irqfd->kvm = kvm;
>  	irqfd->gsi = gsi;
>  	INIT_LIST_HEAD(&irqfd->list);
> -	INIT_WORK(&irqfd->work, irqfd_inject);
> +	INIT_WORK(&irqfd->inject, irqfd_inject);
> 
>  	/*
>  	 * Embed the file* lifetime in the irqfd.
> @@ -120,12 +175,18 @@ kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
>  	if (ret < 0)
>  		goto fail;
> 
> -	irqfd->file = file;
> +	kvm_get_kvm(kvm);
> 
>  	mutex_lock(&kvm->lock);
>  	list_add_tail(&irqfd->list, &kvm->irqfds);

Doesn't the above need to be list_add_tail_rcu()?  Unless I am confused,
this is the point at which the new SRCU-protected structure is first
made public.  If so, you really do need list_add_tail_rcu() to make sure
that concurrent readers don't see pre-initialized values in the structure.

>  	mutex_unlock(&kvm->lock);
> 
> +	/*
> +	 * do not drop the file until the irqfd is fully initialized, otherwise
> +	 * we might race against the POLLHUP
> +	 */
> +	fput(file);
> +
>  	return 0;
> 
>  fail:
> @@ -139,97 +200,17 @@ fail:
>  	return ret;
>  }
> 
> -static void
> -irqfd_release(struct _irqfd *irqfd)
> -{
> -	/*
> -	 * The ordering is important.  We must remove ourselves from the wqh
> -	 * first to ensure no more event callbacks are issued, and then flush
> -	 * any previously scheduled work prior to freeing the memory
> -	 */
> -	remove_wait_queue(irqfd->wqh, &irqfd->wait);
> -
> -	flush_work(&irqfd->work);
> -
> -	fput(irqfd->file);
> -	kfree(irqfd);
> -}
> -
> -static struct _irqfd *
> -irqfd_remove(struct kvm *kvm, struct file *file, int gsi)
> -{
> -	struct _irqfd *irqfd;
> -
> -	mutex_lock(&kvm->lock);
> -
> -	/*
> -	 * linear search isn't brilliant, but this should be an infrequent
> -	 * slow-path operation, and the list should not grow very large
> -	 */
> -	list_for_each_entry(irqfd, &kvm->irqfds, list) {
> -		if (irqfd->file != file || irqfd->gsi != gsi)
> -			continue;
> -
> -		list_del(&irqfd->list);
> -		mutex_unlock(&kvm->lock);
> -
> -		return irqfd;
> -	}
> -
> -	mutex_unlock(&kvm->lock);
> -
> -	return NULL;
> -}
> -
> -static int
> -kvm_deassign_irqfd(struct kvm *kvm, int fd, int gsi)
> -{
> -	struct _irqfd *irqfd;
> -	struct file *file;
> -	int count = 0;
> -
> -	file = fget(fd);
> -	if (IS_ERR(file))
> -		return PTR_ERR(file);
> -
> -	while ((irqfd = irqfd_remove(kvm, file, gsi))) {
> -		/*
> -		 * We remove the item from the list under the lock, but we
> -		 * free it outside the lock to avoid deadlocking with the
> -		 * flush_work and the work_item taking the lock
> -		 */
> -		irqfd_release(irqfd);
> -		count++;
> -	}
> -
> -	fput(file);
> -
> -	return count ? count : -ENOENT;
> -}
> -
>  void
>  kvm_irqfd_init(struct kvm *kvm)
>  {
>  	INIT_LIST_HEAD(&kvm->irqfds);
>  }
> 
> -int
> -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> -{
> -	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> -		return kvm_deassign_irqfd(kvm, fd, gsi);
> -
> -	return kvm_assign_irqfd(kvm, fd, gsi);
> -}
> -
>  void
>  kvm_irqfd_release(struct kvm *kvm)
>  {
>  	struct _irqfd *irqfd, *tmp;
> 
> -	/* don't bother with the lock..we are shutting down */
> -	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> -		list_del(&irqfd->list);
> -		irqfd_release(irqfd);
> -	}
> +	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list)
> +		irqfd_disconnect(irqfd);
>  }
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 902fed9..a9f62bb 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1029,7 +1029,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> -	kvm_irqfd_release(kvm);
>  	kvm_free_irq_routing(kvm);
>  	kvm_io_bus_destroy(&kvm->pio_bus);
>  	kvm_io_bus_destroy(&kvm->mmio_bus);
> @@ -1064,6 +1063,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
>  {
>  	struct kvm *kvm = filp->private_data;
> 
> +	kvm_irqfd_release(kvm);
> +
>  	kvm_put_kvm(kvm);
>  	return 0;
>  }
> 
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux