Re: [PATCH V3 2/3] kvm: don't register wildcard MMIO EVENTFD on two buses

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Aug 25, 2015 at 05:05:47PM +0800, Jason Wang wrote:
> We register wildcard mmio eventfd on two buses, one for KVM_MMIO_BUS
> and another is KVM_FAST_MMIO_BUS. This leads to issue:
> 
> - kvm_io_bus_destroy() knows nothing about the devices on two buses
>   points to a single dev. Which will lead double free [1] during exit.
> - wildcard eventfd ignores data len, so it was registered as a
>   kvm_io_range with zero length. This will fail the binary search in
>   kvm_io_bus_get_first_dev() when we try to emulate through
>   KVM_MMIO_BUS. This will cause userspace io emulation request instead
>   of a eventfd notification (virtqueue kick will be trapped by qemu
>   instead of vhost in this case).
> 
> Fixing this by don't register wildcard mmio eventfd on two
> buses. Instead, only register it in KVM_FAST_MMIO_BUS. This fixes the
> double free issue of kvm_io_bus_destroy(). For the arch/setups that
> does not utilize KVM_FAST_MMIO_BUS, before searching KVM_MMIO_BUS, try
> KVM_FAST_MMIO_BUS first to see it it has a match.
> 
> [1] Panic caused by double free:
> 
> CPU: 1 PID: 2894 Comm: qemu-system-x86 Not tainted 3.19.0-26-generic #28-Ubuntu
> Hardware name: LENOVO 2356BG6/2356BG6, BIOS G7ET96WW (2.56 ) 09/12/2013
> task: ffff88009ae0c4b0 ti: ffff88020e7f0000 task.ti: ffff88020e7f0000
> RIP: 0010:[<ffffffffc07e25d8>]  [<ffffffffc07e25d8>] ioeventfd_release+0x28/0x60 [kvm]
> RSP: 0018:ffff88020e7f3bc8  EFLAGS: 00010292
> RAX: dead000000200200 RBX: ffff8801ec19c900 RCX: 000000018200016d
> RDX: ffff8801ec19cf80 RSI: ffffea0008bf1d40 RDI: ffff8801ec19c900
> RBP: ffff88020e7f3bd8 R08: 000000002fc75a01 R09: 000000018200016d
> R10: ffffffffc07df6ae R11: ffff88022fc75a98 R12: ffff88021e7cc000
> R13: ffff88021e7cca48 R14: ffff88021e7cca50 R15: ffff8801ec19c880
> FS:  00007fc1ee3e6700(0000) GS:ffff88023e240000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 00007f8f389d8000 CR3: 000000023dc13000 CR4: 00000000001427e0
> Stack:
> ffff88021e7cc000 0000000000000000 ffff88020e7f3be8 ffffffffc07e2622
> ffff88020e7f3c38 ffffffffc07df69a ffff880232524160 ffff88020e792d80
>  0000000000000000 ffff880219b78c00 0000000000000008 ffff8802321686a8
> Call Trace:
> [<ffffffffc07e2622>] ioeventfd_destructor+0x12/0x20 [kvm]
> [<ffffffffc07df69a>] kvm_put_kvm+0xca/0x210 [kvm]
> [<ffffffffc07df818>] kvm_vcpu_release+0x18/0x20 [kvm]
> [<ffffffff811f69f7>] __fput+0xe7/0x250
> [<ffffffff811f6bae>] ____fput+0xe/0x10
> [<ffffffff81093f04>] task_work_run+0xd4/0xf0
> [<ffffffff81079358>] do_exit+0x368/0xa50
> [<ffffffff81082c8f>] ? recalc_sigpending+0x1f/0x60
> [<ffffffff81079ad5>] do_group_exit+0x45/0xb0
> [<ffffffff81085c71>] get_signal+0x291/0x750
> [<ffffffff810144d8>] do_signal+0x28/0xab0
> [<ffffffff810f3a3b>] ? do_futex+0xdb/0x5d0
> [<ffffffff810b7028>] ? __wake_up_locked_key+0x18/0x20
> [<ffffffff810f3fa6>] ? SyS_futex+0x76/0x170
> [<ffffffff81014fc9>] do_notify_resume+0x69/0xb0
> [<ffffffff817cb9af>] int_signal+0x12/0x17
> Code: 5d c3 90 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 8b 7f 20 e8 06 d6 a5 c0 48 8b 43 08 48 8b 13 48 89 df 48 89 42 08 <48> 89 10 48 b8 00 01 10 00 00
> RIP  [<ffffffffc07e25d8>] ioeventfd_release+0x28/0x60 [kvm]
> RSP <ffff88020e7f3bc8>
> 
> Cc: Gleb Natapov <gleb@xxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx>
> Signed-off-by: Jason Wang <jasowang@xxxxxxxxxx>
> ---
> Changes from V2:
> - Tweak styles and comment suggested by Cornelia.
> Changes from v1:
> - change ioeventfd_bus_from_flags() to return KVM_FAST_MMIO_BUS when
>   needed to save lots of unnecessary changes.
> ---
>  virt/kvm/eventfd.c  | 31 +++++++++----------------------
>  virt/kvm/kvm_main.c | 16 ++++++++++++++--
>  2 files changed, 23 insertions(+), 24 deletions(-)
> 
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 9ff4193..c3ffdc3 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -762,13 +762,16 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
>  	return false;
>  }
>  
> -static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
> +static enum kvm_bus ioeventfd_bus_from_args(struct kvm_ioeventfd *args)
>  {
> -	if (flags & KVM_IOEVENTFD_FLAG_PIO)
> +	if (args->flags & KVM_IOEVENTFD_FLAG_PIO)
>  		return KVM_PIO_BUS;
> -	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
> +	if (args->flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
>  		return KVM_VIRTIO_CCW_NOTIFY_BUS;
> -	return KVM_MMIO_BUS;
> +	/* When length is ignored, MMIO is put on a separate bus, for
> +	 * faster lookups.
> +	 */
> +	return args->len ? KVM_MMIO_BUS : KVM_FAST_MMIO_BUS;
>  }
>  
>  static int
> @@ -779,7 +782,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  	struct eventfd_ctx       *eventfd;
>  	int                       ret;
>  
> -	bus_idx = ioeventfd_bus_from_flags(args->flags);
> +	bus_idx = ioeventfd_bus_from_args(args);
>  	/* must be natural-word sized, or 0 to ignore length */
>  	switch (args->len) {
>  	case 0:
> @@ -843,16 +846,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  	if (ret < 0)
>  		goto unlock_fail;
>  
> -	/* When length is ignored, MMIO is also put on a separate bus, for
> -	 * faster lookups.
> -	 */
> -	if (!args->len && !(args->flags & KVM_IOEVENTFD_FLAG_PIO)) {
> -		ret = kvm_io_bus_register_dev(kvm, KVM_FAST_MMIO_BUS,
> -					      p->addr, 0, &p->dev);
> -		if (ret < 0)
> -			goto register_fail;
> -	}
> -
>  	kvm->buses[bus_idx]->ioeventfd_count++;
>  	list_add_tail(&p->list, &kvm->ioeventfds);
>  
> @@ -860,8 +853,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  
>  	return 0;
>  
> -register_fail:
> -	kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
>  unlock_fail:
>  	mutex_unlock(&kvm->slots_lock);
>  
> @@ -880,7 +871,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  	struct eventfd_ctx       *eventfd;
>  	int                       ret = -ENOENT;
>  
> -	bus_idx = ioeventfd_bus_from_flags(args->flags);
> +	bus_idx = ioeventfd_bus_from_args(args);
>  	eventfd = eventfd_ctx_fdget(args->fd);
>  	if (IS_ERR(eventfd))
>  		return PTR_ERR(eventfd);
> @@ -901,10 +892,6 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>  			continue;
>  
>  		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
> -		if (!p->length) {
> -			kvm_io_bus_unregister_dev(kvm, KVM_FAST_MMIO_BUS,
> -						  &p->dev);
> -		}
>  		kvm->buses[bus_idx]->ioeventfd_count--;
>  		ioeventfd_release(p);
>  		ret = 0;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0d79fe8..3e3b3bc 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3152,8 +3152,8 @@ static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
>  }
>  
>  /* kvm_io_bus_write - called under kvm->slots_lock */
> -int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
> -		     int len, const void *val)
> +static int kvm_io_bus_write_idx(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
> +				gpa_t addr, int len, const void *val)
>  {
>  	struct kvm_io_bus *bus;
>  	struct kvm_io_range range;
> @@ -3169,6 +3169,18 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
>  	return r < 0 ? r : 0;
>  }
>  
> +int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
> +		     int len, const void *val)
> +{
> +	int r = 0;
> +
> +	if (bus_idx != KVM_MMIO_BUS ||
> +	    kvm_io_bus_write_idx(vcpu, KVM_FAST_MMIO_BUS, addr, 0, NULL) < 0)
> +		r = kvm_io_bus_write_idx(vcpu, bus_idx, addr, len, val);
> +
> +	return r < 0 ? r : 0;
> +}
> +

Thinking more about this, invoking the 0-length write after
the != 0 length one would be better: it would mean we only
handle the userspace MMIO like this.

All this calls for performance tests using kvm unit tests.

>  /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
>  int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
>  			    gpa_t addr, int len, const void *val, long cookie)
> -- 
> 2.1.4
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux