On Thu, 2021-02-11 at 14:59 +0000, Stefan Hajnoczi wrote: > On Wed, Feb 10, 2021 at 11:31:30AM -0800, Elena Afanasova wrote: > > On Mon, 2021-02-08 at 14:21 +0800, Jason Wang wrote: > > > On 2021/1/30 上午2:48, Elena Afanasova wrote: > > > > This vm ioctl adds or removes an ioregionfd MMIO/PIO region. > > > > Guest > > > > read and write accesses are dispatched through the given > > > > ioregionfd > > > > instead of returning from ioctl(KVM_RUN). Regions can be > > > > deleted by > > > > setting fds to -1. > > > > > > > > Signed-off-by: Elena Afanasova <eafanasova@xxxxxxxxx> > > > > --- > > > > Changes in v2: > > > > - changes after code review > > > > > > > > arch/x86/kvm/Kconfig | 1 + > > > > arch/x86/kvm/Makefile | 1 + > > > > arch/x86/kvm/x86.c | 1 + > > > > include/linux/kvm_host.h | 17 +++ > > > > include/uapi/linux/kvm.h | 23 ++++ > > > > virt/kvm/Kconfig | 3 + > > > > virt/kvm/eventfd.c | 25 +++++ > > > > virt/kvm/eventfd.h | 14 +++ > > > > virt/kvm/ioregion.c | 232 > > > > +++++++++++++++++++++++++++++++++++++++ > > > > virt/kvm/ioregion.h | 15 +++ > > > > virt/kvm/kvm_main.c | 11 ++ > > > > 11 files changed, 343 insertions(+) > > > > create mode 100644 virt/kvm/eventfd.h > > > > create mode 100644 virt/kvm/ioregion.c > > > > create mode 100644 virt/kvm/ioregion.h > > > > > > > > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > > > > index f92dfd8ef10d..b914ef375199 100644 > > > > --- a/arch/x86/kvm/Kconfig > > > > +++ b/arch/x86/kvm/Kconfig > > > > @@ -33,6 +33,7 @@ config KVM > > > > select HAVE_KVM_IRQ_BYPASS > > > > select HAVE_KVM_IRQ_ROUTING > > > > select HAVE_KVM_EVENTFD > > > > + select KVM_IOREGION > > > > select KVM_ASYNC_PF > > > > select USER_RETURN_NOTIFIER > > > > select KVM_MMIO > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index b804444e16d4..b3b17dc9f7d4 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -12,6 +12,7 @@ KVM := ../../../virt/kvm > > > > kvm-y += $(KVM)/kvm_main.o > > > > $(KVM)/coalesced_mmio.o \ > > > > $(KVM)/eventfd.o > > > > $(KVM)/irqchip.o > > > > $(KVM)/vfio.o > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o > > > > +kvm-$(CONFIG_KVM_IOREGION) += $(KVM)/ioregion.o > > > > > > > > kvm-y += x86.o emulate.o i8259.o irq.o > > > > lapic.o \ > > > > i8254.o ioapic.o irq_comm.o cpuid.o > > > > pmu.o > > > > mtrr.o \ > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index e545a8a613b1..ddb28f5ca252 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -3739,6 +3739,7 @@ int kvm_vm_ioctl_check_extension(struct > > > > kvm > > > > *kvm, long ext) > > > > case KVM_CAP_X86_USER_SPACE_MSR: > > > > case KVM_CAP_X86_MSR_FILTER: > > > > case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: > > > > + case KVM_CAP_IOREGIONFD: > > > > r = 1; > > > > break; > > > > case KVM_CAP_SYNC_REGS: > > > > diff --git a/include/linux/kvm_host.h > > > > b/include/linux/kvm_host.h > > > > index 7f2e2a09ebbd..7cd667dddba9 100644 > > > > --- a/include/linux/kvm_host.h > > > > +++ b/include/linux/kvm_host.h > > > > @@ -470,6 +470,10 @@ struct kvm { > > > > struct mutex resampler_lock; > > > > } irqfds; > > > > struct list_head ioeventfds; > > > > +#endif > > > > +#ifdef CONFIG_KVM_IOREGION > > > > + struct list_head ioregions_mmio; > > > > + struct list_head ioregions_pio; > > > > #endif > > > > struct kvm_vm_stat stat; > > > > struct kvm_arch arch; > > > > @@ -1262,6 +1266,19 @@ static inline int kvm_ioeventfd(struct > > > > kvm > > > > *kvm, struct kvm_ioeventfd *args) > > > > > > > > #endif /* CONFIG_HAVE_KVM_EVENTFD */ > > > > > > > > +#ifdef CONFIG_KVM_IOREGION > > > > +void kvm_ioregionfd_init(struct kvm *kvm); > > > > +int kvm_ioregionfd(struct kvm *kvm, struct kvm_ioregion > > > > *args); > > > > + > > > > +#else > > > > + > > > > +static inline void kvm_ioregionfd_init(struct kvm *kvm) {} > > > > +static inline int kvm_ioregionfd(struct kvm *kvm, struct > > > > kvm_ioregion *args) > > > > +{ > > > > + return -ENOSYS; > > > > +} > > > > +#endif > > > > + > > > > void kvm_arch_irq_routing_update(struct kvm *kvm); > > > > > > > > static inline void kvm_make_request(int req, struct kvm_vcpu > > > > *vcpu) > > > > diff --git a/include/uapi/linux/kvm.h > > > > b/include/uapi/linux/kvm.h > > > > index ca41220b40b8..81e775778c66 100644 > > > > --- a/include/uapi/linux/kvm.h > > > > +++ b/include/uapi/linux/kvm.h > > > > @@ -732,6 +732,27 @@ struct kvm_ioeventfd { > > > > __u8 pad[36]; > > > > }; > > > > > > > > +enum { > > > > + kvm_ioregion_flag_nr_pio, > > > > + kvm_ioregion_flag_nr_posted_writes, > > > > + kvm_ioregion_flag_nr_max, > > > > +}; > > > > + > > > > +#define KVM_IOREGION_PIO (1 << kvm_ioregion_flag_nr_pio) > > > > +#define KVM_IOREGION_POSTED_WRITES (1 << > > > > kvm_ioregion_flag_nr_posted_writes) > > > > + > > > > +#define KVM_IOREGION_VALID_FLAG_MASK ((1 << > > > > kvm_ioregion_flag_nr_max) - 1) > > > > + > > > > +struct kvm_ioregion { > > > > + __u64 guest_paddr; /* guest physical address */ > > > > + __u64 memory_size; /* bytes */ > > > > > > Do we really need __u64 here? > > > > > > > > > > + __u64 user_data; > > > > + __s32 rfd; > > > > + __s32 wfd; > > > > + __u32 flags; > > > > + __u8 pad[28]; > > > > +}; > > > > + > > > > #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0) > > > > #define KVM_X86_DISABLE_EXITS_HLT (1 << 1) > > > > #define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2) > > > > @@ -1053,6 +1074,7 @@ struct kvm_ppc_resize_hpt { > > > > #define KVM_CAP_X86_USER_SPACE_MSR 188 > > > > #define KVM_CAP_X86_MSR_FILTER 189 > > > > #define KVM_CAP_ENFORCE_PV_FEATURE_CPUID 190 > > > > +#define KVM_CAP_IOREGIONFD 191 > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > @@ -1308,6 +1330,7 @@ struct kvm_vfio_spapr_tce { > > > > struct > > > > kvm_userspace_memory_region) > > > > #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) > > > > #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) > > > > +#define KVM_SET_IOREGION _IOW(KVMIO, 0x49, struct > > > > kvm_ioregion) > > > > > > > > /* enable ucontrol for s390 */ > > > > struct kvm_s390_ucas_mapping { > > > > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig > > > > index 1c37ccd5d402..5e6620bbf000 100644 > > > > --- a/virt/kvm/Kconfig > > > > +++ b/virt/kvm/Kconfig > > > > @@ -17,6 +17,9 @@ config HAVE_KVM_EVENTFD > > > > bool > > > > select EVENTFD > > > > > > > > +config KVM_IOREGION > > > > + bool > > > > + > > > > config KVM_MMIO > > > > bool > > > > > > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c > > > > index c2323c27a28b..aadb73903f8b 100644 > > > > --- a/virt/kvm/eventfd.c > > > > +++ b/virt/kvm/eventfd.c > > > > @@ -27,6 +27,7 @@ > > > > #include <trace/events/kvm.h> > > > > > > > > #include <kvm/iodev.h> > > > > +#include "ioregion.h" > > > > > > > > #ifdef CONFIG_HAVE_KVM_IRQFD > > > > > > > > @@ -755,6 +756,23 @@ static const struct kvm_io_device_ops > > > > ioeventfd_ops = { > > > > .destructor = ioeventfd_destructor, > > > > }; > > > > > > > > +#ifdef CONFIG_KVM_IOREGION > > > > +/* assumes kvm->slots_lock held */ > > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, > > > > + u64 start, u64 size) > > > > +{ > > > > + struct _ioeventfd *_p; > > > > + > > > > + list_for_each_entry(_p, &kvm->ioeventfds, list) > > > > + if (_p->bus_idx == bus_idx && > > > > + overlap(start, size, _p->addr, > > > > + !_p->length ? 8 : _p->length)) > > > > + return true; > > > > + > > > > + return false; > > > > +} > > > > +#endif > > > > + > > > > /* assumes kvm->slots_lock held */ > > > > static bool > > > > ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd > > > > *p) > > > > @@ -770,6 +788,13 @@ ioeventfd_check_collision(struct kvm *kvm, > > > > struct _ioeventfd *p) > > > > _p->datamatch == p->datamatch)))) > > > > return true; > > > > > > > > +#ifdef CONFIG_KVM_IOREGION > > > > + if (p->bus_idx == KVM_MMIO_BUS || p->bus_idx == > > > > KVM_PIO_BUS) > > > > + if (kvm_ioregion_collides(kvm, p->bus_idx, p- > > > > >addr, > > > > + !p->length ? 8 : p- > > > > >length)) > > > > + return true; > > > > +#endif > > > > + > > > > return false; > > > > } > > > > > > > > diff --git a/virt/kvm/eventfd.h b/virt/kvm/eventfd.h > > > > new file mode 100644 > > > > index 000000000000..73a621eebae3 > > > > --- /dev/null > > > > +++ b/virt/kvm/eventfd.h > > > > @@ -0,0 +1,14 @@ > > > > +/* SPDX-License-Identifier: GPL-2.0-only */ > > > > +#ifndef __KVM_EVENTFD_H__ > > > > +#define __KVM_EVENTFD_H__ > > > > + > > > > +#ifdef CONFIG_KVM_IOREGION > > > > +bool kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 > > > > start, > > > > u64 size); > > > > +#else > > > > +static inline bool > > > > +kvm_eventfd_collides(struct kvm *kvm, int bus_idx, u64 start, > > > > u64 > > > > size) > > > > +{ > > > > + return false; > > > > +} > > > > +#endif > > > > +#endif > > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > > > > new file mode 100644 > > > > index 000000000000..48ff92bca966 > > > > --- /dev/null > > > > +++ b/virt/kvm/ioregion.c > > > > @@ -0,0 +1,232 @@ > > > > +// SPDX-License-Identifier: GPL-2.0-only > > > > +#include <linux/kvm_host.h> > > > > +#include <linux/fs.h> > > > > +#include <kvm/iodev.h> > > > > +#include "eventfd.h" > > > > + > > > > +void > > > > +kvm_ioregionfd_init(struct kvm *kvm) > > > > +{ > > > > + INIT_LIST_HEAD(&kvm->ioregions_mmio); > > > > + INIT_LIST_HEAD(&kvm->ioregions_pio); > > > > +} > > > > + > > > > +struct ioregion { > > > > + struct list_head list; > > > > + u64 paddr; /* guest physical address > > > > */ > > > > + u64 size; /* size in bytes */ > > > > + struct file *rf; > > > > + struct file *wf; > > > > + u64 user_data; /* opaque token used by > > > > userspace */ > > > > + struct kvm_io_device dev; > > > > + bool posted_writes; > > > > +}; > > > > + > > > > +static inline struct ioregion * > > > > +to_ioregion(struct kvm_io_device *dev) > > > > +{ > > > > + return container_of(dev, struct ioregion, dev); > > > > +} > > > > + > > > > +/* assumes kvm->slots_lock held */ > > > > +static void > > > > +ioregion_release(struct ioregion *p) > > > > +{ > > > > + fput(p->rf); > > > > + fput(p->wf); > > > > + list_del(&p->list); > > > > + kfree(p); > > > > +} > > > > + > > > > +static int > > > > +ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device > > > > *this, > > > > gpa_t addr, > > > > + int len, void *val) > > > > +{ > > > > + return -EOPNOTSUPP; > > > > +} > > > > + > > > > +static int > > > > +ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device > > > > *this, > > > > gpa_t addr, > > > > + int len, const void *val) > > > > +{ > > > > + return -EOPNOTSUPP; > > > > +} > > > > + > > > > +/* > > > > + * This function is called as KVM is completely shutting > > > > down. We > > > > do not > > > > + * need to worry about locking just nuke anything we have as > > > > quickly as possible > > > > + */ > > > > +static void > > > > +ioregion_destructor(struct kvm_io_device *this) > > > > +{ > > > > + struct ioregion *p = to_ioregion(this); > > > > + > > > > + ioregion_release(p); > > > > +} > > > > + > > > > +static const struct kvm_io_device_ops ioregion_ops = { > > > > + .read = ioregion_read, > > > > + .write = ioregion_write, > > > > + .destructor = ioregion_destructor, > > > > +}; > > > > + > > > > +static inline struct list_head * > > > > +get_ioregion_list(struct kvm *kvm, enum kvm_bus bus_idx) > > > > +{ > > > > + return (bus_idx == KVM_MMIO_BUS) ? > > > > + &kvm->ioregions_mmio : &kvm->ioregions_pio; > > > > +} > > > > + > > > > +/* check for not overlapping case and reverse */ > > > > +inline bool > > > > +overlap(u64 start1, u64 size1, u64 start2, u64 size2) > > > > +{ > > > > + u64 end1 = start1 + size1 - 1; > > > > + u64 end2 = start2 + size2 - 1; > > > > + > > > > + return !(end1 < start2 || start1 >= end2); > > > > +} > > > > + > > > > +/* assumes kvm->slots_lock held */ > > > > +bool > > > > +kvm_ioregion_collides(struct kvm *kvm, int bus_idx, > > > > + u64 start, u64 size) > > > > +{ > > > > + struct ioregion *_p; > > > > + struct list_head *ioregions; > > > > + > > > > + ioregions = get_ioregion_list(kvm, bus_idx); > > > > + list_for_each_entry(_p, ioregions, list) > > > > + if (overlap(start, size, _p->paddr, _p->size)) > > > > + return true; > > > > + > > > > + return false; > > > > +} > > > > + > > > > +/* assumes kvm->slots_lock held */ > > > > +static bool > > > > +ioregion_collision(struct kvm *kvm, struct ioregion *p, enum > > > > kvm_bus bus_idx) > > > > +{ > > > > + if (kvm_ioregion_collides(kvm, bus_idx, p->paddr, p- > > > > >size) || > > > > + kvm_eventfd_collides(kvm, bus_idx, p->paddr, p- > > > > >size)) > > > > + return true; > > > > + > > > > + return false; > > > > +} > > > > + > > > > +static enum kvm_bus > > > > +get_bus_from_flags(__u32 flags) > > > > +{ > > > > + if (flags & KVM_IOREGION_PIO) > > > > + return KVM_PIO_BUS; > > > > + return KVM_MMIO_BUS; > > > > +} > > > > + > > > > +int > > > > +kvm_set_ioregion(struct kvm *kvm, struct kvm_ioregion *args) > > > > +{ > > > > + struct ioregion *p; > > > > + struct file *rfile, *wfile; > > > > + enum kvm_bus bus_idx; > > > > + int ret = 0; > > > > + > > > > + if (!args->memory_size) > > > > + return -EINVAL; > > > > + if ((args->guest_paddr + args->memory_size - 1) < args- > > > > > guest_paddr) > > > > + return -EINVAL; > > > > + > > > > + rfile = fget(args->rfd); > > > > + if (!rfile) > > > > + return -EBADF; > > > > > > So the question still, if we want to use ioregion fd for > > > doorbell, > > > we > > > don't need rfd in this case? > > > > > Using ioregionfd for doorbell seems to be an open question. > > Probably it > > could just focus on the non-doorbell cases. > > Below you replied FAST_MMIO will be in v3. That is the doorbell case, > so > maybe it is in scope for this patch series? > Ok, will fix > I think continuing to use ioeventfd for most doorbell registers makes > sense. > > However, there are two cases where ioregionfd doorbell support is > interesting: > > 1. The (non-FAST_MMIO) case where the application needs to know the > value written to the doorbell. ioeventfd cannot do this (datamatch > can handle a subset of cases but not all) so we need ioregionfd > for > this. > > 2. The FAST_MMIO case just for convenience if applications prefer to > use > a single API (ioregionfd) instead of implementing both ioregionfd > and > ioeventfd. > > ioeventfd will still have its benefits (and limitations) that make it > different from ioregionfd. In particular, ioregionfd will not merge > doorbell writes into a single message because doing so would > basically > involve reimplementing ioeventfd functionality as part of ioregionfd > and > isn't compatible with the current approach where userspace can > provide > any file descriptor for communication. > > Elena and Jason: do you agree with this API design? I’m still not sure about coalescing the writes support, but in general it looks ok to me.