The new flag allows passing a connected socket instead of an eventfd to be notified of writes or reads to the specified memory region. Instead of signaling an event, the following happens: on write, the value written to the memory region is written to the socket; on read, a notification of the read is sent to the host, and a response is expected with the value to be 'read'. Using a socket instead of an eventfd is useful when any value can be written to the memory region but we're interested in receiving the actual value instead of just a notification. A simple example for practical use is the serial port. We are not interested in an exit every time a char is written to the port, but we do need to know what was written so we could handle it on the guest. Cc: Avi Kivity <avi@xxxxxxxxxx> Cc: Ingo Molnar <mingo@xxxxxxx> Cc: Marcelo Tosatti <mtosatti@xxxxxxxxxx> Cc: Michael S. Tsirkin <mst@xxxxxxxxxx> Cc: Pekka Enberg <penberg@xxxxxxxxxx> Signed-off-by: Sasha Levin <levinsasha928@xxxxxxxxx> --- Documentation/virtual/kvm/api.txt | 18 ++++- include/linux/kvm.h | 9 ++ virt/kvm/eventfd.c | 153 ++++++++++++++++++++++++++++++++----- 3 files changed, 161 insertions(+), 19 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 317d86a..74f0946 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address within the guest. A guest write in the registered address will signal the -provided event instead of triggering an exit. +provided event or write to the provided socket instead of triggering an exit. 
struct kvm_ioeventfd { __u64 datamatch; @@ -1341,6 +1341,13 @@ struct kvm_ioeventfd { __u8 pad[36]; }; +struct kvm_ioeventfd_data { + __u64 data; + __u64 addr; + __u32 len; + __u8 is_write; +}; + The following flags are defined: #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch) @@ -1348,6 +1355,7 @@ The following flags are defined: #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_READ (1 << kvm_ioeventfd_flag_nr_read) #define KVM_IOEVENTFD_FLAG_NOWRITE (1 << kvm_ioeventfd_flag_nr_nowrite) +#define KVM_IOEVENTFD_FLAG_SOCKET (1 << kvm_ioeventfd_flag_nr_socket) If datamatch flag is set, the event will be signaled only if the written value to the registered address is equal to datamatch in struct kvm_ioeventfd. @@ -1359,6 +1367,14 @@ passed in datamatch. If the nowrite flag is set, the event won't be signaled when the specified address is being written to. +If the socket flag is set, fd is expected to be a connected AF_UNIX +SOCK_SEQPACKET socket. Once a guest write in the registered address is +detected - a struct kvm_ioeventfd_data which describes the write will be +written to the socket. +On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and +would wait for a response with a struct kvm_ioeventfd_data containing the +value which should be 'read' by the guest. + 5. 
The kvm_run structure diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 8a12711..ff3d808 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -389,6 +389,7 @@ enum { kvm_ioeventfd_flag_nr_deassign, kvm_ioeventfd_flag_nr_read, kvm_ioeventfd_flag_nr_nowrite, + kvm_ioeventfd_flag_nr_socket, kvm_ioeventfd_flag_nr_max, }; @@ -397,6 +398,7 @@ enum { #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign) #define KVM_IOEVENTFD_FLAG_READ (1 << kvm_ioeventfd_flag_nr_read) #define KVM_IOEVENTFD_FLAG_NOWRITE (1 << kvm_ioeventfd_flag_nr_nowrite) +#define KVM_IOEVENTFD_FLAG_SOCKET (1 << kvm_ioeventfd_flag_nr_socket) #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1) @@ -409,6 +411,13 @@ struct kvm_ioeventfd { __u8 pad[36]; }; +struct kvm_ioeventfd_data { + __u64 data; + __u64 addr; + __u32 len; + __u8 is_write; +}; + /* for KVM_ENABLE_CAP */ struct kvm_enable_cap { /* in */ diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 5f2d203..d1d63b3 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -32,6 +32,7 @@ #include <linux/eventfd.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/net.h> #include "iodev.h" @@ -413,10 +414,11 @@ module_exit(irqfd_module_exit); /* * -------------------------------------------------------------------- - * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or + * a socket write. * - * userspace can register a PIO/MMIO address with an eventfd for receiving - * notification when the memory has been touched. + * userspace can register a PIO/MMIO address with an eventfd or a + * socket for receiving notification when the memory has been touched. 
* -------------------------------------------------------------------- */ @@ -424,7 +426,10 @@ struct _ioeventfd { struct list_head list; u64 addr; int length; - struct eventfd_ctx *eventfd; + union { + struct socket *sock; + struct eventfd_ctx *eventfd; + }; u64 datamatch; struct kvm_io_device dev; bool wildcard; @@ -441,7 +446,11 @@ to_ioeventfd(struct kvm_io_device *dev) static void ioeventfd_release(struct _ioeventfd *p) { - eventfd_ctx_put(p->eventfd); + if (p->eventfd) + eventfd_ctx_put(p->eventfd); + else + sockfd_put(p->sock); + list_del(&p->list); kfree(p); } @@ -510,12 +519,65 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val) return _val == p->datamatch ? true : false; } +static ssize_t socket_write(struct socket *sock, const void *buf, size_t count) +{ + mm_segment_t old_fs; + ssize_t res; + struct msghdr msg; + struct iovec iov; + + iov = (struct iovec) { + .iov_base = (void *)buf, + .iov_len = count, + }; + + msg = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = sock_sendmsg(sock, &msg, count); + set_fs(old_fs); + + return res; +} + +static ssize_t socket_read(struct socket *sock, void *buf, size_t count) +{ + mm_segment_t old_fs; + ssize_t res; + struct msghdr msg; + struct iovec iov; + + iov = (struct iovec) { + .iov_base = (void *)buf, + .iov_len = count, + }; + + msg = (struct msghdr) { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = sock_recvmsg(sock, &msg, count, 0); + set_fs(old_fs); + + return res; +} + /* MMIO/PIO writes trigger an event if the addr/val match */ static int ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, const void *val) { struct _ioeventfd *p = to_ioeventfd(this); + struct kvm_ioeventfd_data data; /* Exit if signaling on writes isn't requested 
*/ if (!p->track_writes) @@ -524,7 +586,18 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; - eventfd_signal(p->eventfd, 1); + data = (struct kvm_ioeventfd_data) { + .data = get_val(val, len), + .addr = addr, + .len = len, + .is_write = 1, + }; + + if (p->sock) + socket_write(p->sock, &data, sizeof(data)); + else + eventfd_signal(p->eventfd, 1); + return 0; } @@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len, void *val) { struct _ioeventfd *p = to_ioeventfd(this); + struct kvm_ioeventfd_data data; /* Exit if signaling on reads isn't requested */ if (!p->track_reads) @@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len, if (!ioeventfd_in_range(p, addr, len, val)) return -EOPNOTSUPP; - eventfd_signal(p->eventfd, 1); + data = (struct kvm_ioeventfd_data) { + .addr = addr, + .len = len, + .is_write = 0, + }; + + if (p->sock) { + socket_write(p->sock, &data, sizeof(data)); + socket_read(p->sock, &data, sizeof(data)); + set_val(val, len, data.data); + } else { + set_val(val, len, p->datamatch); + eventfd_signal(p->eventfd, 1); + } + return 0; } @@ -585,7 +673,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; enum kvm_bus bus_idx = pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p; - struct eventfd_ctx *eventfd; + struct eventfd_ctx *eventfd = NULL; int ret; /* check for range overflow */ @@ -596,10 +684,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK) return -EINVAL; - eventfd = eventfd_ctx_fdget(args->fd); - if (IS_ERR(eventfd)) - return PTR_ERR(eventfd); - p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { ret = -ENOMEM; @@ -611,6 +695,20 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) p->length = args->len; p->eventfd = eventfd; + if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) { + ret = 0; + p->sock = sockfd_lookup(args->fd, &ret); + if (ret) + goto fail; + } else { + ret = -EINVAL; + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + goto fail; + + p->eventfd = eventfd; + } + /* The datamatch feature is optional, otherwise this is a wildcard */ if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) p->datamatch = args->datamatch; @@ -649,8 +747,14 @@ unlock_fail: mutex_unlock(&kvm->slots_lock); fail: + if (eventfd) + eventfd_ctx_put(eventfd); + + if (p->sock) + sockfd_put(p->sock); + + kfree(p); - eventfd_ctx_put(eventfd); return ret; } @@ -661,12 +765,21 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO; enum kvm_bus bus_idx = pio ? 
KVM_PIO_BUS : KVM_MMIO_BUS; struct _ioeventfd *p, *tmp; - struct eventfd_ctx *eventfd; + struct eventfd_ctx *eventfd = NULL; + struct socket *sock = NULL; int ret = -ENOENT; - eventfd = eventfd_ctx_fdget(args->fd); - if (IS_ERR(eventfd)) - return PTR_ERR(eventfd); + if (args->flags & KVM_IOEVENTFD_FLAG_SOCKET) { + ret = 0; + sock = sockfd_lookup(args->fd, &ret); + if (ret) + return PTR_ERR(sock); + } else { + ret = -EINVAL; + eventfd = eventfd_ctx_fdget(args->fd); + if (IS_ERR(eventfd)) + return PTR_ERR(eventfd); + } mutex_lock(&kvm->slots_lock); @@ -674,6 +787,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH); if (p->eventfd != eventfd || + p->sock != sock || p->addr != args->addr || p->length != args->len || p->wildcard != wildcard) @@ -690,7 +804,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) mutex_unlock(&kvm->slots_lock); - eventfd_ctx_put(eventfd); + if (eventfd) + eventfd_ctx_put(eventfd); + if (sock) + sockfd_put(sock); return ret; } -- 1.7.6 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html