Cc: Avi Kivity<avi@xxxxxxxxxx>
Cc: Ingo Molnar<mingo@xxxxxxx>
Cc: Marcelo Tosatti<mtosatti@xxxxxxxxxx>
Cc: Michael S. Tsirkin<mst@xxxxxxxxxx>
Cc: Pekka Enberg<penberg@xxxxxxxxxx>
Signed-off-by: Sasha Levin<levinsasha928@xxxxxxxxx>
---
Documentation/virtual/kvm/api.txt | 18 ++++-
include/linux/kvm.h | 9 ++
virt/kvm/eventfd.c | 153 ++++++++++++++++++++++++++++++++-----
3 files changed, 161 insertions(+), 19 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 317d86a..74f0946 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1330,7 +1330,7 @@ Returns: 0 on success, !0 on error
This ioctl attaches or detaches an ioeventfd to a legal pio/mmio address
within the guest. A guest write in the registered address will signal the
-provided event instead of triggering an exit.
+provided event or write to the provided socket instead of triggering an exit.
struct kvm_ioeventfd {
__u64 datamatch;
@@ -1341,6 +1341,13 @@ struct kvm_ioeventfd {
__u8 pad[36];
};
+struct kvm_ioeventfd_data {
+ __u64 data;
+ __u64 addr;
+ __u32 len;
+ __u8 is_write;
+};
+
The following flags are defined:
#define KVM_IOEVENTFD_FLAG_DATAMATCH (1<< kvm_ioeventfd_flag_nr_datamatch)
@@ -1348,6 +1355,7 @@ The following flags are defined:
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1<< kvm_ioeventfd_flag_nr_deassign)
#define KVM_IOEVENTFD_FLAG_READ (1<< kvm_ioeventfd_flag_nr_read)
#define KVM_IOEVENTFD_FLAG_NOWRITE (1<< kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET (1<< kvm_ioeventfd_flag_nr_socket)
If datamatch flag is set, the event will be signaled only if the written value
to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -1359,6 +1367,14 @@ passed in datamatch.
If the nowrite flag is set, the event won't be signaled when the specified address
is being written to.
+If the socket flag is set, fd is expected to be a connected AF_UNIX
+SOCK_SEQPACKET socket. Once a guest write in the registered address is
+detected - a struct kvm_ioeventfd_data which describes the write will be
+written to the socket.
+On read, struct kvm_ioeventfd_data will be written with 'is_write = 0', and
+would wait for a response with a struct kvm_ioeventfd_data containing the
+value which should be 'read' by the guest.
+
5. The kvm_run structure
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8a12711..ff3d808 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -389,6 +389,7 @@ enum {
kvm_ioeventfd_flag_nr_deassign,
kvm_ioeventfd_flag_nr_read,
kvm_ioeventfd_flag_nr_nowrite,
+ kvm_ioeventfd_flag_nr_socket,
kvm_ioeventfd_flag_nr_max,
};
@@ -397,6 +398,7 @@ enum {
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1<< kvm_ioeventfd_flag_nr_deassign)
#define KVM_IOEVENTFD_FLAG_READ (1<< kvm_ioeventfd_flag_nr_read)
#define KVM_IOEVENTFD_FLAG_NOWRITE (1<< kvm_ioeventfd_flag_nr_nowrite)
+#define KVM_IOEVENTFD_FLAG_SOCKET (1<< kvm_ioeventfd_flag_nr_socket)
#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1<< kvm_ioeventfd_flag_nr_max) - 1)
@@ -409,6 +411,13 @@ struct kvm_ioeventfd {
__u8 pad[36];
};
+struct kvm_ioeventfd_data {
+ __u64 data;
+ __u64 addr;
+ __u32 len;
+ __u8 is_write;
+};
+
/* for KVM_ENABLE_CAP */
struct kvm_enable_cap {
/* in */
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 5f2d203..d1d63b3 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -32,6 +32,7 @@
#include<linux/eventfd.h>
#include<linux/kernel.h>
#include<linux/slab.h>
+#include<linux/net.h>
#include "iodev.h"
@@ -413,10 +414,11 @@ module_exit(irqfd_module_exit);
/*
* --------------------------------------------------------------------
- * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
+ * a socket write.
*
- * userspace can register a PIO/MMIO address with an eventfd for receiving
- * notification when the memory has been touched.
+ * userspace can register a PIO/MMIO address with an eventfd or a
+ * socket for receiving notification when the memory has been touched.
* --------------------------------------------------------------------
*/
@@ -424,7 +426,10 @@ struct _ioeventfd {
struct list_head list;
u64 addr;
int length;
- struct eventfd_ctx *eventfd;
+ union {
+ struct socket *sock;
+ struct eventfd_ctx *eventfd;
+ };
u64 datamatch;
struct kvm_io_device dev;
bool wildcard;
@@ -441,7 +446,11 @@ to_ioeventfd(struct kvm_io_device *dev)
static void
ioeventfd_release(struct _ioeventfd *p)
{
- eventfd_ctx_put(p->eventfd);
+ if (p->eventfd)
+ eventfd_ctx_put(p->eventfd);
+ else
+ sockfd_put(p->sock);
+
list_del(&p->list);
kfree(p);
}
@@ -510,12 +519,65 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
return _val == p->datamatch ? true : false;
}
+static ssize_t socket_write(struct socket *sock, const void *buf, size_t count)
+{
+ mm_segment_t old_fs;
+ ssize_t res;
+ struct msghdr msg;
+ struct iovec iov;
+
+ iov = (struct iovec) {
+ .iov_base = (void *)buf,
+ .iov_len = count,
+ };
+
+ msg = (struct msghdr) {
+ .msg_iov =&iov,
+ .msg_iovlen = 1,
+ };
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = sock_sendmsg(sock,&msg, count);
+ set_fs(old_fs);
+
+ return res;
+}
+
+static ssize_t socket_read(struct socket *sock, void *buf, size_t count)
+{
+ mm_segment_t old_fs;
+ ssize_t res;
+ struct msghdr msg;
+ struct iovec iov;
+
+ iov = (struct iovec) {
+ .iov_base = (void *)buf,
+ .iov_len = count,
+ };
+
+ msg = (struct msghdr) {
+ .msg_iov =&iov,
+ .msg_iovlen = 1,
+ };
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = sock_recvmsg(sock,&msg, count, 0);
+ set_fs(old_fs);
+
+ return res;
+}
+
/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
const void *val)
{
struct _ioeventfd *p = to_ioeventfd(this);
+ struct kvm_ioeventfd_data data;
/* Exit if signaling on writes isn't requested */
if (!p->track_writes)
@@ -524,7 +586,18 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
- eventfd_signal(p->eventfd, 1);
+ data = (struct kvm_ioeventfd_data) {
+ .data = get_val(val, len),
+ .addr = addr,
+ .len = len,
+ .is_write = 1,
+ };
+
+ if (p->sock)
+ socket_write(p->sock,&data, sizeof(data));
+ else
+ eventfd_signal(p->eventfd, 1);
+
return 0;
}
@@ -534,6 +607,7 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
void *val)
{
struct _ioeventfd *p = to_ioeventfd(this);
+ struct kvm_ioeventfd_data data;
/* Exit if signaling on reads isn't requested */
if (!p->track_reads)
@@ -542,7 +616,21 @@ ioeventfd_read(struct kvm_io_device *this, gpa_t addr, int len,
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
- eventfd_signal(p->eventfd, 1);
+ data = (struct kvm_ioeventfd_data) {
+ .addr = addr,
+ .len = len,
+ .is_write = 0,
+ };
+
+ if (p->sock) {
+ socket_write(p->sock,&data, sizeof(data));
+ socket_read(p->sock,&data, sizeof(data));
+ set_val(val, len, data.data);
+ } else {
+ set_val(val, len, p->datamatch);
+ eventfd_signal(p->eventfd, 1);
+ }
+
return 0;
}
@@ -585,7 +673,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
int pio = args->flags& KVM_IOEVENTFD_FLAG_PIO;
enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p;
- struct eventfd_ctx *eventfd;
+ struct eventfd_ctx *eventfd = NULL;
int ret;
/* check for range overflow */
@@ -596,10 +684,6 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (args->flags& ~KVM_IOEVENTFD_VALID_FLAG_MASK)
return -EINVAL;
- eventfd = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(eventfd))
- return PTR_ERR(eventfd);
-
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
@@ -611,6 +695,20 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
p->length = args->len;
p->eventfd = eventfd;
+ if (args->flags& KVM_IOEVENTFD_FLAG_SOCKET) {
+ ret = 0;
+ p->sock = sockfd_lookup(args->fd,&ret);
+ if (ret)
+ goto fail;
+ } else {
+ ret = -EINVAL;
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ goto fail;
+
+ p->eventfd = eventfd;
+ }
+
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags& KVM_IOEVENTFD_FLAG_DATAMATCH)
p->datamatch = args->datamatch;
@@ -649,8 +747,14 @@ unlock_fail:
mutex_unlock(&kvm->slots_lock);
fail:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+
+ if (p->sock)
+ sockfd_put(p->sock);
+
+
kfree(p);
- eventfd_ctx_put(eventfd);
return ret;
}
@@ -661,12 +765,21 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
int pio = args->flags& KVM_IOEVENTFD_FLAG_PIO;
enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p, *tmp;
- struct eventfd_ctx *eventfd;
+ struct eventfd_ctx *eventfd = NULL;
+ struct socket *sock = NULL;
int ret = -ENOENT;
- eventfd = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(eventfd))
- return PTR_ERR(eventfd);
+ if (args->flags& KVM_IOEVENTFD_FLAG_SOCKET) {
+ ret = 0;
+ sock = sockfd_lookup(args->fd,&ret);
+ if (ret)
+ return PTR_ERR(sock);
+ } else {
+ ret = -EINVAL;
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+ }
mutex_lock(&kvm->slots_lock);
@@ -674,6 +787,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
bool wildcard = !(args->flags& KVM_IOEVENTFD_FLAG_DATAMATCH);
if (p->eventfd != eventfd ||
+ p->sock != sock ||
p->addr != args->addr ||
p->length != args->len ||
p->wildcard != wildcard)
@@ -690,7 +804,10 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
mutex_unlock(&kvm->slots_lock);
- eventfd_ctx_put(eventfd);
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+ if (sock)
+ sockfd_put(sock);
return ret;
}