Add support for ioregionfd cmds/replies serialization.

Signed-off-by: Elena Afanasova <eafanasova@xxxxxxxxx>
---
v3:
 - add comment
 - drop kvm_io_bus_finish/prepare()

 virt/kvm/ioregion.c | 164 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 135 insertions(+), 29 deletions(-)

diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index 1e1c7772d274..d53e3d1cd2ff 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -1,10 +1,39 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/kvm_host.h>
-#include <linux/fs.h>
+#include <linux/wait.h>
 #include <kvm/iodev.h>
 #include "eventfd.h"
 #include <uapi/linux/ioregion.h>
 
+/* ioregions that share the same rfd are serialized so that only one vCPU
+ * thread sends a struct ioregionfd_cmd to userspace at a time. This
+ * ensures that the struct ioregionfd_resp received from userspace will
+ * be processed by the one and only vCPU thread that sent it.
+ *
+ * A waitqueue is used to wake up waiting vCPU threads in order. Most of
+ * the time the waitqueue is unused and the lock is not contended.
+ * For best performance userspace should set up ioregionfds so that there
+ * is no contention (e.g. dedicated ioregionfds for queue doorbell
+ * registers on multi-queue devices).
+ */
+struct ioregionfd {
+	wait_queue_head_t wq;
+	struct file *rf;
+	struct kref kref;
+	bool busy;
+};
+
+struct ioregion {
+	struct list_head list;
+	u64 paddr;		/* guest physical address */
+	u64 size;		/* size in bytes */
+	struct file *wf;
+	u64 user_data;		/* opaque token used by userspace */
+	struct kvm_io_device dev;
+	bool posted_writes;
+	struct ioregionfd *ctx;
+};
+
 void
 kvm_ioregionfd_init(struct kvm *kvm)
 {
@@ -13,29 +42,28 @@ kvm_ioregionfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioregions_pio);
 }
 
-struct ioregion {
-	struct list_head list;
-	u64 paddr;		/* guest physical address */
-	u64 size;		/* size in bytes */
-	struct file *rf;
-	struct file *wf;
-	u64 user_data;		/* opaque token used by userspace */
-	struct kvm_io_device dev;
-	bool posted_writes;
-};
-
 static inline struct ioregion *
 to_ioregion(struct kvm_io_device *dev)
 {
 	return container_of(dev, struct ioregion, dev);
 }
 
+/* assumes kvm->slots_lock held */
+static void ctx_free(struct kref *kref)
+{
+	struct ioregionfd *ctx = container_of(kref, struct ioregionfd, kref);
+
+	kfree(ctx);
+}
+
 /* assumes kvm->slots_lock held */
 static void
 ioregion_release(struct ioregion *p)
 {
-	if (p->rf)
-		fput(p->rf);
+	if (p->ctx) {
+		fput(p->ctx->rf);
+		kref_put(&p->ctx->kref, ctx_free);
+	}
 	fput(p->wf);
 	list_del(&p->list);
 	kfree(p);
@@ -90,6 +118,30 @@ ioregion_save_ctx(struct kvm_vcpu *vcpu, bool in, gpa_t addr, u8 state, void *va
 	vcpu->ioregion_ctx.in = in;
 }
 
+static inline void
+ioregion_lock_ctx(struct ioregionfd *ctx)
+{
+	if (!ctx)
+		return;
+
+	spin_lock(&ctx->wq.lock);
+	wait_event_interruptible_exclusive_locked(ctx->wq, !ctx->busy);
+	ctx->busy = true;
+	spin_unlock(&ctx->wq.lock);
+}
+
+static inline void
+ioregion_unlock_ctx(struct ioregionfd *ctx)
+{
+	if (!ctx)
+		return;
+
+	spin_lock(&ctx->wq.lock);
+	ctx->busy = false;
+	wake_up_locked(&ctx->wq);
+	spin_unlock(&ctx->wq.lock);
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
@@ -115,11 +167,15 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 	}
 
+	ioregion_lock_ctx(p->ctx);
+
 send_cmd:
 	memset(&buf, 0, sizeof(buf));
 	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
-		      1, p->user_data, NULL))
-		return -EOPNOTSUPP;
+		      1, p->user_data, NULL)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
 	state = (ret == sizeof(buf.cmd)) ? GET_REPLY : SEND_CMD;
@@ -129,14 +185,15 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.cmd)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
-	if (!p->rf)
+	if (!p->ctx)
 		return 0;
 
 get_repl:
 	memset(&buf, 0, sizeof(buf));
-	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+	ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 	state = (ret == sizeof(buf.resp)) ? COMPLETE : GET_REPLY;
 	if (signal_pending(current) && state == GET_REPLY) {
 		ioregion_save_ctx(vcpu, 1, addr, state, val);
@@ -144,12 +201,17 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.resp)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
 
 	memcpy(val, &buf.resp.data, len);
+	ret = 0;
 
-	return 0;
+out:
+	ioregion_unlock_ctx(p->ctx);
+
+	return ret;
 }
 
 static int
@@ -177,11 +239,15 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 	}
 
+	ioregion_lock_ctx(p->ctx);
+
 send_cmd:
 	memset(&buf, 0, sizeof(buf));
 	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
-		      p->posted_writes ? 0 : 1, p->user_data, val))
-		return -EOPNOTSUPP;
+		      p->posted_writes ? 0 : 1, p->user_data, val)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
 	state = (ret == sizeof(buf.cmd)) ? GET_REPLY : SEND_CMD;
@@ -191,13 +257,14 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.cmd)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
 
 get_repl:
 	if (!p->posted_writes) {
 		memset(&buf, 0, sizeof(buf));
-		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+		ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 		state = (ret == sizeof(buf.resp)) ? COMPLETE : GET_REPLY;
 		if (signal_pending(current) && state == GET_REPLY) {
 			ioregion_save_ctx(vcpu, 0, addr, state, (void *)val);
@@ -205,11 +272,16 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 		if (ret != sizeof(buf.resp)) {
 			ret = (ret < 0) ? ret : -EIO;
-			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+			ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+			goto out;
 		}
 	}
+	ret = 0;
 
-	return 0;
+out:
+	ioregion_unlock_ctx(p->ctx);
+
+	return ret;
 }
 
 /*
@@ -285,6 +357,33 @@ get_bus_from_flags(__u32 flags)
 	return KVM_MMIO_BUS;
 }
 
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_get_ctx(struct kvm *kvm, struct ioregion *p, struct file *rf, int bus_idx)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (file_inode(_p->ctx->rf)->i_ino == file_inode(rf)->i_ino) {
+			p->ctx = _p->ctx;
+			kref_get(&p->ctx->kref);
+			return true;
+		}
+
+	p->ctx = kzalloc(sizeof(*p->ctx), GFP_KERNEL_ACCOUNT);
+	if (!p->ctx)
+		return false;
+
+	p->ctx->rf = rf;
+	p->ctx->busy = false;
+	init_waitqueue_head(&p->ctx->wq);
+	kref_get(&p->ctx->kref);
+
+	return true;
+}
+
 int
 kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bus_idx)
 {
@@ -309,11 +408,10 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 	}
 
 	INIT_LIST_HEAD(&p->list);
+	p->wf = wfile;
 	p->paddr = args->guest_paddr;
 	p->size = args->memory_size;
 	p->user_data = args->user_data;
-	p->rf = rfile;
-	p->wf = wfile;
 	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
 
 	mutex_lock(&kvm->slots_lock);
@@ -322,6 +420,12 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 		ret = -EEXIST;
 		goto unlock_fail;
 	}
+
+	if (rfile && !ioregion_get_ctx(kvm, p, rfile, bus_idx)) {
+		ret = -ENOMEM;
+		goto unlock_fail;
+	}
+
 	kvm_iodevice_init(&p->dev, &ioregion_ops);
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
 				      &p->dev);
@@ -335,6 +439,8 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 
 unlock_fail:
 	mutex_unlock(&kvm->slots_lock);
+	if (p->ctx)
+		kref_put(&p->ctx->kref, ctx_free);
 	kfree(p);
 fail:
 	if (rfile)
-- 
2.25.1
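
For context, here is the other side of the protocol this patch serializes: a
minimal, hypothetical sketch of a single-threaded userspace handler for one
ioregionfd pair. The struct layouts are restated from the draft uapi proposed
in this series (<uapi/linux/ioregion.h>) and may differ in the final version;
the fd wiring (one pipe or socketpair per direction, registered via
KVM_SET_IOREGION) and the single-register device are illustrative assumptions,
not part of this patch.

/*
 * Hypothetical userspace handler for a single ioregionfd (sketch only).
 * The info-field decoding is elided, and every command is answered as if
 * it were a read of one 64-bit register.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Draft wire format from this series, restated here as an assumption. */
struct ioregionfd_cmd {
	uint32_t info;		/* encodes cmd type, size, response flag */
	uint32_t padding;
	uint64_t user_data;	/* opaque token from KVM_SET_IOREGION */
	uint64_t offset;	/* access offset within the ioregion */
	uint64_t data;		/* value supplied for writes */
};

struct ioregionfd_resp {
	uint64_t data;		/* value returned to the guest for reads */
	uint8_t pad[24];
};

/*
 * cmd_fd is the userspace end of the fd KVM writes commands to; resp_fd
 * is the userspace end of the fd KVM reads replies from. Because the
 * kernel serializes all vCPUs that share the reply fd, at most one
 * command is in flight here, so a blocking single-threaded loop is
 * correct and no request/response matching is needed.
 */
static int serve(int cmd_fd, int resp_fd, uint64_t *reg)
{
	struct ioregionfd_cmd cmd;
	struct ioregionfd_resp resp;

	for (;;) {
		if (read(cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd))
			return -1;

		/* A real handler decodes cmd.info and replies only when
		 * a response was requested (reads, non-posted writes).
		 */
		memset(&resp, 0, sizeof(resp));
		resp.data = *reg;
		if (write(resp_fd, &resp, sizeof(resp)) != sizeof(resp))
			return -1;
	}
}

With a dedicated fd pair per doorbell register, as the comment added to
ioregion.c recommends, such a loop never blocks one vCPU behind another
vCPU's device access.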