Add support for ioregionfd cmds/replies serialization.

Signed-off-by: Elena Afanasova <eafanasova@xxxxxxxxx>
---
v3:
 - add comment
 - drop kvm_io_bus_finish/prepare()

 virt/kvm/ioregion.c | 164 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 135 insertions(+), 29 deletions(-)

diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c
index 1e1c7772d274..d53e3d1cd2ff 100644
--- a/virt/kvm/ioregion.c
+++ b/virt/kvm/ioregion.c
@@ -1,10 +1,39 @@
 // SPDX-License-Identifier: GPL-2.0-only
 #include <linux/kvm_host.h>
-#include <linux/fs.h>
+#include <linux/wait.h>
 #include <kvm/iodev.h>
 #include "eventfd.h"
 #include <uapi/linux/ioregion.h>
 
+/* ioregions that share the same rfd are serialized so that only one vCPU
+ * thread sends a struct ioregionfd_cmd to userspace at a time. This
+ * ensures that the struct ioregionfd_resp received from userspace will
+ * be processed by the one and only vCPU thread that sent it.
+ *
+ * A waitqueue is used to wake up waiting vCPU threads in order. Most of
+ * the time the waitqueue is unused and the lock is not contended.
+ * For best performance userspace should set up ioregionfds so that there
+ * is no contention (e.g. dedicated ioregionfds for queue doorbell
+ * registers on multi-queue devices).
+ */
+struct ioregionfd {
+	wait_queue_head_t wq;
+	struct file *rf;
+	struct kref kref;
+	bool busy;
+};
+
+struct ioregion {
+	struct list_head list;
+	u64 paddr;		/* guest physical address */
+	u64 size;		/* size in bytes */
+	struct file *wf;
+	u64 user_data;		/* opaque token used by userspace */
+	struct kvm_io_device dev;
+	bool posted_writes;
+	struct ioregionfd *ctx;
+};
+
 void
 kvm_ioregionfd_init(struct kvm *kvm)
 {
@@ -13,29 +42,28 @@ kvm_ioregionfd_init(struct kvm *kvm)
 	INIT_LIST_HEAD(&kvm->ioregions_pio);
 }
 
-struct ioregion {
-	struct list_head list;
-	u64 paddr;		/* guest physical address */
-	u64 size;		/* size in bytes */
-	struct file *rf;
-	struct file *wf;
-	u64 user_data;		/* opaque token used by userspace */
-	struct kvm_io_device dev;
-	bool posted_writes;
-};
-
 static inline struct ioregion *
 to_ioregion(struct kvm_io_device *dev)
 {
 	return container_of(dev, struct ioregion, dev);
 }
 
+/* assumes kvm->slots_lock held */
+static void ctx_free(struct kref *kref)
+{
+	struct ioregionfd *ctx = container_of(kref, struct ioregionfd, kref);
+
+	kfree(ctx);
+}
+
 /* assumes kvm->slots_lock held */
 static void
 ioregion_release(struct ioregion *p)
 {
-	if (p->rf)
-		fput(p->rf);
+	if (p->ctx) {
+		fput(p->ctx->rf);
+		kref_put(&p->ctx->kref, ctx_free);
+	}
 	fput(p->wf);
 	list_del(&p->list);
 	kfree(p);
@@ -90,6 +118,30 @@ ioregion_save_ctx(struct kvm_vcpu *vcpu, bool in, gpa_t addr, u8 state, void *va
 	vcpu->ioregion_ctx.in = in;
 }
 
+static inline void
+ioregion_lock_ctx(struct ioregionfd *ctx)
+{
+	if (!ctx)
+		return;
+
+	spin_lock(&ctx->wq.lock);
+	wait_event_interruptible_exclusive_locked(ctx->wq, !ctx->busy);
+	ctx->busy = true;
+	spin_unlock(&ctx->wq.lock);
+}
+
+static inline void
+ioregion_unlock_ctx(struct ioregionfd *ctx)
+{
+	if (!ctx)
+		return;
+
+	spin_lock(&ctx->wq.lock);
+	ctx->busy = false;
+	wake_up_locked(&ctx->wq);
+	spin_unlock(&ctx->wq.lock);
+}
+
 static int
 ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	      int len, void *val)
@@ -115,11 +167,15 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 	}
 
+	ioregion_lock_ctx(p->ctx);
+
 send_cmd:
 	memset(&buf, 0, sizeof(buf));
 	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ,
-		      1, p->user_data, NULL))
-		return -EOPNOTSUPP;
+		      1, p->user_data, NULL)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
 	state = (ret == sizeof(buf.cmd)) ? GET_REPLY : SEND_CMD;
@@ -129,14 +185,15 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.cmd)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
-	if (!p->rf)
+	if (!p->ctx)
 		return 0;
 
 get_repl:
 	memset(&buf, 0, sizeof(buf));
-	ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+	ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 	state = (ret == sizeof(buf.resp)) ? COMPLETE : GET_REPLY;
 	if (signal_pending(current) && state == GET_REPLY) {
 		ioregion_save_ctx(vcpu, 1, addr, state, val);
@@ -144,12 +201,17 @@ ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.resp)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
 
 	memcpy(val, &buf.resp.data, len);
+	ret = 0;
 
-	return 0;
+out:
+	ioregion_unlock_ctx(p->ctx);
+
+	return ret;
 }
 
 static int
@@ -177,11 +239,15 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 	}
 
+	ioregion_lock_ctx(p->ctx);
+
 send_cmd:
 	memset(&buf, 0, sizeof(buf));
 	if (!pack_cmd(&buf.cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE,
-		      p->posted_writes ? 0 : 1, p->user_data, val))
-		return -EOPNOTSUPP;
+		      p->posted_writes ? 0 : 1, p->user_data, val)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
 
 	ret = kernel_write(p->wf, &buf.cmd, sizeof(buf.cmd), 0);
 	state = (ret == sizeof(buf.cmd)) ? GET_REPLY : SEND_CMD;
@@ -191,13 +257,14 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	}
 	if (ret != sizeof(buf.cmd)) {
 		ret = (ret < 0) ? ret : -EIO;
-		return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+		goto out;
 	}
 
 get_repl:
 	if (!p->posted_writes) {
 		memset(&buf, 0, sizeof(buf));
-		ret = kernel_read(p->rf, &buf.resp, sizeof(buf.resp), 0);
+		ret = kernel_read(p->ctx->rf, &buf.resp, sizeof(buf.resp), 0);
 		state = (ret == sizeof(buf.resp)) ? COMPLETE : GET_REPLY;
 		if (signal_pending(current) && state == GET_REPLY) {
 			ioregion_save_ctx(vcpu, 0, addr, state, (void *)val);
@@ -205,11 +272,16 @@ ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 		}
 		if (ret != sizeof(buf.resp)) {
 			ret = (ret < 0) ? ret : -EIO;
-			return (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+			ret = (ret == -EAGAIN || ret == -EWOULDBLOCK) ? -EINVAL : ret;
+			goto out;
 		}
 	}
+	ret = 0;
 
-	return 0;
+out:
+	ioregion_unlock_ctx(p->ctx);
+
+	return ret;
 }
 
 /*
@@ -285,6 +357,33 @@ get_bus_from_flags(__u32 flags)
 	return KVM_MMIO_BUS;
 }
 
+/* assumes kvm->slots_lock held */
+static bool
+ioregion_get_ctx(struct kvm *kvm, struct ioregion *p, struct file *rf, int bus_idx)
+{
+	struct ioregion *_p;
+	struct list_head *ioregions;
+
+	ioregions = get_ioregion_list(kvm, bus_idx);
+	list_for_each_entry(_p, ioregions, list)
+		if (file_inode(_p->ctx->rf)->i_ino == file_inode(rf)->i_ino) {
+			p->ctx = _p->ctx;
+			kref_get(&p->ctx->kref);
+			return true;
+		}
+
+	p->ctx = kzalloc(sizeof(*p->ctx), GFP_KERNEL_ACCOUNT);
+	if (!p->ctx)
+		return false;
+
+	p->ctx->rf = rf;
+	p->ctx->busy = false;
+	init_waitqueue_head(&p->ctx->wq);
+	kref_get(&p->ctx->kref);
+
+	return true;
+}
+
 int
 kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bus_idx)
 {
@@ -309,11 +408,10 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 	}
 
 	INIT_LIST_HEAD(&p->list);
+	p->wf = wfile;
 	p->paddr = args->guest_paddr;
 	p->size = args->memory_size;
 	p->user_data = args->user_data;
-	p->rf = rfile;
-	p->wf = wfile;
 	p->posted_writes = args->flags & KVM_IOREGION_POSTED_WRITES;
 
 	mutex_lock(&kvm->slots_lock);
@@ -322,6 +420,12 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 		ret = -EEXIST;
 		goto unlock_fail;
 	}
+
+	if (rfile && !ioregion_get_ctx(kvm, p, rfile, bus_idx)) {
+		ret = -ENOMEM;
+		goto unlock_fail;
+	}
+
 	kvm_iodevice_init(&p->dev, &ioregion_ops);
 	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->paddr, p->size,
 				      &p->dev);
@@ -335,6 +439,8 @@ kvm_set_ioregion_idx(struct kvm *kvm, struct kvm_ioregion *args, enum kvm_bus bu
 
 unlock_fail:
 	mutex_unlock(&kvm->slots_lock);
+	if (p->ctx)
+		kref_put(&p->ctx->kref, ctx_free);
 	kfree(p);
 fail:
 	if (rfile)
-- 
2.25.1
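
For context, here is the other side of the protocol this patch serializes: a
minimal, hypothetical sketch of a single-threaded userspace handler for one
ioregionfd pair. The struct layouts are restated from the draft uapi proposed
in this series (<uapi/linux/ioregion.h>) and may differ in the final version;
the fd wiring (one pipe or socketpair per direction, registered via
KVM_SET_IOREGION) and the single-register device are illustrative assumptions,
not part of this patch.

/*
 * Hypothetical userspace handler for a single ioregionfd (sketch only).
 * The info-field decoding is elided, and every command is answered as if
 * it were a read of one 64-bit register.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>

/* Draft wire format from this series, restated here as an assumption. */
struct ioregionfd_cmd {
	uint32_t info;		/* encodes cmd type, size, response flag */
	uint32_t padding;
	uint64_t user_data;	/* opaque token from KVM_SET_IOREGION */
	uint64_t offset;	/* access offset within the ioregion */
	uint64_t data;		/* value supplied for writes */
};

struct ioregionfd_resp {
	uint64_t data;		/* value returned to the guest for reads */
	uint8_t pad[24];
};

/*
 * cmd_fd is the userspace end of the fd KVM writes commands to; resp_fd
 * is the userspace end of the fd KVM reads replies from. Because the
 * kernel serializes all vCPUs that share the reply fd, at most one
 * command is in flight here, so a blocking single-threaded loop is
 * correct and no request/response matching is needed.
 */
static int serve(int cmd_fd, int resp_fd, uint64_t *reg)
{
	struct ioregionfd_cmd cmd;
	struct ioregionfd_resp resp;

	for (;;) {
		if (read(cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd))
			return -1;

		/* A real handler decodes cmd.info and replies only when
		 * a response was requested (reads, non-posted writes).
		 */
		memset(&resp, 0, sizeof(resp));
		resp.data = *reg;
		if (write(resp_fd, &resp, sizeof(resp)) != sizeof(resp))
			return -1;
	}
}

With a dedicated fd pair per doorbell register, as the comment added to
ioregion.c recommends, such a loop never blocks one vCPU behind another
vCPU's device access.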