From: Ankur Arora <ankur.a.arora@xxxxxxxxxx>

Introduce support for mapping grant references. The sequence of events
to map a grant is:

  rframe = read_shared_entry(guest_grant_table, grant-ref);
  rpfn = get_user_pages_remote(remote_mm, rframe);
  mark_shared_entry(guest_grant_table, grant-ref, GTF_reading | GTF_writing);

To correctly handle grant unmaps for mapped grants, we save the mapping
parameters in maptrack.

Also, grant map (and unmap) can be called from non-sleeping contexts, so
we call get_user_pages_remote() in non-blocking mode and ask the user to
retry.

Also note that this code is not compliant with Xen's grant map/unmap
ABI. In particular, we do not support multiple simultaneous mappings of
a grant-reference. Later versions will support that.

Co-developed-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
---
 arch/x86/kvm/xen.c | 435 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 435 insertions(+)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 645cd22ab4e7..3603645086a7 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -9,6 +9,7 @@
 #include "xen.h"
 #include "ioapic.h"
 
+#include <linux/mman.h>
 #include <linux/kvm_host.h>
 #include <linux/eventfd.h>
 #include <linux/sched/stat.h>
@@ -29,9 +30,11 @@
 
 /* Grant v1 references per 4K page */
 #define GPP_V1 (PAGE_SIZE / sizeof(struct grant_entry_v1))
+#define shared_entry(gt, ref)	(&((gt)[(ref) / GPP_V1][(ref) % GPP_V1]))
 
 /* Grant mappings per 4K page */
 #define MPP (PAGE_SIZE / sizeof(struct kvm_grant_map))
+#define maptrack_entry(mt, hdl)	(&((mt)[(hdl) / MPP][(hdl) % MPP]))
 
 struct evtchnfd {
 	struct eventfd_ctx *ctx;
@@ -81,6 +84,22 @@ static int kvm_xen_domid_init(struct kvm *kvm, bool any, domid_t domid)
 	return 0;
 }
 
+/*
+ * Look up the VM registered for @domid in domid_to_kvm under domid_lock.
+ * The result of idr_find() is returned as is and no reference is taken on it.
+ */
+static struct kvm *kvm_xen_find_vm(domid_t domid)
+{
+	unsigned long flags;
+	struct kvm *vm;
+
+	read_lock_irqsave(&domid_lock, flags);
+	vm = idr_find(&domid_to_kvm, domid);
+	read_unlock_irqrestore(&domid_lock, flags);
+
+	return vm;
+}
+
 int kvm_xen_free_domid(struct kvm *kvm)
 {
 	struct kvm_xen *xen = &kvm->arch.xen;
@@ -1153,7 +1172,20 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
 	gnttab->frames = addr;
 	gnttab->frames[0] = xen->gnttab.initial;
 	gnttab->max_nr_frames = max_frames;
+
+	addr = kcalloc(max_mt_frames, sizeof(addr), GFP_KERNEL);
+	if (!addr)
+		goto out;
+
+	/* Needs to be aligned at 16b boundary. */
+	gnttab->handle = addr;
 	gnttab->max_mt_frames = max_mt_frames;
+
+	addr = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!addr)
+		goto out;
+	gnttab->handle[0] = addr;
+
 	gnttab->nr_mt_frames = 1;
 	gnttab->nr_frames = 0;
 
@@ -1162,6 +1194,7 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
 	return 0;
 
 out:
+	kfree(xen->gnttab.handle);
 	kfree(xen->gnttab.frames);
 	kfree(xen->gnttab.frames_addr);
 	if (page)
@@ -1170,11 +1203,39 @@ int kvm_xen_gnttab_init(struct kvm *kvm, struct kvm_xen *xen,
 	return -ENOMEM;
 }
 
+/* Release the page reference of every maptrack entry still flagged active. */
+static void kvm_xen_maptrack_free(struct kvm_xen *xen)
+{
+	u32 max_entries = xen->gnttab.nr_mt_frames * MPP;
+	struct kvm_grant_map *map;
+	u32 ref, inuse = 0;
+
+	for (ref = 0; ref < max_entries; ref++) {
+		map = maptrack_entry(xen->gnttab.handle, ref);
+
+		if (test_and_clear_bit(_KVM_GNTMAP_ACTIVE,
+				       (unsigned long *)&map->flags)) {
+			put_page(virt_to_page(map->gpa));
+			inuse++;
+		}
+	}
+
+	if (inuse)
+		pr_debug("kvm: dom%u teardown %u mappings\n",
+			 xen->domid, inuse);
+}
+
 void kvm_xen_gnttab_free(struct kvm_xen *xen)
 {
 	struct kvm_grant_table *gnttab = &xen->gnttab;
 	int i;
 
+	if (xen->domid)
+		kvm_xen_maptrack_free(xen);
+
+	for (i = 0; i < gnttab->nr_mt_frames; i++)
+		free_page((unsigned long)gnttab->handle[i]);
+
 	for (i = 0; i < gnttab->nr_frames; i++)
 		put_page(virt_to_page(gnttab->frames[i]));
 
@@ -1313,6 +1374,377 @@ void kvm_xen_unregister_lcall(void)
 }
 EXPORT_SYMBOL_GPL(kvm_xen_unregister_lcall);
 
+/*
+ * Number of v1 grant entries covered by the grant table frames of @kvm.
+ * A table with no frames is treated as having one.
+ */
+static inline int gnttab_entries(struct kvm *kvm)
+{
+	struct kvm_grant_table *gnttab = &kvm->arch.xen.gnttab;
+	int n = max_t(unsigned int, gnttab->nr_frames, 1);
+
+	/* Every frame holds GPP_V1 entries, so the count scales linearly. */
+	return n * GPP_V1;
+}
+
+/*
+ * The first two members of a grant entry are updated as a combined pair.
+ * The following union allows that to happen in an endian-neutral fashion.
+ * Taken from Xen.
+ */
+union grant_combo {
+	uint32_t word;
+	struct {
+		uint16_t flags;
+		domid_t domid;
+	} shorts;
+};
+
+/*
+ * Marks a grant in use. Code largely borrowed from Xen.
+ *
+ * Sets GTF_reading (and GTF_writing unless @readonly) in the shared entry
+ * @shah, provided it is a GTF_permit_access grant made to @domid.
+ * Returns GNTST_okay on success and GNTST_general_error otherwise.
+ */
+static int set_grant_status(domid_t domid, bool readonly,
+			    struct grant_entry_v1 *shah)
+{
+	int rc = GNTST_okay;
+	union grant_combo scombo, prev_scombo, new_scombo;
+	uint16_t mask = GTF_type_mask;
+
+	/*
+	 * We bound the number of times we retry CMPXCHG on memory locations
+	 * that we share with a guest OS. The reason is that the guest can
+	 * modify that location at a higher rate than we can
+	 * read-modify-CMPXCHG, so the guest could cause us to livelock. There
+	 * are a few cases where it is valid for the guest to race our updates
+	 * (e.g., to change the GTF_readonly flag), so we allow a few retries
+	 * before failing.
+	 */
+	int retries = 0;
+
+	scombo.word = READ_ONCE(*(u32 *)shah);
+
+	/*
+	 * This loop attempts to set the access (reading/writing) flags
+	 * in the grant table entry. It tries a cmpxchg on the field
+	 * up to five times, and then fails under the assumption that
+	 * the guest is misbehaving.
+	 */
+	for (;;) {
+		/* If not already pinned, check the grant domid and type. */
+		if ((((scombo.shorts.flags & mask) != GTF_permit_access) ||
+		     (scombo.shorts.domid != domid))) {
+			rc = GNTST_general_error;
+			pr_err("Bad flags (%x) or dom (%d); expected d%d\n",
+				scombo.shorts.flags, scombo.shorts.domid,
+				domid);
+			return rc;
+		}
+
+		new_scombo = scombo;
+		new_scombo.shorts.flags |= GTF_reading;
+
+		if (!readonly) {
+			new_scombo.shorts.flags |= GTF_writing;
+			if (unlikely(scombo.shorts.flags & GTF_readonly)) {
+				rc = GNTST_general_error;
+				pr_err("Attempt to write-pin a r/o grant entry\n");
+				return rc;
+			}
+		}
+
+		prev_scombo.word = cmpxchg((u32 *)shah,
+					   scombo.word, new_scombo.word);
+		if (likely(prev_scombo.word == scombo.word))
+			break;
+
+		if (retries++ == 4) {
+			rc = GNTST_general_error;
+			pr_err("Shared grant entry is unstable\n");
+			return rc;
+		}
+
+		scombo = prev_scombo;
+	}
+
+	return rc;
+}
+
+/* A maptrack handle packs a domid above a 17-bit grant reference. */
+#define MT_HANDLE_DOMID_SHIFT	17
+#define MT_HANDLE_DOMID_MASK	0x7fff
+#define MT_HANDLE_GREF_MASK	0x1ffff
+
+static u32 handle_get(domid_t domid, grant_ref_t ref)
+{
+	return (domid << MT_HANDLE_DOMID_SHIFT) | ref;
+}
+
+static u16 handle_get_domid(grant_handle_t handle)
+{
+	return (handle >> MT_HANDLE_DOMID_SHIFT) & MT_HANDLE_DOMID_MASK;
+}
+
+static grant_ref_t handle_get_grant(grant_handle_t handle)
+{
+	return handle & MT_HANDLE_GREF_MASK;
+}
+
+/*
+ * Take a reference on the page backing guest frame @frame of @rd without
+ * sleeping. Returns 0 with the page in @page and GNTST_okay in @err, or a
+ * negative errno with a GNTST_* code in @err; -EBUSY/GNTST_eagain means the
+ * caller should retry. @err and @page must not be NULL (-EINVAL otherwise).
+ */
+static int map_grant_nosleep(struct kvm *rd, u64 frame, bool readonly,
+			     struct page **page, u16 *err)
+{
+	unsigned long rhva;
+	int gup_flags, non_blocking;
+	int ret;
+
+	/* Validate @err before the first store through it. */
+	if (!err)
+		return -EINVAL;
+
+	*err = GNTST_general_error;
+	if (!page)
+		return -EINVAL;
+
+	rhva = gfn_to_hva(rd, frame);
+	if (kvm_is_error_hva(rhva)) {
+		*err = GNTST_bad_page;
+		return -EFAULT;
+	}
+
+	gup_flags = (readonly ? 0 : FOLL_WRITE) | FOLL_NOWAIT;
+
+	/* get_user_pages will reset this were IO to be needed */
+	non_blocking = 1;
+
+	/*
+	 * get_user_pages_*() family of functions can sleep if the page needs
+	 * to be mapped in. However, our main consumer is the grant map
+	 * hypercall and because we run in the same context as the caller
+	 * (unlike a real hypercall) sleeping is not an option.
+	 *
+	 * This is how we avoid it:
+	 * - sleeping on mmap_sem acquisition: we handle that by acquiring the
+	 *   read-lock before calling.
+	 *   If mmap_sem is contended, return with GNTST_eagain.
+	 * - sync wait for pages to be swapped in: specify FOLL_NOWAIT. If IO
+	 *   was needed, would be returned via @non_blocking. Return
+	 *   GNTST_eagain if it is necessary and the user would retry.
+	 *   Also, in the blocking case, mmap_sem will be released
+	 *   asynchronously when the IO completes.
+	 */
+	ret = down_read_trylock(&rd->mm->mmap_sem);
+	if (ret == 0) {
+		*err = GNTST_eagain;
+		return -EBUSY;
+	}
+
+	ret = get_user_pages_remote(rd->mm->owner, rd->mm, rhva, 1, gup_flags,
+				    page, NULL, &non_blocking);
+	if (non_blocking)
+		up_read(&rd->mm->mmap_sem);
+
+	if (ret == 1) {
+		*err = GNTST_okay;
+	} else if (ret == 0) {
+		*err = GNTST_eagain;
+		ret = -EBUSY;
+	} else if (ret < 0) {
+		pr_err("gnttab: failed to get pfn for hva %lx, err %d\n",
+			rhva, ret);
+		if (ret == -EFAULT) {
+			*err = GNTST_bad_page;
+		} else if (ret == -EBUSY) {
+			WARN_ON(non_blocking);
+			*err = GNTST_eagain;
+		} else {
+			*err = GNTST_general_error;
+		}
+	}
+
+	return (ret >= 0) ? 0 : ret;
+}
+
+/*
+ * Handle one GNTTABOP_map_grant_ref request issued by local domain @ld.
+ * The outcome is reported through @op->status; the return value is always 0.
+ */
+static int shim_hcall_gntmap(struct kvm_xen *ld,
+			     struct gnttab_map_grant_ref *op)
+{
+	struct kvm_grant_map map_old, map_new, *map = NULL;
+	bool readonly = op->flags & GNTMAP_readonly;
+	struct grant_entry_v1 *shah;
+	struct page *page = NULL;
+	unsigned long host_kaddr;
+	int err = -ENOSYS;
+	struct kvm *rd;
+	kvm_pfn_t rpfn;
+	u32 frame;
+	u32 idx;
+
+	BUILD_BUG_ON(sizeof(*map) != 16);
+
+	if (unlikely((op->host_addr))) {
+		pr_err("gnttab: bad host_addr %llx in map\n", op->host_addr);
+		op->status = GNTST_bad_virt_addr;
+		return 0;
+	}
+
+	/*
+	 * Make sure the guest does not try to smuggle any flags here
+	 * (for instance _KVM_GNTMAP_ACTIVE.)
+	 * The only allowable flag is GNTMAP_readonly.
+	 */
+	if (unlikely(op->flags & ~((u16) GNTMAP_readonly))) {
+		pr_err("gnttab: bad flags %x in map\n", op->flags);
+		op->status = GNTST_bad_gntref;
+		return 0;
+	}
+
+	rd = kvm_xen_find_vm(op->dom);
+	if (unlikely(!rd)) {
+		pr_err("gnttab: could not find domain %u\n", op->dom);
+		op->status = GNTST_bad_domain;
+		return 0;
+	}
+
+	if (unlikely(op->ref >= gnttab_entries(rd))) {
+		pr_err("gnttab: bad ref %u\n", op->ref);
+		op->status = GNTST_bad_gntref;
+		return 0;
+	}
+
+	/*
+	 * shah is potentially controlled by the user. We cache the frame but
+	 * don't care about any changes to domid or flags since those get
+	 * validated in set_grant_status() anyway.
+	 *
+	 * Note that if the guest changes the frame we will end up mapping the
+	 * old frame.
+	 */
+	shah = shared_entry(rd->arch.xen.gnttab.frames_v1, op->ref);
+	frame = READ_ONCE(shah->frame);
+
+	if (unlikely(shah->domid != ld->domid)) {
+		pr_err("gnttab: bad domain (%u != %u)\n",
+			shah->domid, ld->domid);
+		op->status = GNTST_bad_gntref;
+		goto out;
+	}
+
+	/*
+	 * The maptrack is indexed by grant reference, so besides fitting in the
+	 * handle encoding the reference must fall within the allocated frames.
+	 */
+	idx = handle_get(op->dom, op->ref);
+	if (handle_get_grant(idx) < op->ref ||
+	    handle_get_domid(idx) < op->dom ||
+	    op->ref >= rd->arch.xen.gnttab.nr_mt_frames * MPP) {
+		pr_err("gnttab: out of maptrack entries (dom %u)\n", ld->domid);
+		op->status = GNTST_general_error;
+		goto out;
+	}
+
+	map = maptrack_entry(rd->arch.xen.gnttab.handle, op->ref);
+
+	/*
+	 * Cache the old map value so we can do our checks on the stable
+	 * version. Once the map is done, swap the mapping with the new map.
+	 */
+	map_old = *map;
+	if (map_old.flags & KVM_GNTMAP_ACTIVE) {
+		pr_err("gnttab: grant ref %u dom %u in use\n",
+			op->ref, ld->domid);
+		op->status = GNTST_bad_gntref;
+		goto out;
+	}
+
+	err = map_grant_nosleep(rd, frame, readonly, &page, &op->status);
+	if (err) {
+		if (err != -EBUSY)
+			op->status = GNTST_bad_gntref;
+		goto out;
+	}
+
+	err = set_grant_status(ld->domid, readonly, shah);
+	if (err != GNTST_okay) {
+		pr_err("gnttab: pin failed\n");
+		put_page(page);
+		op->status = err;
+		goto out;
+	}
+
+	rpfn = page_to_pfn(page);
+	host_kaddr = (unsigned long) pfn_to_kaddr(rpfn);
+
+	map_new.domid = op->dom;
+	map_new.ref = op->ref;
+	map_new.flags = op->flags;
+	map_new.gpa = host_kaddr;
+
+	map_new.flags |= KVM_GNTMAP_ACTIVE;
+
+	/*
+	 * Protect against a grant-map that could come in between our check for
+	 * KVM_GNTMAP_ACTIVE above and assuming the ownership of the mapping.
+	 *
+	 * Use cmpxchg_double() so we can update mapping atomically (which
+	 * luckily fits in 16b.)
+	 */
+	if (cmpxchg_double(&map->gpa, &map->fields,
+			map_old.gpa, map_old.fields,
+			map_new.gpa, map_new.fields) == false) {
+		put_page(page);
+		op->status = GNTST_bad_gntref;
+		goto out;
+	}
+
+	op->dev_bus_addr = rpfn << PAGE_SHIFT;
+	op->handle = idx;
+	op->status = GNTST_okay;
+	op->host_addr = host_kaddr;
+	return 0;
+
+out:
+	/* The error code is stored in @status. */
+	return 0;
+}
+
+/*
+ * Grant table hypercall dispatcher: @p points to @count operations of type
+ * @op. Only GNTTABOP_map_grant_ref is handled; anything else is -ENOSYS.
+ */
+static int shim_hcall_gnttab(int op, void *p, int count)
+{
+	int ret = -ENOSYS;
+	int i;
+
+	switch (op) {
+	case GNTTABOP_map_grant_ref: {
+		struct gnttab_map_grant_ref *ref = p;
+
+		for (i = 0; i < count; i++)
+			shim_hcall_gntmap(xen_shim, ref + i);
+		ret = 0;
+		break;
+	}
+	default:
+		pr_info("lcall-gnttab:op default=%d\n", op);
+		break;
+	}
+
+	return ret;
+}
+
 static int shim_hcall_version(int op, struct xen_feature_info *fi)
 {
 	if (op != XENVER_get_features || !fi || fi->submap_idx != 0)
@@ -1330,6 +1762,9 @@ static int shim_hypercall(u64 code, u64 a0, u64 a1, u64 a2, u64 a3, u64 a4)
 	int ret = -ENOSYS;
 
 	switch (code) {
+	case __HYPERVISOR_grant_table_op:
+		ret = shim_hcall_gnttab((int) a0, (void *) a1, (int) a2);
+		break;
 	case __HYPERVISOR_xen_version:
 		ret = shim_hcall_version((int)a0, (void *)a1);
 		break;
-- 
2.11.0