On Fri, 2021-10-29 at 12:56 +0100, David Woodhouse wrote: > I think I've just conceded that I have to map/unmap the page at a > kernel virtual address from the MMU notifier when it becomes > present/absent, so I might get *some* of the benefits of pinning from > that (at least if I protect it with a spinlock it can't go away > *between* two consecutive accesses). Maybe... in fact from the workqueue I can borrow the mm I want with kthread_use_mm(kvm->mm) and thejn kvm_map_gfn() does work, so I've done that as a hack for now to get the rest of it working and tested. https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/xen-evtchn It's working now taking the workqueue path every time, so next I'll take a look at fixing the atomic path to do something other than just return -EWOULDBLOCK every time, by maintaining a kernel mapping of the shinfo page. From 82dcaa28d80c9ee4e2a09ac4f263be4ce59a3457 Mon Sep 17 00:00:00 2001 From: David Woodhouse <dwmw@xxxxxxxxxxxx> Date: Thu, 28 Oct 2021 23:10:31 +0100 Subject: [PATCH] KVM: x86/xen: First attempt at KVM_IRQ_ROUTING_XEN_EVTCHN Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx> --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq_comm.c | 12 + arch/x86/kvm/xen.c | 221 +++++++++++++++++- arch/x86/kvm/xen.h | 6 + include/linux/kvm_host.h | 7 + include/uapi/linux/kvm.h | 10 + .../selftests/kvm/x86_64/xen_shinfo_test.c | 99 +++++++- 7 files changed, 350 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 70771376e246..e1a4521ae838 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -606,6 +606,7 @@ struct kvm_vcpu_xen { u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; + unsigned long evtchn_pending_sel; }; struct kvm_vcpu_arch { diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index d5b72a08e566..6894f9a369f2 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -24,6 +24,7 @@ #include "hyperv.h" #include "x86.h" +#include "xen.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -175,6 +176,13 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return r; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + if (!level) + return -1; + + return kvm_xen_set_evtchn(e, kvm, true); +#endif default: break; } @@ -310,6 +318,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.vcpu = ue->u.hv_sint.vcpu; e->hv_sint.sint = ue->u.hv_sint.sint; break; +#ifdef CONFIG_KVM_XEN + case KVM_IRQ_ROUTING_XEN_EVTCHN: + return kvm_xen_setup_evtchn(kvm, e, ue); +#endif default: return -EINVAL; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index c4bca001a7c9..3ec7c62f99ec 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -13,6 +13,8 @@ #include <linux/kvm_host.h> #include <linux/sched/stat.h> +#include <asm/mmu_context.h> + #include <trace/events/kvm.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> @@ -207,6 +209,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) int __kvm_xen_has_interrupt(struct kvm_vcpu *v) { + unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel); + bool atomic = in_atomic() || !task_is_running(current); int err; u8 rc = 0; @@ -216,6 +220,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) */ struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache; struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool ghc_valid = slots->generation == ghc->generation && + !kvm_is_error_hva(ghc->hva) && ghc->memslot; + unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending); /* No need for compat handling here */ @@ -231,8 +238,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * cache in kvm_read_guest_offset_cached(), but just uses * __get_user() instead. And falls back to the slow path. */ - if (likely(slots->generation == ghc->generation && - !kvm_is_error_hva(ghc->hva) && ghc->memslot)) { + if (!evtchn_pending_sel && ghc_valid) { /* Fast path */ pagefault_disable(); err = __get_user(rc, (u8 __user *)ghc->hva + offset); @@ -251,11 +257,80 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v) * and we'll end up getting called again from a context where we *can* * fault in the page and wait for it. */ - if (in_atomic() || !task_is_running(current)) + if (atomic) return 1; - kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset, - sizeof(rc)); + if (!ghc_valid) { + err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len); + if (err && !ghc->memslot) { + /* + * If this failed, userspace has screwed up the + * vcpu_info mapping. No interrupts for you. + */ + return 0; + } + } + + /* + * Now we have a valid (protected by srcu) userspace HVA in + * ghc->hva which points to the struct vcpu_info. If there + * are any bits in the in-kernel evtchn_pending_sel then + * we need to write those to the guest vcpu_info and set + * its evtchn_upcall_pending flag. If there aren't any bits + * to add, we only want to *check* evtchn_upcall_pending. + */ + if (evtchn_pending_sel) { + if (!user_access_begin((void *)ghc->hva, sizeof(struct vcpu_info))) + return 0; + + if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) { + struct vcpu_info __user *vi = (void *)ghc->hva; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n" + "\tnotq %0\n" + "\t" LOCK_PREFIX "andq %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel) + : "m" (vi->evtchn_pending_sel), + "m" (v->arch.xen.evtchn_pending_sel), + "0" (evtchn_pending_sel)); + } else { + struct compat_vcpu_info __user *vi = (void *)ghc->hva; + u32 evtchn_pending_sel32 = evtchn_pending_sel; + + /* Attempt to set the evtchn_pending_sel bits in the + * guest, and if that succeeds then clear the same + * bits in the in-kernel version. */ + asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n" + "\tnotl %0\n" + "\t" LOCK_PREFIX "andl %0, %2\n" + "2:\n" + "\t.section .fixup,\"ax\"\n" + "3:\tjmp\t2b\n" + "\t.previous\n" + _ASM_EXTABLE_UA(1b, 3b) + : "=r" (evtchn_pending_sel32) + : "m" (vi->evtchn_pending_sel), + "m" (v->arch.xen.evtchn_pending_sel), + "0" (evtchn_pending_sel32)); + } + rc = 1; + unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err); + + err: + user_access_end(); + + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + } else { + __get_user(rc, (u8 __user *)ghc->hva + offset); + } return rc; } @@ -772,3 +847,139 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu) return 0; } + +static inline int max_evtchn_port(struct kvm *kvm) +{ + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) + return 4096; + else + return 1024; +} + +int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, bool in_atomic) +{ + bool mm_borrowed = false; + struct kvm_vcpu *vcpu; + struct kvm_host_map map; + unsigned long *pending_bits, *mask_bits; + int port_word_bit; + int idx; + int rc; + + vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu); + if (!vcpu) + return -EINVAL; + + if (!vcpu->arch.xen.vcpu_info_set) + return -EINVAL; + + if (e->xen_evtchn.port >= max_evtchn_port(kvm)) + return -EINVAL; + + if (current->mm != kvm->mm) { + if (in_atomic) + return -EWOULDBLOCK; + + /* Else we'd better be in the workqueue */ + BUG_ON(current->mm); + kthread_use_mm(kvm->mm); + mm_borrowed = true; + } + + idx = srcu_read_lock(&kvm->srcu); + + /* With no cache this is *always* going to fail in the atomic case for now */ + rc = kvm_map_gfn(vcpu, kvm->arch.xen.shinfo_gfn, &map, NULL, in_atomic); + if (rc < 0) { + if (in_atomic) + rc = -EWOULDBLOCK; + goto out_rcu; + } + + if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { + struct shared_info *shinfo = map.hva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 64; + } else { + struct compat_shared_info *shinfo = map.hva; + pending_bits = (unsigned long *)&shinfo->evtchn_pending; + mask_bits = (unsigned long *)&shinfo->evtchn_mask; + port_word_bit = e->xen_evtchn.port / 32; + } + + /* + * If this port wasn't already set, and if it isn't masked, then + * we try to set the corresponding bit in the in-kernel shadow of + * evtchn_pending_sel for the target vCPU. And if *that* wasn't + * already set, then we kick the vCPU in question to write to the + * *real* evtchn_pending_sel in its own guest vcpu_info struct. + */ + if (!test_and_set_bit(e->xen_evtchn.port, pending_bits) && + !test_bit(e->xen_evtchn.port, mask_bits) && + !test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel)) { + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } + + kvm_unmap_gfn(vcpu, &map, NULL, true, in_atomic); + out_rcu: + srcu_read_unlock(&kvm->srcu, idx); + + if (mm_borrowed) + kthread_unuse_mm(kvm->mm); + + return rc; +} + +/* This is the version called from kvm_set_irq() as the .set function */ +static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + if (!level) + return -1; + + return kvm_xen_set_evtchn(e, kvm, false); +} + +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) + +{ + struct kvm_vcpu *vcpu; + + if (kvm->arch.xen.shinfo_gfn == GPA_INVALID) + return -EINVAL; + + if (ue->u.xen_evtchn.vcpu >= KVM_MAX_VCPUS) + return -EINVAL; + + vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu); + if (!vcpu) + return -EINVAL; + + if (!vcpu->arch.xen.vcpu_info_set) + return -EINVAL; + + if (!kvm->arch.xen.upcall_vector) + return -EINVAL; + + /* Once we support the per-vCPU LAPIC based vector we will permit + * that here instead of the per-KVM upcall vector */ + + if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) + return -EINVAL; + + /* We only support 2 level event channels for now */ + if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) + return -EINVAL; + + e->xen_evtchn.port = ue->u.xen_evtchn.port; + e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu; + e->xen_evtchn.priority = ue->u.xen_evtchn.priority; + e->set = evtchn_set_fn; + + return 0; +} diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index cc0cf5f37450..3e717947b928 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -24,6 +24,12 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); +int kvm_xen_set_evtchn(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, bool in_atomic); +int kvm_xen_setup_evtchn(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue); + static inline bool kvm_xen_msr_enabled(struct kvm *kvm) { return static_branch_unlikely(&kvm_xen_enabled.key) && diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0f18df7fe874..9003fae1af9d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -470,6 +470,12 @@ struct kvm_hv_sint { u32 sint; }; +struct kvm_xen_evtchn { + u32 port; + u32 vcpu; + u32 priority; +}; + struct kvm_kernel_irq_routing_entry { u32 gsi; u32 type; @@ -490,6 +496,7 @@ struct kvm_kernel_irq_routing_entry { } msi; struct kvm_s390_adapter_int adapter; struct kvm_hv_sint hv_sint; + struct kvm_xen_evtchn xen_evtchn; }; struct hlist_node link; }; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a067410ebea5..05391c80bb6a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1143,11 +1143,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1159,6 +1168,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index a0699f00b3d6..514acf3ab73f 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -14,6 +14,9 @@ #include <stdint.h> #include <time.h> #include <sched.h> +#include <signal.h> + +#include <sys/eventfd.h> #define VCPU_ID 5 @@ -22,10 +25,12 @@ #define SHINFO_REGION_SLOT 10 #define PAGE_SIZE 4096 +#define SHINFO_ADDR (SHINFO_REGION_GPA) #define PVTIME_ADDR (SHINFO_REGION_GPA + PAGE_SIZE) #define RUNSTATE_ADDR (SHINFO_REGION_GPA + PAGE_SIZE + 0x20) #define VCPU_INFO_ADDR (SHINFO_REGION_GPA + 0x40) +#define SHINFO_VADDR (SHINFO_REGION_GVA) #define RUNSTATE_VADDR (SHINFO_REGION_GVA + PAGE_SIZE + 0x20) #define VCPU_INFO_VADDR (SHINFO_REGION_GVA + 0x40) @@ -73,15 +78,30 @@ struct vcpu_info { struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ +struct shared_info { + struct vcpu_info vcpu_info[32]; + unsigned long evtchn_pending[64]; + unsigned long evtchn_mask[64]; + struct pvclock_wall_clock wc; + uint32_t wc_sec_hi; + /* arch_shared_info here */ +}; + #define RUNSTATE_running 0 #define RUNSTATE_runnable 1 #define RUNSTATE_blocked 2 #define RUNSTATE_offline 3 +struct { + struct kvm_irq_routing info; + struct kvm_irq_routing_entry entries[2]; +} irq_routes; + static void evtchn_handler(struct ex_regs *regs) { struct vcpu_info *vi = (void *)VCPU_INFO_VADDR; vi->evtchn_upcall_pending = 0; + vi->evtchn_pending_sel = 0; GUEST_SYNC(0x20); } @@ -127,7 +147,19 @@ static void guest_code(void) GUEST_SYNC(6); GUEST_ASSERT(rs->time[RUNSTATE_runnable] >= MIN_STEAL_TIME); - GUEST_DONE(); + /* Attempt to deliver a *masked* interrupt */ + GUEST_SYNC(7); + + /* Wait until we see the bit set */ + struct shared_info *si = (void *)SHINFO_VADDR; + while (!si->evtchn_pending[0]) + __asm__ __volatile__ ("rep nop" : : : "memory"); + + /* Now deliver an *unmasked* interrupt */ + GUEST_SYNC(8); + + for (;;) + __asm__ __volatile__ ("rep nop" : : : "memory"); } static int cmp_timespec(struct timespec *a, struct timespec *b) @@ -144,6 +176,11 @@ static int cmp_timespec(struct timespec *a, struct timespec *b) return 0; } +static void handle_alrm(int sig) +{ + TEST_FAIL("IRQ delivery timed out"); +} + int main(int argc, char *argv[]) { struct timespec min_ts, max_ts, vm_ts; @@ -155,6 +192,7 @@ int main(int argc, char *argv[]) } bool do_runstate_tests = !!(xen_caps & KVM_XEN_HVM_CONFIG_RUNSTATE); + bool do_eventfd_tests = true; clock_gettime(CLOCK_REALTIME, &min_ts); @@ -214,6 +252,51 @@ int main(int argc, char *argv[]) vcpu_ioctl(vm, VCPU_ID, KVM_XEN_VCPU_SET_ATTR, &st); } + int irq_fd[2] = { -1, -1 }; + + if (do_eventfd_tests) { + irq_fd[0] = eventfd(0, 0); + irq_fd[1] = eventfd(0, 0); + + /* Unexpected, but not a KVM failure */ + if (irq_fd[0] == -1 || irq_fd[1] == -1) + do_eventfd_tests = false; + } + + if (do_eventfd_tests) { + irq_routes.info.nr = 2; + + irq_routes.entries[0].gsi = 32; + irq_routes.entries[0].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[0].u.xen_evtchn.port = 15; + irq_routes.entries[0].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[0].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + irq_routes.entries[1].gsi = 33; + irq_routes.entries[1].type = KVM_IRQ_ROUTING_XEN_EVTCHN; + irq_routes.entries[1].u.xen_evtchn.port = 66; + irq_routes.entries[1].u.xen_evtchn.vcpu = VCPU_ID; + irq_routes.entries[1].u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL; + + vm_ioctl(vm, KVM_SET_GSI_ROUTING, &irq_routes); + + struct kvm_irqfd ifd = { }; + + ifd.fd = irq_fd[0]; + ifd.gsi = 32; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + ifd.fd = irq_fd[1]; + ifd.gsi = 33; + vm_ioctl(vm, KVM_IRQFD, &ifd); + + struct sigaction sa = { }; + sa.sa_handler = handle_alrm; + sigaction(SIGALRM, &sa, NULL); + } + + struct shared_info *shinfo = addr_gpa2hva(vm, SHINFO_VADDR); + struct vcpu_info *vinfo = addr_gpa2hva(vm, VCPU_INFO_VADDR); vinfo->evtchn_upcall_pending = 0; @@ -289,9 +372,23 @@ int main(int argc, char *argv[]) sched_yield(); } while (get_run_delay() < rundelay); break; + case 7: + if (!do_eventfd_tests) + goto done; + shinfo->evtchn_mask[0] = 0x8000; + eventfd_write(irq_fd[0], 1UL); + alarm(1); + break; + case 8: + eventfd_write(irq_fd[1], 1UL); + evtchn_irq_expected = true; + break; + case 0x20: TEST_ASSERT(evtchn_irq_expected, "Unexpected event channel IRQ"); evtchn_irq_expected = false; + if (shinfo->evtchn_pending[1]) + goto done; break; } break; -- 2.31.1
Attachment:
smime.p7s
Description: S/MIME cryptographic signature