From: Mihai Donțu <mdontu@xxxxxxxxxxxxxxx> From preread, prewrite and preexec callbacks we will send the KVMI_EVENT_PF events caused by access rights enforced by the introspection tool. Signed-off-by: Mihai Donțu <mdontu@xxxxxxxxxxxxxxx> Co-developed-by: Nicușor Cîțu <ncitu@xxxxxxxxxxxxxxx> Signed-off-by: Nicușor Cîțu <ncitu@xxxxxxxxxxxxxxx> Co-developed-by: Marian Rotariu <marian.c.rotariu@xxxxxxxxx> Signed-off-by: Marian Rotariu <marian.c.rotariu@xxxxxxxxx> Co-developed-by: Adalbert Lazăr <alazar@xxxxxxxxxxxxxxx> Signed-off-by: Adalbert Lazăr <alazar@xxxxxxxxxxxxxxx> --- arch/x86/include/asm/kvmi_host.h | 12 ++ arch/x86/kvm/kvmi.c | 45 +++++ include/uapi/linux/kvmi.h | 4 + virt/kvm/kvmi.c | 293 ++++++++++++++++++++++++++++++- virt/kvm/kvmi_int.h | 21 +++ 5 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/kvmi_host.h diff --git a/arch/x86/include/asm/kvmi_host.h b/arch/x86/include/asm/kvmi_host.h new file mode 100644 index 000000000000..7ab6dd71a0c2 --- /dev/null +++ b/arch/x86/include/asm/kvmi_host.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVMI_HOST_H +#define _ASM_X86_KVMI_HOST_H + +#include <asm/kvm_host.h> +#include <asm/kvm_page_track.h> + +struct kvmi_arch_mem_access { + unsigned long active[KVM_PAGE_TRACK_MAX][BITS_TO_LONGS(KVM_MEM_SLOTS_NUM)]; +}; + +#endif /* _ASM_X86_KVMI_HOST_H */ diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c index 97c72cdc6fb0..d7b9201582b4 100644 --- a/arch/x86/kvm/kvmi.c +++ b/arch/x86/kvm/kvmi.c @@ -91,6 +91,12 @@ void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct kvmi_event *ev) kvmi_get_msrs(vcpu, event); } +bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + u8 access) +{ + return KVMI_EVENT_ACTION_CONTINUE; /* TODO */ +} + int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu, struct kvmi_get_vcpu_info_reply *rpl) { @@ -102,3 +108,42 @@ int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu, return 0; } +static const 
struct { + unsigned int allow_bit; + enum kvm_page_track_mode track_mode; +} track_modes[] = { + { KVMI_PAGE_ACCESS_R, KVM_PAGE_TRACK_PREREAD }, + { KVMI_PAGE_ACCESS_W, KVM_PAGE_TRACK_PREWRITE }, + { KVMI_PAGE_ACCESS_X, KVM_PAGE_TRACK_PREEXEC }, +}; + +void kvmi_arch_update_page_tracking(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvmi_mem_access *m) +{ + struct kvmi_arch_mem_access *arch = &m->arch; + int i; + + if (!slot) { + slot = gfn_to_memslot(kvm, m->gfn); + if (!slot) + return; + } + + for (i = 0; i < ARRAY_SIZE(track_modes); i++) { + unsigned int allow_bit = track_modes[i].allow_bit; + enum kvm_page_track_mode mode = track_modes[i].track_mode; + bool slot_tracked = test_bit(slot->id, arch->active[mode]); + + if (m->access & allow_bit) { + if (slot_tracked) { + kvm_slot_page_track_remove_page(kvm, slot, + m->gfn, mode); + clear_bit(slot->id, arch->active[mode]); + } + } else if (!slot_tracked) { + kvm_slot_page_track_add_page(kvm, slot, m->gfn, mode); + set_bit(slot->id, arch->active[mode]); + } + } +} diff --git a/include/uapi/linux/kvmi.h b/include/uapi/linux/kvmi.h index aa5bc909e278..c56e676ddb2b 100644 --- a/include/uapi/linux/kvmi.h +++ b/include/uapi/linux/kvmi.h @@ -70,6 +70,10 @@ enum { #define KVMI_EVENT_ACTION_RETRY 1 #define KVMI_EVENT_ACTION_CRASH 2 +#define KVMI_PAGE_ACCESS_R (1 << 0) +#define KVMI_PAGE_ACCESS_W (1 << 1) +#define KVMI_PAGE_ACCESS_X (1 << 2) + #define KVMI_MSG_SIZE (4096 - sizeof(struct kvmi_msg_hdr)) struct kvmi_msg_hdr { diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c index d0d9adf5b6ed..5cbc82b284f4 100644 --- a/virt/kvm/kvmi.c +++ b/virt/kvm/kvmi.c @@ -11,10 +11,27 @@ #include <linux/bitmap.h> static struct kmem_cache *msg_cache; +static struct kmem_cache *radix_cache; static struct kmem_cache *job_cache; static bool kvmi_create_vcpu_event(struct kvm_vcpu *vcpu); static void kvmi_abort_events(struct kvm *kvm); +static bool kvmi_track_preread(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + u8 *new, int bytes, 
struct kvm_page_track_notifier_node *node, + bool *data_ready); +static bool kvmi_track_prewrite(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + const u8 *new, int bytes, struct kvm_page_track_notifier_node *node); +static bool kvmi_track_preexec(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + struct kvm_page_track_notifier_node *node); +static void kvmi_track_create_slot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages, + struct kvm_page_track_notifier_node *node); +static void kvmi_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); + +static const u8 full_access = KVMI_PAGE_ACCESS_R | + KVMI_PAGE_ACCESS_W | + KVMI_PAGE_ACCESS_X; void *kvmi_msg_alloc(void) { @@ -34,23 +51,96 @@ void kvmi_msg_free(void *addr) kmem_cache_free(msg_cache, addr); } +static struct kvmi_mem_access *__kvmi_get_gfn_access(struct kvmi *ikvm, + const gfn_t gfn) +{ + return radix_tree_lookup(&ikvm->access_tree, gfn); +} + +static int kvmi_get_gfn_access(struct kvmi *ikvm, const gfn_t gfn, + u8 *access) +{ + struct kvmi_mem_access *m; + + *access = full_access; + + read_lock(&ikvm->access_tree_lock); + m = __kvmi_get_gfn_access(ikvm, gfn); + if (m) + *access = m->access; + read_unlock(&ikvm->access_tree_lock); + + return m ? 
0 : -1; +} + +static bool kvmi_restricted_access(struct kvmi *ikvm, gpa_t gpa, u8 access) +{ + u8 allowed_access; + int err; + + err = kvmi_get_gfn_access(ikvm, gpa_to_gfn(gpa), &allowed_access); + + if (err) + return false; + + /* + * We want to be notified only for violations involving access + * bits that we've specifically cleared + */ + if ((~allowed_access) & access) + return true; + + return false; +} + +static void kvmi_clear_mem_access(struct kvm *kvm) +{ + void **slot; + struct radix_tree_iter iter; + struct kvmi *ikvm = IKVM(kvm); + int idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + write_lock(&ikvm->access_tree_lock); + + radix_tree_for_each_slot(slot, &ikvm->access_tree, &iter, 0) { + struct kvmi_mem_access *m = *slot; + + m->access = full_access; + kvmi_arch_update_page_tracking(kvm, NULL, m); + + radix_tree_iter_delete(&ikvm->access_tree, &iter, slot); + kmem_cache_free(radix_cache, m); + } + + write_unlock(&ikvm->access_tree_lock); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); +} + static void kvmi_cache_destroy(void) { kmem_cache_destroy(msg_cache); msg_cache = NULL; + kmem_cache_destroy(radix_cache); + radix_cache = NULL; kmem_cache_destroy(job_cache); job_cache = NULL; } static int kvmi_cache_create(void) { + radix_cache = kmem_cache_create("kvmi_radix_tree", + sizeof(struct kvmi_mem_access), + 0, SLAB_ACCOUNT, NULL); job_cache = kmem_cache_create("kvmi_job", sizeof(struct kvmi_job), 0, SLAB_ACCOUNT, NULL); msg_cache = kmem_cache_create("kvmi_msg", KVMI_MSG_SIZE_ALLOC, 4096, SLAB_ACCOUNT, NULL); - if (!msg_cache || !job_cache) { + if (!msg_cache || !radix_cache || !job_cache) { kvmi_cache_destroy(); return -1; @@ -77,6 +167,10 @@ static bool alloc_kvmi(struct kvm *kvm, const struct kvm_introspection *qemu) if (!ikvm) return false; + /* see comments of radix_tree_preload() - no direct reclaim */ + INIT_RADIX_TREE(&ikvm->access_tree, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM); + 
rwlock_init(&ikvm->access_tree_lock); + atomic_set(&ikvm->ev_seq, 0); set_bit(KVMI_GET_VERSION, ikvm->cmd_allow_mask); @@ -85,6 +179,12 @@ static bool alloc_kvmi(struct kvm *kvm, const struct kvm_introspection *qemu) memcpy(&ikvm->uuid, &qemu->uuid, sizeof(ikvm->uuid)); + ikvm->kptn_node.track_preread = kvmi_track_preread; + ikvm->kptn_node.track_prewrite = kvmi_track_prewrite; + ikvm->kptn_node.track_preexec = kvmi_track_preexec; + ikvm->kptn_node.track_create_slot = kvmi_track_create_slot; + ikvm->kptn_node.track_flush_slot = kvmi_track_flush_slot; + ikvm->kvm = kvm; kvm->kvmi = ikvm; @@ -276,6 +376,179 @@ void kvmi_vcpu_uninit(struct kvm_vcpu *vcpu) vcpu->kvmi = NULL; } +static bool is_pf_of_interest(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access) +{ + struct kvm *kvm = vcpu->kvm; + + if (kvm_mmu_nested_pagefault(vcpu)) + return false; + + /* Have we shown interest in this page? */ + return kvmi_restricted_access(IKVM(kvm), gpa, access); +} + +static bool __kvmi_track_preread(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + u8 *new, int bytes, struct kvm_page_track_notifier_node *node, + bool *data_ready) +{ + bool ret; + + if (!is_pf_of_interest(vcpu, gpa, KVMI_PAGE_ACCESS_R)) + return true; + + ret = kvmi_arch_pf_event(vcpu, gpa, gva, KVMI_PAGE_ACCESS_R); + + return ret; +} + +static bool kvmi_track_preread(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + u8 *new, int bytes, struct kvm_page_track_notifier_node *node, + bool *data_ready) +{ + struct kvmi *ikvm; + bool ret = true; + + ikvm = kvmi_get(vcpu->kvm); + if (!ikvm) + return true; + + if (is_event_enabled(vcpu, KVMI_EVENT_PF)) + ret = __kvmi_track_preread(vcpu, gpa, gva, new, bytes, node, + data_ready); + + kvmi_put(vcpu->kvm); + + return ret; +} + +static bool __kvmi_track_prewrite(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) +{ + if (!is_pf_of_interest(vcpu, gpa, KVMI_PAGE_ACCESS_W)) + return true; + + return kvmi_arch_pf_event(vcpu, 
gpa, gva, KVMI_PAGE_ACCESS_W); +} + +static bool kvmi_track_prewrite(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) +{ + struct kvmi *ikvm; + bool ret = true; + + ikvm = kvmi_get(vcpu->kvm); + if (!ikvm) + return true; + + if (is_event_enabled(vcpu, KVMI_EVENT_PF)) + ret = __kvmi_track_prewrite(vcpu, gpa, gva, new, bytes, node); + + kvmi_put(vcpu->kvm); + + return ret; +} + +static bool __kvmi_track_preexec(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + struct kvm_page_track_notifier_node *node) +{ + if (!is_pf_of_interest(vcpu, gpa, KVMI_PAGE_ACCESS_X)) + return true; + + return kvmi_arch_pf_event(vcpu, gpa, gva, KVMI_PAGE_ACCESS_X); +} + +static bool kvmi_track_preexec(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + struct kvm_page_track_notifier_node *node) +{ + struct kvmi *ikvm; + bool ret = true; + + ikvm = kvmi_get(vcpu->kvm); + if (!ikvm) + return true; + + if (is_event_enabled(vcpu, KVMI_EVENT_PF)) + ret = __kvmi_track_preexec(vcpu, gpa, gva, node); + + kvmi_put(vcpu->kvm); + + return ret; +} + +static void kvmi_track_create_slot(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned long npages, + struct kvm_page_track_notifier_node *node) +{ + struct kvmi *ikvm; + gfn_t start = slot->base_gfn; + const gfn_t end = start + npages; + int idx; + + ikvm = kvmi_get(kvm); + if (!ikvm) + return; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + read_lock(&ikvm->access_tree_lock); + + while (start < end) { + struct kvmi_mem_access *m; + + m = __kvmi_get_gfn_access(ikvm, start); + if (m) + kvmi_arch_update_page_tracking(kvm, slot, m); + start++; + } + + read_unlock(&ikvm->access_tree_lock); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + kvmi_put(kvm); +} + +static void kvmi_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + struct kvmi *ikvm; + gfn_t start = slot->base_gfn; + const 
gfn_t end = start + slot->npages; + int idx; + + ikvm = kvmi_get(kvm); + if (!ikvm) + return; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + write_lock(&ikvm->access_tree_lock); + + while (start < end) { + struct kvmi_mem_access *m; + + m = __kvmi_get_gfn_access(ikvm, start); + if (m) { + u8 prev_access = m->access; + + m->access = full_access; + kvmi_arch_update_page_tracking(kvm, slot, m); + m->access = prev_access; + } + + start++; + } + + write_unlock(&ikvm->access_tree_lock); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + kvmi_put(kvm); +} + static void kvmi_end_introspection(struct kvmi *ikvm) { struct kvm *kvm = ikvm->kvm; @@ -290,6 +563,22 @@ static void kvmi_end_introspection(struct kvmi *ikvm) */ kvmi_abort_events(kvm); + /* + * This may sleep on synchronize_srcu() so it's not allowed to be + * called under kvmi_put(). + * Also synchronize_srcu() may deadlock on (page tracking) read-side + * regions that are waiting for reply to events, so must be called + * after kvmi_abort_events(). + */ + kvm_page_track_unregister_notifier(kvm, &ikvm->kptn_node); + + /* + * This function uses kvm->mmu_lock so it's not allowed to be + * called under kvmi_put(). It can reach a deadlock if called + * from kvm_mmu_load -> kvmi_tracked_gfn -> kvmi_put. + */ + kvmi_clear_mem_access(kvm); + /* * At this moment the socket is shut down, no more commands will come * from the introspector, and the only way into the introspection is @@ -351,6 +640,8 @@ int kvmi_hook(struct kvm *kvm, const struct kvm_introspection *qemu) goto err_alloc; } + kvm_page_track_register_notifier(kvm, &ikvm->kptn_node); + /* * Make sure all the KVM/KVMI structures are linked and no pointer * is read as NULL after the reference count has been set. 
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h index 7cff91bc1acc..d798908d0f70 100644 --- a/virt/kvm/kvmi_int.h +++ b/virt/kvm/kvmi_int.h @@ -6,6 +6,7 @@ #include <linux/kvm_host.h> #include <uapi/linux/kvmi.h> +#include <asm/kvmi_host.h> #define kvmi_debug(ikvm, fmt, ...) \ kvm_debug("%pU " fmt, &ikvm->uuid, ## __VA_ARGS__) @@ -104,6 +105,10 @@ struct kvmi_vcpu { struct kvmi { struct kvm *kvm; + struct kvm_page_track_notifier_node kptn_node; + + struct radix_tree_root access_tree; + rwlock_t access_tree_lock; struct socket *sock; struct task_struct *recv; @@ -118,6 +123,17 @@ struct kvmi { bool cmd_reply_disabled; }; +struct kvmi_mem_access { + gfn_t gfn; + u8 access; + struct kvmi_arch_mem_access arch; +}; + +static inline bool is_event_enabled(struct kvm_vcpu *vcpu, int event) +{ + return false; /* TODO */ +} + /* kvmi_msg.c */ bool kvmi_sock_get(struct kvmi *ikvm, int fd); void kvmi_sock_shutdown(struct kvmi *ikvm); @@ -138,7 +154,12 @@ int kvmi_add_job(struct kvm_vcpu *vcpu, void *ctx, void (*free_fct)(void *ctx)); /* arch */ +void kvmi_arch_update_page_tracking(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvmi_mem_access *m); void kvmi_arch_setup_event(struct kvm_vcpu *vcpu, struct kvmi_event *ev); +bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva, + u8 access); int kvmi_arch_cmd_get_vcpu_info(struct kvm_vcpu *vcpu, struct kvmi_get_vcpu_info_reply *rpl);