From: Nicușor Cîțu <ncitu@xxxxxxxxxxxxxxx>

Single-stepping is used either when the introspection tool requests it
as a reply to a KVMI_EVENT_PF event, or to cope with instructions that
cannot be handled by the x86 emulator during the handling of a VMEXIT.
In these situations, all other vCPUs are kicked and held, the EPT-based
protection is removed and the guest is single-stepped by the vCPU that
triggered the initial VMEXIT. Upon completion, the EPT-based protection
is reinstalled and all vCPUs are allowed to return to the guest.

This is a rather slow workaround that kicks in occasionally. In the
future, the most frequently single-stepped instructions should be added
to the emulator (usually, stores to and from memory - SSE/AVX).

For the moment it works only on Intel.

CC: Jim Mattson <jmattson@xxxxxxxxxx>
CC: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
CC: Joerg Roedel <joro@xxxxxxxxxx>
Signed-off-by: Nicușor Cîțu <ncitu@xxxxxxxxxxxxxxx>
Co-developed-by: Mihai Donțu <mdontu@xxxxxxxxxxxxxxx>
Signed-off-by: Mihai Donțu <mdontu@xxxxxxxxxxxxxxx>
Co-developed-by: Adalbert Lazăr <alazar@xxxxxxxxxxxxxxx>
Signed-off-by: Adalbert Lazăr <alazar@xxxxxxxxxxxxxxx>
---
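For reviewers, the intended flow is roughly the following (a sketch of
the logic added by this patch, nothing beyond it):

  1. A vCPU takes an EPT violation on a protected page and sends
     KVMI_EVENT_PF; the tool replies with KVMI_EVENT_ACTION_RETRY and
     requests single-stepping (the ss_requested flag filled in by
     kvmi_msg_send_pf()).
  2. kvmi_start_ss(): the vCPU acquires ss_active, all other vCPUs are
     kicked and then held in need_to_wait(), the page access is relaxed
     and the monitor trap flag (MTF) is armed.
  3. The guest executes exactly one instruction and exits with
     EXIT_REASON_MONITOR_TRAP_FLAG.
  4. kvmi_stop_ss(): the old page access is restored, MTF and the STI
     shadow are cleared, ss_active is released and the other vCPUs
     resume.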
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/kvm/kvmi.c             |  47 ++++++++++-
 arch/x86/kvm/svm.c              |   5 ++
 arch/x86/kvm/vmx/vmx.c          |  17 ++++
 arch/x86/kvm/x86.c              |  19 +++++
 include/linux/kvmi.h            |   4 +
 virt/kvm/kvmi.c                 | 145 +++++++++++++++++++++++++++++++-
 virt/kvm/kvmi_int.h             |  16 ++++
 8 files changed, 253 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ad36a5fc2048..60e2c298d469 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1016,6 +1016,7 @@ struct kvm_x86_ops {
 	void (*msr_intercept)(struct kvm_vcpu *vcpu, unsigned int msr,
 			      bool enable);
 	bool (*desc_intercept)(struct kvm_vcpu *vcpu, bool enable);
+	void (*set_mtf)(struct kvm_vcpu *vcpu, bool enable);
 	void (*cr3_write_exiting)(struct kvm_vcpu *vcpu, bool enable);
 	bool (*nested_pagefault)(struct kvm_vcpu *vcpu);
 	bool (*spt_fault)(struct kvm_vcpu *vcpu);
@@ -1628,6 +1629,8 @@ void kvm_arch_msr_intercept(struct kvm_vcpu *vcpu, unsigned int msr,
 			    bool enable);
 bool kvm_mmu_nested_pagefault(struct kvm_vcpu *vcpu);
 bool kvm_spt_fault(struct kvm_vcpu *vcpu);
+void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable);
+void kvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
 void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/kvmi.c b/arch/x86/kvm/kvmi.c
index 04cac5b8a4d0..f0ab4bd9eb37 100644
--- a/arch/x86/kvm/kvmi.c
+++ b/arch/x86/kvm/kvmi.c
@@ -520,7 +520,6 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
 	u32 ctx_size;
 	u64 ctx_addr;
 	u32 action;
-	bool singlestep_ignored;
 	bool ret = false;
 
 	if (!kvm_spt_fault(vcpu))
@@ -533,7 +532,7 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
 	if (ivcpu->effective_rep_complete)
 		return true;
 
-	action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &singlestep_ignored,
+	action = kvmi_msg_send_pf(vcpu, gpa, gva, access, &ivcpu->ss_requested,
 				  &ivcpu->rep_complete, &ctx_addr,
 				  ivcpu->ctx_data, &ctx_size);
 
@@ -547,6 +546,8 @@ bool kvmi_arch_pf_event(struct kvm_vcpu *vcpu, gpa_t gpa, gva_t gva,
 		ret = true;
 		break;
 	case KVMI_EVENT_ACTION_RETRY:
+		if (ivcpu->ss_requested && !kvmi_start_ss(vcpu, gpa, access))
+			ret = true;
 		break;
 	default:
 		kvmi_handle_common_event_actions(vcpu, action, "PF");
@@ -758,6 +759,48 @@ int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu)
+{
+	kvm_set_mtf(vcpu, true);
+
+	/*
+	 * Set the STI interrupt shadow only if RFLAGS.IF = 1.
+	 * Blocking by both STI and MOV/POP SS is not possible.
+	 */
+	if (kvm_arch_interrupt_allowed(vcpu))
+		kvm_set_interrupt_shadow(vcpu, KVM_X86_SHADOW_INT_STI);
+}
+
+void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu)
+{
+	kvm_set_mtf(vcpu, false);
+	/*
+	 * The blocking by STI is cleared after the guest
+	 * executes one instruction or incurs an exception.
+	 * However, we might stop the single-stepping before entering
+	 * the guest, so make sure the STI blocking is cleared anyway.
+	 */
+	kvm_set_interrupt_shadow(vcpu, 0);
+}
+
+u8 kvmi_arch_relax_page_access(u8 old, u8 new)
+{
+	u8 ret = old | new;
+
+	/*
+	 * An SPTE entry with just the -wx bits set can trigger a
+	 * misconfiguration error from the hardware, as is the case
+	 * on x86, where this access mode is used to mark I/O memory.
+	 * Thus, we make sure that -wx accesses are translated to rwx.
+	 */
+	if ((ret & (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X)) ==
+	    (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X))
+		ret |= KVMI_PAGE_ACCESS_R;
+
+	return ret;
+}
+
 static const struct {
 	unsigned int allow_bit;
 	enum kvm_page_track_mode track_mode;
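The -wx to rwx promotion above is easy to check in isolation. A minimal
user-space model (the KVMI_PAGE_ACCESS_* bit values are assumed here for
illustration; the real definitions live outside this patch):

    #include <stdio.h>
    #include <stdint.h>

    /* assumed bit layout, for illustration only */
    #define KVMI_PAGE_ACCESS_R (1 << 0)
    #define KVMI_PAGE_ACCESS_W (1 << 1)
    #define KVMI_PAGE_ACCESS_X (1 << 2)

    static uint8_t relax_page_access(uint8_t old, uint8_t new)
    {
        uint8_t ret = old | new;

        /* -wx would be an EPT misconfiguration on x86: promote to rwx */
        if ((ret & (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X)) ==
            (KVMI_PAGE_ACCESS_W | KVMI_PAGE_ACCESS_X))
            ret |= KVMI_PAGE_ACCESS_R;

        return ret;
    }

    int main(void)
    {
        /* execute-only page hit by a write: -wx becomes rwx (0x7) */
        printf("%#x\n", relax_page_access(KVMI_PAGE_ACCESS_X,
                                          KVMI_PAGE_ACCESS_W));
        return 0;
    }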
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b178b8900660..3481c0247680 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7183,6 +7183,10 @@ static bool svm_spt_fault(struct kvm_vcpu *vcpu)
 	return (svm->vmcb->control.exit_code == SVM_EXIT_NPF);
 }
 
+static void svm_set_mtf(struct kvm_vcpu *vcpu, bool enable)
+{
+}
+
 static void svm_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable)
 {
 }
@@ -7225,6 +7229,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
 	.has_emulated_msr = svm_has_emulated_msr,
 
+	.set_mtf = svm_set_mtf,
 	.cr3_write_exiting = svm_cr3_write_exiting,
 	.msr_intercept = svm_msr_intercept,
 	.desc_intercept = svm_desc_intercept,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7d1e341b51ad..f0369d0574dc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5384,6 +5384,7 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 
 static int handle_monitor_trap(struct kvm_vcpu *vcpu)
 {
+	kvmi_stop_ss(vcpu);
 	return 1;
 }
 
@@ -5992,6 +5993,11 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	if (kvmi_vcpu_enabled_ss(vcpu)
+	    && exit_reason != EXIT_REASON_EPT_VIOLATION
+	    && exit_reason != EXIT_REASON_MONITOR_TRAP_FLAG)
+		kvmi_stop_ss(vcpu);
+
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@ -7842,6 +7848,16 @@ static __exit void hardware_unsetup(void)
 	free_kvm_area();
 }
 
+static void vmx_set_mtf(struct kvm_vcpu *vcpu, bool enable)
+{
+	if (enable)
+		vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+			      CPU_BASED_MONITOR_TRAP_FLAG);
+	else
+		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+				CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
 static void vmx_msr_intercept(struct kvm_vcpu *vcpu, unsigned int msr,
 			      bool enable)
 {
@@ -7927,6 +7943,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_accelerated_tpr = report_flexpriority,
 	.has_emulated_msr = vmx_has_emulated_msr,
 
+	.set_mtf = vmx_set_mtf,
 	.msr_intercept = vmx_msr_intercept,
 	.cr3_write_exiting = vmx_cr3_write_exiting,
 	.desc_intercept = vmx_desc_intercept,
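Note that svm_set_mtf() is deliberately a stub: no monitor trap flag
equivalent is wired up on SVM here, which is why single-stepping
currently works only on Intel. On VMX, any exit other than
EXIT_REASON_EPT_VIOLATION or EXIT_REASON_MONITOR_TRAP_FLAG taken while
single-stepping (an external interrupt, for example) ends the session
early, before the regular exit handler runs, so the relaxed page access
never outlives the step.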
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38aaddadb93a..65855340249a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7358,6 +7358,13 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 {
 	int r;
 
+	if (kvmi_vcpu_enabled_ss(vcpu))
+		/*
+		 * We cannot inject events during single-stepping.
+		 * Try again later.
+		 */
+		return -1;
+
 	/* try to reinject previous events if any */
 	if (vcpu->arch.exception.injected)
@@ -10134,6 +10141,18 @@ void kvm_control_cr3_write_exiting(struct kvm_vcpu *vcpu, bool enable)
 }
 EXPORT_SYMBOL(kvm_control_cr3_write_exiting);
 
+void kvm_set_mtf(struct kvm_vcpu *vcpu, bool enable)
+{
+	kvm_x86_ops->set_mtf(vcpu, enable);
+}
+EXPORT_SYMBOL(kvm_set_mtf);
+
+void kvm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+EXPORT_SYMBOL(kvm_set_interrupt_shadow);
+
 bool kvm_spt_fault(struct kvm_vcpu *vcpu)
 {
 	return kvm_x86_ops->spt_fault(vcpu);
diff --git a/include/linux/kvmi.h b/include/linux/kvmi.h
index 5d162b9e67f2..1dc90284dc3a 100644
--- a/include/linux/kvmi.h
+++ b/include/linux/kvmi.h
@@ -22,6 +22,8 @@ bool kvmi_queue_exception(struct kvm_vcpu *vcpu);
 void kvmi_trap_event(struct kvm_vcpu *vcpu);
 bool kvmi_descriptor_event(struct kvm_vcpu *vcpu, u8 descriptor, u8 write);
 void kvmi_handle_requests(struct kvm_vcpu *vcpu);
+void kvmi_stop_ss(struct kvm_vcpu *vcpu);
+bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu);
 void kvmi_init_emulate(struct kvm_vcpu *vcpu);
 void kvmi_activate_rep_complete(struct kvm_vcpu *vcpu);
 bool kvmi_bp_intercepted(struct kvm_vcpu *vcpu, u32 dbg);
@@ -44,6 +46,8 @@ static inline void kvmi_handle_requests(struct kvm_vcpu *vcpu) { }
 static inline bool kvmi_hypercall_event(struct kvm_vcpu *vcpu) { return false; }
 static inline bool kvmi_queue_exception(struct kvm_vcpu *vcpu) { return true; }
 static inline void kvmi_trap_event(struct kvm_vcpu *vcpu) { }
+static inline void kvmi_stop_ss(struct kvm_vcpu *vcpu) { }
+static inline bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu) { return false; }
 static inline void kvmi_init_emulate(struct kvm_vcpu *vcpu) { }
 static inline void kvmi_activate_rep_complete(struct kvm_vcpu *vcpu) { }
 static inline bool kvmi_bp_intercepted(struct kvm_vcpu *vcpu, u32 dbg)
diff --git a/virt/kvm/kvmi.c b/virt/kvm/kvmi.c
index d47a725a4045..a3a5af9080a9 100644
--- a/virt/kvm/kvmi.c
+++ b/virt/kvm/kvmi.c
@@ -1260,11 +1260,19 @@ void kvmi_run_jobs(struct kvm_vcpu *vcpu)
 	}
 }
 
+static bool need_to_wait_for_ss(struct kvm_vcpu *vcpu)
+{
+	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+	struct kvmi *ikvm = IKVM(vcpu->kvm);
+
+	return atomic_read(&ikvm->ss_active) && !ivcpu->ss_owner;
+}
+
 static bool need_to_wait(struct kvm_vcpu *vcpu)
 {
 	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
 
-	return ivcpu->reply_waiting;
+	return ivcpu->reply_waiting || need_to_wait_for_ss(vcpu);
 }
 
 static bool done_waiting(struct kvm_vcpu *vcpu)
@@ -1572,6 +1580,141 @@ int kvmi_cmd_pause_vcpu(struct kvm_vcpu *vcpu, bool wait)
 	return 0;
 }
 
+void kvmi_stop_ss(struct kvm_vcpu *vcpu)
+{
+	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmi *ikvm;
+	int i;
+
+	ikvm = kvmi_get(kvm);
+	if (!ikvm)
+		return;
+
+	if (unlikely(!ivcpu->ss_owner)) {
+		kvmi_warn(ikvm, "%s\n", __func__);
+		goto out;
+	}
+
+	for (i = ikvm->ss_level; i--;)
+		kvmi_set_gfn_access(kvm,
+				    ikvm->ss_context[i].gfn,
+				    ikvm->ss_context[i].old_access,
+				    ikvm->ss_context[i].old_write_bitmap);
+
+	ikvm->ss_level = 0;
+
+	kvmi_arch_stop_single_step(vcpu);
+
+	atomic_set(&ikvm->ss_active, false);
+	/*
+	 * Make the ss_active update visible
+	 * before resuming all the other vCPUs.
+	 */
+	smp_mb__after_atomic();
+	kvm_make_all_cpus_request(kvm, 0);
+
+	ivcpu->ss_owner = false;
+
+out:
+	kvmi_put(kvm);
+}
+EXPORT_SYMBOL(kvmi_stop_ss);
+
+static bool kvmi_acquire_ss(struct kvm_vcpu *vcpu)
+{
+	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+	struct kvmi *ikvm = IKVM(vcpu->kvm);
+
+	if (ivcpu->ss_owner)
+		return true;
+
+	if (atomic_cmpxchg(&ikvm->ss_active, false, true) != false)
+		return false;
+
+	kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_INTROSPECTION |
+					     KVM_REQUEST_WAIT);
+
+	ivcpu->ss_owner = true;
+
+	return true;
+}
+
+static bool kvmi_run_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access)
+{
+	struct kvmi *ikvm = IKVM(vcpu->kvm);
+	u8 old_access, new_access;
+	u32 old_write_bitmap;
+	gfn_t gfn = gpa_to_gfn(gpa);
+	int err;
+
+	kvmi_arch_start_single_step(vcpu);
+
+	err = kvmi_get_gfn_access(ikvm, gfn, &old_access, &old_write_bitmap);
+	/* likely was removed from the radix tree due to rwx */
+	if (err) {
+		kvmi_warn(ikvm, "%s: gfn 0x%llx not found in the radix tree\n",
+			  __func__, gfn);
+		return true;
+	}
+
+	if (ikvm->ss_level == SINGLE_STEP_MAX_DEPTH - 1) {
+		kvmi_err(ikvm, "single step limit reached\n");
+		return false;
+	}
+
+	ikvm->ss_context[ikvm->ss_level].gfn = gfn;
+	ikvm->ss_context[ikvm->ss_level].old_access = old_access;
+	ikvm->ss_context[ikvm->ss_level].old_write_bitmap = old_write_bitmap;
+	ikvm->ss_level++;
+
+	new_access = kvmi_arch_relax_page_access(old_access, access);
+
+	kvmi_set_gfn_access(vcpu->kvm, gfn, new_access, old_write_bitmap);
+
+	return true;
+}
+
+bool kvmi_start_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access)
+{
+	bool ret = false;
+
+	while (!kvmi_acquire_ss(vcpu)) {
+		int err = kvmi_run_jobs_and_wait(vcpu);
+
+		if (err) {
+			kvmi_err(IKVM(vcpu->kvm),
+				 "kvmi_acquire_ss() has failed\n");
+			goto out;
+		}
+	}
+
+	if (kvmi_run_ss(vcpu, gpa, access))
+		ret = true;
+	else
+		kvmi_stop_ss(vcpu);
+
+out:
+	return ret;
+}
+
+bool kvmi_vcpu_enabled_ss(struct kvm_vcpu *vcpu)
+{
+	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
+	struct kvmi *ikvm;
+	bool ret;
+
+	ikvm = kvmi_get(vcpu->kvm);
+	if (!ikvm)
+		return false;
+
+	ret = ivcpu->ss_owner;
+
+	kvmi_put(vcpu->kvm);
+
+	return ret;
+}
+EXPORT_SYMBOL(kvmi_vcpu_enabled_ss);
+
 static void kvmi_job_abort(struct kvm_vcpu *vcpu, void *ctx)
 {
 	struct kvmi_vcpu *ivcpu = IVCPU(vcpu);
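The ss_active hand-off in kvmi_acquire_ss()/kvmi_stop_ss() above can be
modeled in user space. A minimal single-threaded sketch with C11 atomics
(struct vcpu and the function names are illustrative; only the
acquire/release pattern is mirrored, not the kicking and holding of the
other vCPUs):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool ss_active;     /* one single-step session per VM */

    struct vcpu { bool ss_owner; };

    static bool acquire_ss(struct vcpu *v)
    {
        bool expected = false;

        if (v->ss_owner)              /* already own the session */
            return true;

        /* only one vCPU may own the single-step session at a time */
        if (!atomic_compare_exchange_strong(&ss_active, &expected, true))
            return false;             /* owned elsewhere: caller waits */

        v->ss_owner = true;           /* KVM kicks the other vCPUs here */
        return true;
    }

    static void stop_ss(struct vcpu *v)
    {
        if (!v->ss_owner)
            return;

        /* KVM restores the old page access(es) before this point */
        atomic_store(&ss_active, false);
        v->ss_owner = false;          /* held vCPUs may resume now */
    }

    int main(void)
    {
        struct vcpu a = { false }, b = { false };

        printf("%d\n", acquire_ss(&a)); /* 1 */
        printf("%d\n", acquire_ss(&b)); /* 0: b waits in need_to_wait() */
        stop_ss(&a);
        printf("%d\n", acquire_ss(&b)); /* 1 */
        return 0;
    }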
diff --git a/virt/kvm/kvmi_int.h b/virt/kvm/kvmi_int.h
index d7f9858d3e97..1550fe33ed48 100644
--- a/virt/kvm/kvmi_int.h
+++ b/virt/kvm/kvmi_int.h
@@ -126,6 +126,9 @@ struct kvmi_vcpu {
 		DECLARE_BITMAP(high, KVMI_NUM_MSR);
 	} msr_mask;
 
+	bool ss_owner;
+	bool ss_requested;
+
 	struct list_head job_list;
 	spinlock_t job_lock;
 
@@ -151,6 +154,15 @@ struct kvmi {
 	DECLARE_BITMAP(event_allow_mask, KVMI_NUM_EVENTS);
 	DECLARE_BITMAP(vm_ev_mask, KVMI_NUM_EVENTS);
 
+#define SINGLE_STEP_MAX_DEPTH 8
+	struct {
+		gfn_t gfn;
+		u8 old_access;
+		u32 old_write_bitmap;
+	} ss_context[SINGLE_STEP_MAX_DEPTH];
+	u8 ss_level;
+	atomic_t ss_active;
+
 	struct {
 		bool initialized;
 		atomic_t enabled;
@@ -224,6 +236,7 @@ int kvmi_add_job(struct kvm_vcpu *vcpu,
 		 void *ctx, void (*free_fct)(void *ctx));
 void kvmi_handle_common_event_actions(struct kvm_vcpu *vcpu, u32 action,
 				      const char *str);
+bool kvmi_start_ss(struct kvm_vcpu *vcpu, gpa_t gpa, u8 access);
 
 /* arch */
 void kvmi_arch_update_page_tracking(struct kvm *kvm,
@@ -274,6 +287,9 @@ int kvmi_arch_cmd_inject_exception(struct kvm_vcpu *vcpu, u8 vector,
 				   u64 address);
 int kvmi_arch_cmd_control_cr(struct kvm_vcpu *vcpu,
 			     const struct kvmi_control_cr *req);
+void kvmi_arch_start_single_step(struct kvm_vcpu *vcpu);
+void kvmi_arch_stop_single_step(struct kvm_vcpu *vcpu);
+u8 kvmi_arch_relax_page_access(u8 old, u8 new);
 int kvmi_arch_cmd_control_msr(struct kvm_vcpu *vcpu,
 			      const struct kvmi_control_msr *req);
 int kvmi_arch_cmd_get_mtrr_type(struct kvm_vcpu *vcpu, u64 gpa, u8 *type);
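A note on the bookkeeping above: each page whose access is relaxed
during a step pushes one ss_context entry (the gfn plus its old access
bits and write bitmap), and kvmi_stop_ss() restores the entries in
reverse order, so a single-stepped instruction that touches several
protected pages is unwound correctly. SINGLE_STEP_MAX_DEPTH bounds how
many relaxations one session may accumulate; kvmi_run_ss() refuses the
step once the limit is reached.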