On 04.08.20 06:20, Aaron Lewis wrote:
Add support for exiting to userspace on a rdmsr or wrmsr instruction if the MSR being read from or written to is in the user_exit_msrs list. Signed-off-by: Aaron Lewis <aaronlewis@xxxxxxxxxx> --- Documentation/virt/kvm/api.rst | 29 +++++++++++- arch/x86/kvm/trace.h | 24 ++++++++++ arch/x86/kvm/x86.c | 83 ++++++++++++++++++++++++++++++++++ include/trace/events/kvm.h | 2 +- include/uapi/linux/kvm.h | 10 ++++ 5 files changed, 145 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7d8167c165aa..8b7078707e0a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4885,8 +4885,9 @@ to the byte array. .. note:: - For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR and - KVM_EXIT_EPR the corresponding + For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, + KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR, and KVM_EXIT_X86_WRMSR the + corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish @@ -5179,6 +5180,30 @@ Note that KVM does not skip the faulting instruction as it does for KVM_EXIT_MMIO, but userspace has to emulate any change to the processing state if it decides to decode and emulate the instruction. +:: + + /* KVM_EXIT_X86_RDMSR */ + /* KVM_EXIT_X86_WRMSR */ + struct { + __u8 inject_gp; /* out */ + __u8 pad[3]; + __u32 index; /* i.e. ecx; out */ + __u64 data; /* in (wrmsr) / out (rdmsr) */ + } msr;
I like that struct layout! :)
+ +If the exit_reason is KVM_EXIT_X86_RDMSR then a rdmsr instruction in the guest +needs to be processed by userspace. If the exit_reason is KVM_EXIT_X86_WRMSR +then a wrmsr instruction in the guest needs to be processed by userspace. + +Userspace can tell KVM to inject a #GP into the guest by setting the +'inject_gp' flag. Setting the flag to 1 tells KVM to inject a GP into the +guest. Setting the flag to 0 tells KVM to not inject a GP into the guest. + +The MSR being processed is indicated by 'index'. If a read is being processed +the 'data' field is expected to be filled out by userspace (as an out +parameter). If a write is being processed the 'data' field will contain the +updated value of the MSR (as an in parameter). + :: /* Fix the size of the union. */ diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index b66432b015d2..d03143ebd6f0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -367,6 +367,30 @@ TRACE_EVENT(kvm_msr, #define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true) #define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true) +TRACE_EVENT(kvm_userspace_msr, + TP_PROTO(bool is_write, u8 inject_gp, u32 index, u64 data), + TP_ARGS(is_write, inject_gp, index, data), + + TP_STRUCT__entry( + __field(bool, is_write) + __field(u8, inject_gp) + __field(u32, index) + __field(u64, data) + ), + + TP_fast_assign( + __entry->is_write = is_write; + __entry->inject_gp = inject_gp; + __entry->index = index; + __entry->data = data; + ), + + TP_printk("userspace %s %x = 0x%llx, %s", + __entry->is_write ? "wrmsr" : "rdmsr", + __entry->index, __entry->data, + __entry->inject_gp ? "inject_gp" : "no_gp") +); + /* * Tracepoint for guest CR access. 
*/ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 46a0fb9e0869..47619b49818a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -108,6 +108,8 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +bool kvm_msr_user_exit(struct kvm *kvm, u32 index); + struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -1549,11 +1551,61 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) } EXPORT_SYMBOL_GPL(kvm_set_msr); +/* + * On success, returns 1 so that __vcpu_run() will happen next. On + * error, returns 0. + */ +static int complete_userspace_msr(struct kvm_vcpu *vcpu, bool is_write) +{ + u32 ecx = vcpu->run->msr.index; + u64 data = vcpu->run->msr.data; + + trace_kvm_userspace_msr(is_write, + vcpu->run->msr.inject_gp, + vcpu->run->msr.index, + vcpu->run->msr.data); + + if (vcpu->run->msr.inject_gp) { + trace_kvm_msr(is_write, ecx, data, true);
If you move the tracepoint up one line and make its last argument !!vcpu->run->msr.inject_gp, you can omit the second invocation below :). Bonus readability points for making inject_gp a local bool variable.
+ kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr(is_write, ecx, data, false); + if (!is_write) { + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_userspace_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_userspace_msr(vcpu, false); +} + +static int complete_userspace_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_userspace_msr(vcpu, true); +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + if (kvm_msr_user_exit(vcpu->kvm, ecx)) { + vcpu->run->exit_reason = KVM_EXIT_X86_RDMSR; + vcpu->run->msr.index = ecx; + vcpu->run->msr.data = 0; + vcpu->run->msr.inject_gp = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->arch.complete_userspace_io = + complete_userspace_rdmsr; + return 0; + } + if (kvm_get_msr(vcpu, ecx, &data)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); @@ -1573,6 +1625,17 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + if (kvm_msr_user_exit(vcpu->kvm, ecx)) { + vcpu->run->exit_reason = KVM_EXIT_X86_WRMSR; + vcpu->run->msr.index = ecx; + vcpu->run->msr.data = data; + vcpu->run->msr.inject_gp = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->arch.complete_userspace_io = + complete_userspace_wrmsr; + return 0; + } + if (kvm_set_msr(vcpu, ecx, data)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); @@ -3455,6 +3518,25 @@ static int kvm_vm_ioctl_set_exit_msrs(struct kvm *kvm, return 0; } +bool kvm_msr_user_exit(struct kvm *kvm, u32 index) +{ + struct kvm_msr_list *exit_msrs; + int i; + + exit_msrs = kvm->arch.user_exit_msrs; + + if (!exit_msrs) + return false; + + for (i = 0; i < exit_msrs->nmsrs; ++i) { + if (exit_msrs->indices[i] == index) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(kvm_msr_user_exit); + int kvm_vm_ioctl_check_extension(struct kvm 
*kvm, long ext) { int r = 0; @@ -10762,3 +10844,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_userspace_msr); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 2c735a3e6613..19f33a704174 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -17,7 +17,7 @@ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ - ERSN(HYPERV) + ERSN(HYPERV), ERSN(X86_RDMSR), ERSN(X86_WRMSR) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index de4638c1bd15..2b7d21e6338c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -248,6 +248,8 @@ struct kvm_hyperv_exit { #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 #define KVM_EXIT_ARM_NISV 28 +#define KVM_EXIT_X86_RDMSR 29 +#define KVM_EXIT_X86_WRMSR 30 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -412,6 +414,14 @@ struct kvm_run { __u64 esr_iss; __u64 fault_ipa; } arm_nisv; + /* KVM_EXIT_X86_RDMSR */ + /* KVM_EXIT_X86_RDMSR */
Copy-paste slip? The second comment should presumably be /* KVM_EXIT_X86_WRMSR */. Alex
+ struct { + __u8 inject_gp; + __u8 pad[3]; + __u32 index; + __u64 data; + } msr; /* Fix the size of the union. */ char padding[256]; }; -- 2.28.0.163.g6104cc2f0b6-goog
Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Jonathan Weiss Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879