On Mon, Oct 14, 2019 at 5:40 PM Krish Sadhukhan
<krish.sadhukhan@xxxxxxxxxx> wrote:
>
> If VM-entry fails due to VM-entry MSR-loading, the MSRs of the nested guests
> need to be rolled back to their previous state in order for the guest to not
> be in an inconsistent state.

This change seems overly simplistic, and it also breaks the existing ABI.

> Suggested-by: Jim Mattson <jmattson@xxxxxxxxxx>
> Signed-off-by: Krish Sadhukhan <krish.sadhukhan@xxxxxxxxxx>
> Reviewed-by: Karl Heubaum <karl.heubaum@xxxxxxxxxx>
> ---
>  arch/x86/kvm/vmx/nested.c | 50 +++++++++++++++++++++++++++++++++++----
>  arch/x86/kvm/vmx/nested.h | 26 +++++++++++++++-----
>  arch/x86/kvm/vmx/vmx.h    |  8 +++++++
>  3 files changed, 74 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index cebdcb105ea8..bd8e7af5c1e5 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -191,7 +191,7 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
>         return kvm_skip_emulated_instruction(vcpu);
>  }
>
> -static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
> +void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
>  {
>         /* TODO: not to reset guest simply here. */
>         kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
> @@ -894,11 +894,13 @@ static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
>   * as possible, process all valid entries before failing rather than precheck
>   * for a capacity violation.
>   */
> -static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
> +static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count,
> +                              bool save)
>  {
>         u32 i;
>         struct vmx_msr_entry e;
>         u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
>
>         for (i = 0; i < count; i++) {
>                 if (unlikely(i >= max_msr_list_size))
> @@ -917,6 +919,16 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>                                 __func__, i, e.index, e.reserved);
>                         goto fail;
>                 }
> +               if (save) {
> +                       vmx->nested.vm_entry_msr_load_backup[i].index = e.index;
> +                       if (kvm_get_msr(vcpu, e.index,
> +                           &(vmx->nested.vm_entry_msr_load_backup[i].data))) {
> +                               pr_debug_ratelimited(
> +                                       "%s cannot read MSR (%u, 0x%x)\n",
> +                                       __func__, i, e.index);
> +                               goto fail;

This breaks the ABI by requiring that all MSRs in the MSR-load list be
readable. Some, like IA32_PRED_CMD, are not.

> +                       }
> +               }
>                 if (kvm_set_msr(vcpu, e.index, e.value)) {
>                         pr_debug_ratelimited(
>                                 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
> @@ -926,6 +938,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>         }
>         return 0;
>  fail:
> +       kfree(vmx->nested.vm_entry_msr_load_backup);
>         return i + 1;
>  }
>
> @@ -973,6 +986,26 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
>         return 0;
>  }
>
> +int nested_vmx_rollback_msr(struct kvm_vcpu *vcpu, u32 count)
> +{
> +       u32 i;
> +       struct msr_data msr;
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +       for (i = 0; i < count; i++) {

I wonder if this loop should go in the other direction, in case there
are dependencies among the MSR settings.
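E.g., something like this (just an untested sketch against this patch's
vm_entry_msr_load_backup array, and still subject to the problems with
unreadable and time-dependent MSRs noted below) would restore the saved
values in the reverse of the order in which they were loaded:

/*
 * Untested sketch: walk the backup entries in reverse, so the last MSR
 * that was loaded is the first one restored.
 */
int nested_vmx_rollback_msr(struct kvm_vcpu *vcpu, u32 count)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 i;

        for (i = count; i-- > 0; ) {
                struct msr_data *e = &vmx->nested.vm_entry_msr_load_backup[i];

                if (kvm_set_msr(vcpu, e->index, e->data)) {
                        pr_debug_ratelimited(
                                "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
                                __func__, i, e->index, e->data);
                        return -EINVAL;
                }
        }
        return 0;
}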
> +               msr.host_initiated = false;
> +               msr.index = vmx->nested.vm_entry_msr_load_backup[i].index;
> +               msr.data = vmx->nested.vm_entry_msr_load_backup[i].data;
> +               if (kvm_set_msr(vcpu, msr.index, msr.data)) {
> +                       pr_debug_ratelimited(
> +                               "%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
> +                               __func__, i, msr.index, msr.data);
> +                       return -EINVAL;

This doesn't work with time-related MSRs, like IA32_TIME_STAMP_COUNTER.
Rather than "rolling back" to an earlier value, you need to be able to
negate the effect of the load that should never have happened.
Similarly, I don't think this works with IA32_TSC_DEADLINE, if the
original deadline has passed before you saved it. I believe that
writing a deadline in the past will result in a spurious interrupt.

> +               }
> +       }
> +       return 0;
> +}
> +
>  static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
>  {
>         unsigned long invalid_mask;
> @@ -3102,9 +3135,18 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
>                 goto vmentry_fail_vmexit_guest_mode;
>
>         if (from_vmentry) {
> +               u32 count = vmcs12->vm_entry_msr_load_count;
> +
> +               /* Save guest MSRs before we load them */
> +               vmx->nested.vm_entry_msr_load_backup =
> +                       kcalloc(count, sizeof(struct msr_data), GFP_KERNEL_ACCOUNT);
> +               if (!vmx->nested.vm_entry_msr_load_backup)
> +                       goto vmentry_fail_vmexit_guest_mode;
> +

Should the backup memory be allocated in advance, so that we don't have
this unarchitected VM-entry failure? If not, should this be deferred
until after the attempted VM-entry to vmcs02, to avoid introducing yet
another priority inversion?
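If it is allocated in advance, one option (rough sketch only; the
placement in enter_vmx_operation()/free_nested() and the error label
are my guesses, not something this patch does) is to size the backup
once for the maximum atomic-switch MSR count and tie its lifetime to
the rest of the nested state:

        /* Sketch: in enter_vmx_operation(), next to the other allocations. */
        vmx->nested.vm_entry_msr_load_backup =
                kcalloc(nested_vmx_max_atomic_switch_msrs(vcpu),
                        sizeof(struct msr_data), GFP_KERNEL_ACCOUNT);
        if (!vmx->nested.vm_entry_msr_load_backup)
                goto out_msr_load_backup;       /* hypothetical unwind label */

        /* Sketch: matching teardown in free_nested(). */
        kfree(vmx->nested.vm_entry_msr_load_backup);
        vmx->nested.vm_entry_msr_load_backup = NULL;

That would also take the kfree() calls out of the failure paths below.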
>                 exit_qual = nested_vmx_load_msr(vcpu,
>                                                 vmcs12->vm_entry_msr_load_addr,
> -                                               vmcs12->vm_entry_msr_load_count);
> +                                               vmcs12->vm_entry_msr_load_count,
> +                                               true);
>                 if (exit_qual) {
>                         /*
>                          * According to section “VM Entries” in Intel SDM
> @@ -3940,7 +3982,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
>                 vmx_update_msr_bitmap(vcpu);
>
>         if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
> -                               vmcs12->vm_exit_msr_load_count))
> +                               vmcs12->vm_exit_msr_load_count, false))
>                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
>  }
>
> diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
> index bb51ec8cf7da..f951b2b338d2 100644
> --- a/arch/x86/kvm/vmx/nested.h
> +++ b/arch/x86/kvm/vmx/nested.h
> @@ -17,6 +17,8 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
>  bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
>  void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
>                        u32 exit_intr_info, unsigned long exit_qualification);
> +void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator);
> +int nested_vmx_rollback_msr(struct kvm_vcpu *vcpu, u32 count);
>  void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
>  int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
>  int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
> @@ -66,7 +68,6 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
>  {
>         u32 exit_qual;
>         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> -       struct vmx_msr_entry *addr;
>
>         /*
>          * At this point, the exit interruption info in exit_intr_info
> @@ -85,11 +86,24 @@ static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
>
>         exit_qual = vmcs_readl(EXIT_QUALIFICATION);
>
> -       addr = __va(vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR));
> -       if (addr && addr->index == MSR_FS_BASE &&
> -           (exit_reason == (VMX_EXIT_REASONS_FAILED_VMENTRY |
> -            EXIT_REASON_MSR_LOAD_FAIL))) {
> -               exit_qual = (to_vmx(vcpu))->nested.invalid_msr_load_exit_qual;
> +       if (exit_reason == (VMX_EXIT_REASONS_FAILED_VMENTRY |
> +                           EXIT_REASON_MSR_LOAD_FAIL)) {
> +
> +               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +               struct vmx_msr_entry *addr;
> +
> +               if (nested_vmx_rollback_msr(vcpu,
> +                                           vmcs12->vm_entry_msr_load_count)) {
> +                       nested_vmx_abort(vcpu,
> +                                        VMX_ABORT_SAVE_GUEST_MSR_FAIL);
> +
> +                       kfree(to_vmx(vcpu)->nested.vm_entry_msr_load_backup);
> +               }

Are we leaking the backup memory when the rollback succeeds?

> +
> +               addr = __va(vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR));
> +               if (addr && addr->index == MSR_FS_BASE)
> +                       exit_qual =
> +                               (to_vmx(vcpu))->nested.invalid_msr_load_exit_qual;
>         }
>
>         nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual);
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index ee7f40abd199..9a7c118036be 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -189,6 +189,14 @@ struct nested_vmx {
>          * due to invalid VM-entry MSR-load area in vmcs12.
>          */
>         u32 invalid_msr_load_exit_qual;
> +
> +       /*
> +        * This is used for backing up the MSRs of nested guests when
> +        * those MSRs are loaded from VM-entry MSR-load area on VM-entry.
> +        * If VM-entry fails due to VM-entry MSR-loading, we roll back
> +        * the MSRs to the values saved here.
> +        */
> +       struct msr_data *vm_entry_msr_load_backup;
>  };
>
>  struct vcpu_vmx {
> --
> 2.20.1
>
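One more thought on the lifetime question above: if the backup does
stay dynamically allocated per VM-entry, this path probably wants to
free it exactly once, whether or not the rollback succeeds. Roughly
(untested sketch, leaving out the MSR_FS_BASE exit-qualification
fixup):

        if (exit_reason == (VMX_EXIT_REASONS_FAILED_VMENTRY |
                            EXIT_REASON_MSR_LOAD_FAIL)) {
                struct vcpu_vmx *vmx = to_vmx(vcpu);
                int ret = nested_vmx_rollback_msr(vcpu,
                                get_vmcs12(vcpu)->vm_entry_msr_load_count);

                /* Free the backup on both the success and failure paths. */
                kfree(vmx->nested.vm_entry_msr_load_backup);
                vmx->nested.vm_entry_msr_load_backup = NULL;

                if (ret)
                        nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
        }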