Re: [PATCH v2] kvm: nVMX: Introduce KVM_CAP_STATE

Jim Mattson <jmattson@xxxxxxxxxx> · Mon, 9 Apr 2018 11:30:17 -0700

On Mon, Apr 9, 2018 at 1:37 AM, KarimAllah Ahmed <karahmed@xxxxxxxxx> wrote:
> From: Jim Mattson <jmattson@xxxxxxxxxx>
>
> For nested virtualization L0 KVM is managing a bit of state for L2 guests,
> this state can not be captured through the currently available IOCTLs. In
> fact the state captured through all of these IOCTLs is usually a mix of L1
> and L2 state. It is also dependent on whether the L2 guest was running at
> the moment when the process was interrupted to save its state.
>
> With this capability, there are two new vcpu ioctls: KVM_GET_VMX_STATE and
> KVM_SET_VMX_STATE. These can be used for saving and restoring a VM that is
> in VMX operation.
>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: x86@xxxxxxxxxx
> Cc: kvm@xxxxxxxxxxxxxxx
> Cc: linux-kernel@xxxxxxxxxxxxxxx
> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
> [karahmed@ - rename structs and functions and make them ready for AMD and
>              address previous comments.
>            - rebase & a bit of refactoring.
>            - Merge 7/8 and 8/8 into one patch.
>            - Force a VMExit from L2 after reading the kvm_state to avoid
>              mixed state between L1 and L2 on resurrecting the instance. ]
> Signed-off-by: KarimAllah Ahmed <karahmed@xxxxxxxxx>

First, let me say "thank you" for picking this up!

> ---
> v1 -> v2:
> - rename structs and functions and make them ready for AMD and address
>   previous comments.
> - rebase & a bit of refactoring.
> - Merge 7/8 and 8/8 into one patch.
> - Force a VMExit from L2 after reading the kvm_state to avoid mixed state
>   between L1 and L2 on resurrecting the instance.
> ---
>  Documentation/virtual/kvm/api.txt |  46 ++++++++++
>  arch/x86/include/asm/kvm_host.h   |   7 ++
>  arch/x86/include/uapi/asm/kvm.h   |  38 ++++++++
>  arch/x86/kvm/vmx.c                | 189 +++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c                |  21 +++++
>  include/uapi/linux/kvm.h          |   5 +
>  6 files changed, 302 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index d6b3ff5..3ed56df 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -3516,6 +3516,52 @@ Returns: 0 on success; -1 on error
>  This ioctl can be used to unregister the guest memory region registered
>  with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above.
>
> +4.112 KVM_GET_STATE

More specifically, KVM_GET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in/out)
> +Returns: 0 on success, -1 on error
> +Errors:
> +  E2BIG:     the data size exceeds the value of 'size' specified by
> +             the user (the size required will be written into size).
> +
> +struct kvm_state {
> +       __u16 flags;
> +       __u16 format;
> +       __u32 size;
> +       union {
> +               struct kvm_vmx_state vmx;
> +               struct kvm_svm_state svm;
> +               __u8 pad[120];
> +       };
> +       __u8 data[0];
> +};
> +
> +This ioctl copies the vcpu's kvm_state struct from the kernel to userspace.
> +
> +4.113 KVM_SET_STATE

KVM_SET_NESTED_STATE?

> +
> +Capability: KVM_CAP_STATE

KVM_CAP_NESTED_STATE?

> +Architectures: x86
> +Type: vcpu ioctl
> +Parameters: struct kvm_state (in)
> +Returns: 0 on success, -1 on error
> +
> +struct kvm_state {
> +       __u16 flags;
> +       __u16 format;
> +       __u32 size;
> +       union {
> +               struct kvm_vmx_state vmx;
> +               struct kvm_svm_state svm;
> +               __u8 pad[120];
> +       };
> +       __u8 data[0];
> +};
> +
> +This copies the vcpu's kvm_state struct from userspace to the kernel.
>
>  5. The kvm_run structure
>  ------------------------
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index fad4d46..902db9e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -73,6 +73,7 @@
>  #define KVM_REQ_HV_RESET               KVM_ARCH_REQ(20)
>  #define KVM_REQ_HV_EXIT                        KVM_ARCH_REQ(21)
>  #define KVM_REQ_HV_STIMER              KVM_ARCH_REQ(22)
> +#define KVM_REQ_GET_VMCS12_PAGES       KVM_ARCH_REQ(23)
>
>  #define CR0_RESERVED_BITS                                               \
>         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> @@ -1090,6 +1091,12 @@ struct kvm_x86_ops {
>
>         void (*setup_mce)(struct kvm_vcpu *vcpu);
>
> +       int (*get_state)(struct kvm_vcpu *vcpu,

get_nested_state

> +                        struct kvm_state __user *user_kvm_state);
> +       int (*set_state)(struct kvm_vcpu *vcpu,

set_nested_state

> +                        struct kvm_state __user *user_kvm_state);
> +       void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
> +
>         int (*smi_allowed)(struct kvm_vcpu *vcpu);
>         int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
>         int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index f3a9604..1d1cd26 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -361,4 +361,42 @@ struct kvm_sync_regs {
>  #define KVM_X86_QUIRK_LINT0_REENABLED  (1 << 0)
>  #define KVM_X86_QUIRK_CD_NW_CLEARED    (1 << 1)
>
> +#define KVM_STATE_GUEST_MODE   0x00000001
> +#define KVM_STATE_RUN_PENDING  0x00000002
> +#define KVM_STATE_GIF          0x00000004
> +
> +struct kvm_vmx_state {
> +       __u64 vmxon_pa;
> +       __u64 vmcs_pa;
> +};
> +
> +struct kvm_svm_state {
> +       __u64 hsave_pa;
> +       __u64 vmcb_pa;
> +};
> +
> +/* for KVM_CAP_STATE */
> +struct kvm_state {
> +       /* KVM_STATE_* flags */
> +       __u16 flags;
> +
> +       /* 0 for VMX, 1 for SVM.  */
> +       __u16 format;
> +
> +       /* 128 for SVM, 128 + VMCS size for VMX.  */
> +       __u32 size;
> +
> +       union {
> +               /* VMXON, VMCS */
> +               struct kvm_vmx_state vmx;
> +               /* HSAVE_PA, VMCB */
> +               struct kvm_svm_state svm;
> +
> +               /* Pad the union to 120 bytes.  */
> +               __u8 pad[120];
> +       };
> +
> +       __u8 data[0];
> +};
> +
>  #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 14655df..4d830f7 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -10056,10 +10056,10 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
>  static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
>                                                  struct vmcs12 *vmcs12);
>
> -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
> -                                       struct vmcs12 *vmcs12)
> +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
>  {
>         struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>
>         if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
>                 if (vmcs12->apic_access_addr != vmx->nested.apic_access_mapping.gfn << PAGE_SHIFT) {
> @@ -11101,8 +11101,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
>                 return 1;
>         }
>
> -       nested_get_vmcs12_pages(vcpu, vmcs12);
> -
>         msr_entry_idx = nested_vmx_load_msr(vcpu,
>                                             vmcs12->vm_entry_msr_load_addr,
>                                             vmcs12->vm_entry_msr_load_count);
> @@ -11200,6 +11198,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
>         if (ret)
>                 return ret;
>
> +       nested_get_vmcs12_pages(vcpu);
> +
>         /*
>          * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
>          * by event injection, halt vcpu.
> @@ -12259,6 +12259,183 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
>         return 0;
>  }
>
> +static int get_vmcs_cache(struct kvm_vcpu *vcpu,
> +                         struct kvm_state __user *user_kvm_state)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +
> +       /*
> +        * When running L2, the authoritative vmcs12 state is in the
> +        * vmcs02. When running L1, the authoritative vmcs12 state is
> +        * in the shadow vmcs linked to vmcs01, unless
> +        * sync_shadow_vmcs is set, in which case, the authoritative
> +        * vmcs12 state is in the vmcs12 already.
> +        */
> +       if (is_guest_mode(vcpu))
> +               sync_vmcs12(vcpu, vmcs12);
> +       else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
> +               copy_shadow_to_vmcs12(vmx);
> +
> +       if (copy_to_user(user_kvm_state->data, vmcs12, sizeof(*vmcs12)))
> +               return -EFAULT;
> +
> +       /*
> +        * Force a nested exit that guarantees that any state capture
> +        * afterwards by any IOCTLs (MSRs, etc) will not capture a mix of L1
> +        * and L2 state.
> +        *
> +        * One example where that would lead to an issue is the TSC DEADLINE
> +        * MSR vs the guest TSC. If the L2 guest is running, the guest TSC will
> +        * be the L2 TSC while the TSC deadline MSR will contain the L1 TSC
> +        * deadline MSR. That would lead to a very large (and wrong) "expire"
> +        * diff when LAPIC is initialized during instance restore (i.e. the
> +        * instance will appear to have hanged!).
> +        */
> +       if (is_guest_mode(vcpu))
> +               nested_vmx_vmexit(vcpu, -1, 0, 0);

Injecting a fake VM-exit on restore is as bad as injecting a fake
VM-exit on save, and I don't think this is a good approach.

> +
> +       return 0;
> +}
> +
> +static int get_vmx_state(struct kvm_vcpu *vcpu,
> +                        struct kvm_state __user *user_kvm_state)
> +{
> +       u32 user_data_size;
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct kvm_state kvm_state = {
> +               .flags = 0,
> +               .format = 0,
> +               .size = sizeof(kvm_state),
> +               .vmx.vmxon_pa = -1ull,
> +               .vmx.vmcs_pa = -1ull,
> +       };
> +
> +       if (copy_from_user(&user_data_size, &user_kvm_state->size,
> +                          sizeof(user_data_size)))
> +               return -EFAULT;
> +
> +       if (nested_vmx_allowed(vcpu) && vmx->nested.vmxon) {
> +               kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
> +               kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
> +
> +               if (vmx->nested.current_vmptr != -1ull)
> +                       kvm_state.size += VMCS12_SIZE;
> +
> +               if (is_guest_mode(vcpu)) {
> +                       kvm_state.flags |= KVM_STATE_GUEST_MODE;
> +
> +                       if (vmx->nested.nested_run_pending)
> +                               kvm_state.flags |= KVM_STATE_RUN_PENDING;

IIRC, when I initially posted this set of changes, I neglected to
include the one that set nested_run_pending before prepare_vmcs02(),
and so this bit isn't actually tracked correctly for save/restore at
the moment.

> +               }
> +       }
> +
> +       if (user_data_size < kvm_state.size) {
> +               if (copy_to_user(&user_kvm_state->size, &kvm_state.size,
> +                                sizeof(kvm_state.size)))
> +                       return -EFAULT;
> +               return -E2BIG;
> +       }
> +
> +       if (copy_to_user(user_kvm_state, &kvm_state, sizeof(kvm_state)))
> +               return -EFAULT;
> +
> +       if (vmx->nested.current_vmptr == -1ull)
> +               return 0;
> +
> +       return get_vmcs_cache(vcpu, user_kvm_state);
> +}
> +
> +static int set_vmcs_cache(struct kvm_vcpu *vcpu,
> +                         struct kvm_state __user *user_kvm_state,
> +                         struct kvm_state *kvm_state)
> +
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> +       u32 exit_qual;
> +       int ret;
> +
> +       if ((kvm_state->size < (sizeof(*vmcs12) + sizeof(*kvm_state))) ||
> +           kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
> +           !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
> +               return -EINVAL;
> +
> +       if (copy_from_user(vmcs12, user_kvm_state->data, sizeof(*vmcs12)))
> +               return -EFAULT;
> +
> +       if (vmcs12->revision_id != VMCS12_REVISION)
> +               return -EINVAL;
> +
> +       set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
> +
> +       if (!(kvm_state->flags & KVM_STATE_GUEST_MODE))
> +               return 0;
> +
> +       if (check_vmentry_prereqs(vcpu, vmcs12) ||
> +           check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
> +               return -EINVAL;
> +
> +       ret = enter_vmx_non_root_mode(vcpu, true);
> +       if (ret)
> +               return ret;
> +
> +       /*
> +        * This request will result in a call to
> +        * nested_get_vmcs12_pages before the next VM-entry.
> +        */
> +       kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
> +
> +       vmx->nested.nested_run_pending = 1;
> +
> +       return 0;
> +}
> +
> +static int set_vmx_state(struct kvm_vcpu *vcpu,
> +                        struct kvm_state __user *user_kvm_state)
> +{
> +       struct vcpu_vmx *vmx = to_vmx(vcpu);
> +       struct kvm_state kvm_state;
> +       int ret;
> +
> +       if (copy_from_user(&kvm_state, user_kvm_state, sizeof(kvm_state)))
> +               return -EFAULT;
> +
> +       if (kvm_state.size < sizeof(kvm_state))
> +               return -EINVAL;
> +
> +       if (kvm_state.format != 0)
> +               return -EINVAL;
> +
> +       if (kvm_state.flags &
> +           ~(KVM_STATE_RUN_PENDING | KVM_STATE_GUEST_MODE))
> +               return -EINVAL;
> +
> +       if (!nested_vmx_allowed(vcpu))
> +               return kvm_state.vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
> +
> +       vmx_leave_nested(vcpu);
> +
> +       vmx->nested.nested_run_pending =
> +               !!(kvm_state.flags & KVM_STATE_RUN_PENDING);
> +
> +       if (kvm_state.vmx.vmxon_pa == -1ull)
> +               return 0;
> +
> +       if (!page_address_valid(vcpu, kvm_state.vmx.vmxon_pa))
> +               return -EINVAL;
> +
> +       vmx->nested.vmxon_ptr = kvm_state.vmx.vmxon_pa;
> +       ret = enter_vmx_operation(vcpu);
> +       if (ret)
> +               return ret;
> +
> +       if (kvm_state.vmx.vmcs_pa == -1ull)
> +               return 0;
> +
> +       return set_vmcs_cache(vcpu, user_kvm_state, &kvm_state);
> +}
> +
>  static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>         .cpu_has_kvm_support = cpu_has_kvm_support,
>         .disabled_by_bios = vmx_disabled_by_bios,
> @@ -12387,6 +12564,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
>
>         .setup_mce = vmx_setup_mce,
>
> +       .get_state = get_vmx_state,
> +       .set_state = set_vmx_state,
> +       .get_vmcs12_pages = nested_get_vmcs12_pages,
> +
>         .smi_allowed = vmx_smi_allowed,
>         .pre_enter_smm = vmx_pre_enter_smm,
>         .pre_leave_smm = vmx_pre_leave_smm,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 963cdb9..1ab7cc5 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2873,6 +2873,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>         case KVM_CAP_X2APIC_API:
>                 r = KVM_X2APIC_API_VALID_FLAGS;
>                 break;
> +       case KVM_CAP_STATE:
> +               r = !!kvm_x86_ops->get_state;
> +               break;
>         default:
>                 r = 0;
>                 break;
> @@ -3892,6 +3895,22 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
>                 break;
>         }
> +       case KVM_GET_STATE: {
> +               struct kvm_state __user *user_kvm_state = argp;
> +
> +               r = -EINVAL;
> +               if (kvm_x86_ops->get_state)
> +                       r = kvm_x86_ops->get_state(vcpu, user_kvm_state);
> +               break;
> +       }
> +       case KVM_SET_STATE: {
> +               struct kvm_state __user *user_kvm_state = argp;
> +
> +               r = -EINVAL;
> +               if (kvm_x86_ops->set_state)
> +                       r = kvm_x86_ops->set_state(vcpu, user_kvm_state);
> +               break;
> +       }
>         default:
>                 r = -EINVAL;
>         }
> @@ -7051,6 +7070,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         bool req_immediate_exit = false;
>
>         if (kvm_request_pending(vcpu)) {
> +               if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
> +                       kvm_x86_ops->get_vmcs12_pages(vcpu);
>                 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>                         kvm_mmu_unload(vcpu);
>                 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4e1d7f5..4c170ff 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -949,6 +949,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_PPC_GET_CPU_CHAR 151
>  #define KVM_CAP_S390_BPB 152
>  #define KVM_CAP_GET_MSR_FEATURES 153
> +#define KVM_CAP_STATE 154
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -1380,6 +1381,10 @@ struct kvm_s390_ucas_mapping {
>  /* Memory Encryption Commands */
>  #define KVM_MEMORY_ENCRYPT_OP      _IOWR(KVMIO, 0xba, unsigned long)
>
> +/* Available with KVM_CAP_STATE */
> +#define KVM_GET_STATE         _IOWR(KVMIO, 0xbb, struct kvm_vmx_state)
> +#define KVM_SET_STATE         _IOW(KVMIO,  0xbc, struct kvm_vmx_state)
> +
>  struct kvm_enc_region {
>         __u64 addr;
>         __u64 size;
> --
> 2.7.4
>