Re: [PATCH v11 26/35] KVM: SEV: Support SEV-SNP AP Creation NAE event

Jacob Xu <jacobhxu@xxxxxxxxxx> · Fri, 5 Jan 2024 14:08:24 -0800

On Sat, Dec 30, 2023 at 9:32 AM Michael Roth <michael.roth@xxxxxxx> wrote:
>
> From: Tom Lendacky <thomas.lendacky@xxxxxxx>
>
> Add support for the SEV-SNP AP Creation NAE event. This allows SEV-SNP
> guests to alter the register state of the APs on their own. This allows
> the guest a way of simulating INIT-SIPI.
>
> A new event, KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, is created and used
> so as to avoid updating the VMSA pointer while the vCPU is running.
>
> For CREATE
>   The guest supplies the GPA of the VMSA to be used for the vCPU with
>   the specified APIC ID. The GPA is saved in the svm struct of the
>   target vCPU, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is added
>   to the vCPU and then the vCPU is kicked.
>
> For CREATE_ON_INIT:
>   The guest supplies the GPA of the VMSA to be used for the vCPU with
>   the specified APIC ID the next time an INIT is performed. The GPA is
>   saved in the svm struct of the target vCPU.
>
> For DESTROY:
>   The guest indicates it wishes to stop the vCPU. The GPA is cleared
>   from the svm struct, the KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event is
>   added to vCPU and then the vCPU is kicked.
>
> The KVM_REQ_UPDATE_PROTECTED_GUEST_STATE event handler will be invoked
> as a result of the event or as a result of an INIT. The handler sets the
> vCPU to the KVM_MP_STATE_UNINITIALIZED state, so that any errors will
> leave the vCPU as not runnable. Any previous VMSA pages that were
> installed as part of an SEV-SNP AP Creation NAE event are un-pinned. If
> a new VMSA is to be installed, the VMSA guest page is pinned and set as
> the VMSA in the vCPU VMCB and the vCPU state is set to
> KVM_MP_STATE_RUNNABLE. If a new VMSA is not to be installed, the VMSA is
> cleared in the vCPU VMCB and the vCPU state is left as
> KVM_MP_STATE_UNINITIALIZED to prevent it from being run.
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@xxxxxxx>
> Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
> Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
> [mdr: add handling for gmem]
> Signed-off-by: Michael Roth <michael.roth@xxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   1 +
>  arch/x86/include/asm/svm.h      |   5 +
>  arch/x86/kvm/svm/sev.c          | 219 ++++++++++++++++++++++++++++++++
>  arch/x86/kvm/svm/svm.c          |   3 +
>  arch/x86/kvm/svm/svm.h          |   8 +-
>  arch/x86/kvm/x86.c              |  11 ++
>  6 files changed, 246 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3fdcbb1da856..9e45402e51bc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -121,6 +121,7 @@
>         KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_HV_TLB_FLUSH \
>         KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
> +#define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE   KVM_ARCH_REQ(34)
>
>  #define CR0_RESERVED_BITS                                               \
>         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index ba8ce15b27d7..4b73cf5e9de0 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -287,6 +287,11 @@ static_assert((X2AVIC_MAX_PHYSICAL_ID & AVIC_PHYSICAL_MAX_INDEX_MASK) == X2AVIC_
>
>  #define SVM_SEV_FEAT_DEBUG_SWAP                        BIT(5)
>  #define SVM_SEV_FEAT_SNP_ACTIVE                        BIT(0)
> +#define SVM_SEV_FEAT_RESTRICTED_INJECTION              BIT(3)
> +#define SVM_SEV_FEAT_ALTERNATE_INJECTION              BIT(4)
> +#define SVM_SEV_FEAT_INT_INJ_MODES             \
> +       (SVM_SEV_FEAT_RESTRICTED_INJECTION |    \
> +        SVM_SEV_FEAT_ALTERNATE_INJECTION)
>
>  struct vmcb_seg {
>         u16 selector;
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 996b5a668938..3bb89c4df5d6 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -652,6 +652,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>
>  static int sev_es_sync_vmsa(struct vcpu_svm *svm)
>  {
> +       struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
>         struct sev_es_save_area *save = svm->sev_es.vmsa;
>
>         /* Check some debug related fields before encrypting the VMSA */
> @@ -700,6 +701,12 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
>         if (sev_snp_guest(svm->vcpu.kvm))
>                 save->sev_features |= SVM_SEV_FEAT_SNP_ACTIVE;
>
> +       /*
> +        * Save the VMSA synced SEV features. For now, they are the same for
> +        * all vCPUs, so just save each time.
> +        */
> +       sev->sev_features = save->sev_features;
> +
>         pr_debug("Virtual Machine Save Area (VMSA):\n");
>         print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false);
>
> @@ -3082,6 +3089,11 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm)
>                 if (!kvm_ghcb_sw_scratch_is_valid(svm))
>                         goto vmgexit_err;
>                 break;
> +       case SVM_VMGEXIT_AP_CREATION:
> +               if (lower_32_bits(control->exit_info_1) != SVM_VMGEXIT_AP_DESTROY)
> +                       if (!kvm_ghcb_rax_is_valid(svm))
> +                               goto vmgexit_err;
> +               break;
>         case SVM_VMGEXIT_NMI_COMPLETE:
>         case SVM_VMGEXIT_AP_HLT_LOOP:
>         case SVM_VMGEXIT_AP_JUMP_TABLE:
> @@ -3322,6 +3334,202 @@ static int snp_complete_psc(struct kvm_vcpu *vcpu)
>         return 1; /* resume guest */
>  }
>
> +static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
> +{
> +       struct vcpu_svm *svm = to_svm(vcpu);
> +       hpa_t cur_pa;
> +
> +       WARN_ON(!mutex_is_locked(&svm->sev_es.snp_vmsa_mutex));
> +
> +       /* Save off the current VMSA PA for later checks */
> +       cur_pa = svm->sev_es.vmsa_pa;
> +
> +       /* Mark the vCPU as offline and not runnable */
> +       vcpu->arch.pv.pv_unhalted = false;
> +       vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
> +
> +       /* Clear use of the VMSA */
> +       svm->sev_es.vmsa_pa = INVALID_PAGE;
> +       svm->vmcb->control.vmsa_pa = INVALID_PAGE;
> +
> +       /*
> +        * sev->sev_es.vmsa holds the virtual address of the VMSA initially
> +        * allocated by the host. If the guest specified a new a VMSA via
> +        * AP_CREATION, it will have been pinned to avoid future issues
> +        * with things like page migration support. Make sure to un-pin it
> +        * before switching to a newer guest-specified VMSA.
> +        */
> +       if (cur_pa != __pa(svm->sev_es.vmsa) && VALID_PAGE(cur_pa))
> +               kvm_release_pfn_dirty(__phys_to_pfn(cur_pa));
> +
> +       if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) {
> +               gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa);
> +               struct kvm_memory_slot *slot;
> +               kvm_pfn_t pfn;
> +
> +               slot = gfn_to_memslot(vcpu->kvm, gfn);
> +               if (!slot)
> +                       return -EINVAL;
> +
> +               /*
> +                * The new VMSA will be private memory guest memory, so
> +                * retrieve the PFN from the gmem backend, and leave the ref
> +                * count of the associated folio elevated to ensure it won't
> +                * ever be migrated.
> +                */
> +               if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL))
> +                       return -EINVAL;
> +
> +               /* Use the new VMSA */
> +               svm->sev_es.vmsa_pa = pfn_to_hpa(pfn);
> +               svm->vmcb->control.vmsa_pa = svm->sev_es.vmsa_pa;
> +
> +               /* Mark the vCPU as runnable */
> +               vcpu->arch.pv.pv_unhalted = false;
> +               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> +               svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> +       }
> +
> +       /*
> +        * When replacing the VMSA during SEV-SNP AP creation,
> +        * mark the VMCB dirty so that full state is always reloaded.
> +        */
> +       vmcb_mark_all_dirty(svm->vmcb);
> +
> +       return 0;
> +}
> +
> +/*
> + * Invoked as part of svm_vcpu_reset() processing of an init event.
> + */
> +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu)
> +{
> +       struct vcpu_svm *svm = to_svm(vcpu);
> +       int ret;
> +
> +       if (!sev_snp_guest(vcpu->kvm))
> +               return;
> +
> +       mutex_lock(&svm->sev_es.snp_vmsa_mutex);
> +
> +       if (!svm->sev_es.snp_ap_create)
> +               goto unlock;
> +
> +       svm->sev_es.snp_ap_create = false;
> +
> +       ret = __sev_snp_update_protected_guest_state(vcpu);
> +       if (ret)
> +               vcpu_unimpl(vcpu, "snp: AP state update on init failed\n");
> +
> +unlock:
> +       mutex_unlock(&svm->sev_es.snp_vmsa_mutex);
> +}
> +
> +static int sev_snp_ap_creation(struct vcpu_svm *svm)
> +{
> +       struct kvm_sev_info *sev = &to_kvm_svm(svm->vcpu.kvm)->sev_info;
> +       struct kvm_vcpu *vcpu = &svm->vcpu;
> +       struct kvm_vcpu *target_vcpu;
> +       struct vcpu_svm *target_svm;
> +       unsigned int request;
> +       unsigned int apic_id;
> +       bool kick;
> +       int ret;
> +
> +       request = lower_32_bits(svm->vmcb->control.exit_info_1);
> +       apic_id = upper_32_bits(svm->vmcb->control.exit_info_1);
> +
> +       /* Validate the APIC ID */
> +       target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, apic_id);
> +       if (!target_vcpu) {
> +               vcpu_unimpl(vcpu, "vmgexit: invalid AP APIC ID [%#x] from guest\n",
> +                           apic_id);
> +               return -EINVAL;
> +       }
> +
> +       ret = 0;
> +
> +       target_svm = to_svm(target_vcpu);
> +
> +       /*
> +        * The target vCPU is valid, so the vCPU will be kicked unless the
> +        * request is for CREATE_ON_INIT. For any errors at this stage, the
> +        * kick will place the vCPU in an non-runnable state.
> +        */
> +       kick = true;
> +
> +       mutex_lock(&target_svm->sev_es.snp_vmsa_mutex);
> +
> +       target_svm->sev_es.snp_vmsa_gpa = INVALID_PAGE;
> +       target_svm->sev_es.snp_ap_create = true;
> +
> +       /* Interrupt injection mode shouldn't change for AP creation */
> +       if (request < SVM_VMGEXIT_AP_DESTROY) {
> +               u64 sev_features;
> +
> +               sev_features = vcpu->arch.regs[VCPU_REGS_RAX];
> +               sev_features ^= sev->sev_features;
> +               if (sev_features & SVM_SEV_FEAT_INT_INJ_MODES) {
> +                       vcpu_unimpl(vcpu, "vmgexit: invalid AP injection mode [%#lx] from guest\n",
> +                                   vcpu->arch.regs[VCPU_REGS_RAX]);
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
> +       }
> +
> +       switch (request) {
> +       case SVM_VMGEXIT_AP_CREATE_ON_INIT:
> +               kick = false;
> +               fallthrough;
> +       case SVM_VMGEXIT_AP_CREATE:
> +               if (!page_address_valid(vcpu, svm->vmcb->control.exit_info_2)) {
> +                       vcpu_unimpl(vcpu, "vmgexit: invalid AP VMSA address [%#llx] from guest\n",
> +                                   svm->vmcb->control.exit_info_2);
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
> +
> +               /*
> +                * Malicious guest can RMPADJUST a large page into VMSA which
> +                * will hit the SNP erratum where the CPU will incorrectly signal
> +                * an RMP violation #PF if a hugepage collides with the RMP entry
> +                * of VMSA page, reject the AP CREATE request if VMSA address from
> +                * guest is 2M aligned.
> +                */
> +               if (IS_ALIGNED(svm->vmcb->control.exit_info_2, PMD_SIZE)) {
> +                       vcpu_unimpl(vcpu,
> +                                   "vmgexit: AP VMSA address [%llx] from guest is unsafe as it is 2M aligned\n",
> +                                   svm->vmcb->control.exit_info_2);
> +                       ret = -EINVAL;
> +                       goto out;
> +               }
> +
> +               target_svm->sev_es.snp_vmsa_gpa = svm->vmcb->control.exit_info_2;
> +               break;
> +       case SVM_VMGEXIT_AP_DESTROY:
> +               break;
> +       default:
> +               vcpu_unimpl(vcpu, "vmgexit: invalid AP creation request [%#x] from guest\n",
> +                           request);
> +               ret = -EINVAL;
> +               break;
> +       }
> +
> +out:
> +       if (kick) {
> +               if (target_vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
> +                       target_vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +
> +               kvm_make_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, target_vcpu);

I think we should  switch the order of these two statements for
setting mp_state and for making the request for
KVM_REQ_UPDATE_PROTECTED_GUEST_STATE.
There is a race condition I observed when booting with SVSM where:
1. BSP sets target vcpu to KVM_MP_STATE_RUNNABLE
2. AP thread within the loop of arch/x86/kvm.c:vcpu_run() checks
vm_vcpu_running()
3. AP enters the guest without having updated the VMSA state from
KVM_REQ_UPDATE_PROTECTED_GUEST_STATE

This results in the AP executing on a bad RIP and then crashing.
If we set the request first, then we avoid the race condition.

> +               kvm_vcpu_kick(target_vcpu);
> +       }
> +
> +       mutex_unlock(&target_svm->sev_es.snp_vmsa_mutex);
> +
> +       return ret;
> +}
> +
>  static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
>  {
>         struct vmcb_control_area *control = &svm->vmcb->control;
> @@ -3565,6 +3773,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu)
>                 vcpu->run->vmgexit.psc.shared_gpa = svm->sev_es.sw_scratch;
>                 vcpu->arch.complete_userspace_io = snp_complete_psc;
>                 break;
> +       case SVM_VMGEXIT_AP_CREATION:
> +               ret = sev_snp_ap_creation(svm);
> +               if (ret) {
> +                       ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 2);
> +                       ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, GHCB_ERR_INVALID_INPUT);
> +               }
> +
> +               ret = 1;
> +               break;
>         case SVM_VMGEXIT_UNSUPPORTED_EVENT:
>                 vcpu_unimpl(vcpu,
>                             "vmgexit: unsupported event - exit_info_1=%#llx, exit_info_2=%#llx\n",
> @@ -3731,6 +3948,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
>         set_ghcb_msr(svm, GHCB_MSR_SEV_INFO(GHCB_VERSION_MAX,
>                                             GHCB_VERSION_MIN,
>                                             sev_enc_bit));
> +
> +       mutex_init(&svm->sev_es.snp_vmsa_mutex);
>  }
>
>  void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa)
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index da49e4981d75..240518f8d6c7 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1398,6 +1398,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
>         svm->spec_ctrl = 0;
>         svm->virt_spec_ctrl = 0;
>
> +       if (init_event)
> +               sev_snp_init_protected_guest_state(vcpu);
> +
>         init_vmcb(vcpu);
>
>         if (!init_event)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 4ef41f4d4ee6..d953ae41c619 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -96,6 +96,7 @@ struct kvm_sev_info {
>         atomic_t migration_in_progress;
>         u64 snp_init_flags;
>         void *snp_context;      /* SNP guest context page */
> +       u64 sev_features;       /* Features set at VMSA creation */
>  };
>
>  struct kvm_svm {
> @@ -214,6 +215,10 @@ struct vcpu_sev_es_state {
>         bool ghcb_sa_free;
>
>         u64 ghcb_registered_gpa;
> +
> +       struct mutex snp_vmsa_mutex; /* Used to handle concurrent updates of VMSA. */
> +       gpa_t snp_vmsa_gpa;
> +       bool snp_ap_create;
>  };
>
>  struct vcpu_svm {
> @@ -689,7 +694,7 @@ void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
>  #define GHCB_VERSION_MAX       2ULL
>  #define GHCB_VERSION_MIN       1ULL
>
> -#define GHCB_HV_FT_SUPPORTED   GHCB_HV_FT_SNP
> +#define GHCB_HV_FT_SUPPORTED   (GHCB_HV_FT_SNP | GHCB_HV_FT_SNP_AP_CREATION)
>
>  extern unsigned int max_sev_asid;
>
> @@ -719,6 +724,7 @@ void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
>  void sev_es_unmap_ghcb(struct vcpu_svm *svm);
>  struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
>  void handle_rmp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code);
> +void sev_snp_init_protected_guest_state(struct kvm_vcpu *vcpu);
>
>  /* vmenter.S */
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 87b78d63e81d..df9ec357d538 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10858,6 +10858,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
>                 if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
>                         static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
> +
> +               if (kvm_check_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu)) {
> +                       kvm_vcpu_reset(vcpu, true);
> +                       if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) {
> +                               r = 1;
> +                               goto out;
> +                       }
> +               }
>         }
>
>         if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
> @@ -13072,6 +13080,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
>         if (kvm_test_request(KVM_REQ_PMI, vcpu))
>                 return true;
>
> +       if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
> +               return true;
> +
>         if (kvm_arch_interrupt_allowed(vcpu) &&
>             (kvm_cpu_has_interrupt(vcpu) ||
>             kvm_guest_apic_has_interrupt(vcpu)))
> --
> 2.25.1
>
>