Re: [PATCH v10 23/50] KVM: SEV: Make AVIC backing, VMSA and VMCB memory allocation SNP safe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 10/16/23 15:27, Michael Roth wrote:
From: Brijesh Singh <brijesh.singh@xxxxxxx>

Implement a workaround for an SNP erratum where the CPU will incorrectly
signal an RMP violation #PF if a hugepage (2mb or 1gb) collides with the
RMP entry of a VMCB, VMSA or AVIC backing page.

When SEV-SNP is globally enabled, the CPU marks the VMCB, VMSA, and AVIC
backing pages as "in-use" via a reserved bit in the corresponding RMP
entry after a successful VMRUN. This is done for _all_ VMs, not just
SNP-Active VMs.

If the hypervisor accesses an in-use page through a writable
translation, the CPU will throw an RMP violation #PF. On early SNP
hardware, if an in-use page is 2mb aligned and software accesses any
part of the associated 2mb region with a hupage, the CPU will
incorrectly treat the entire 2mb region as in-use and signal a spurious
RMP violation #PF.

The recommended is to not use the hugepage for the VMCB, VMSA or
AVIC backing page for similar reasons. Add a generic allocator that will
ensure that the page returns is not hugepage (2mb or 1gb) and is safe to
be used when SEV-SNP is enabled. Also implement similar handling for the
VMCB/VMSA pages of nested guests.

Co-developed-by: Marc Orr <marcorr@xxxxxxxxxx>
Signed-off-by: Marc Orr <marcorr@xxxxxxxxxx>
Reported-by: Alper Gun <alpergun@xxxxxxxxxx> # for nested VMSA case
Co-developed-by: Ashish Kalra <ashish.kalra@xxxxxxx>
Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
[mdr: squash in nested guest handling from Ashish]
Signed-off-by: Michael Roth <michael.roth@xxxxxxx>

Based on the discussion with Borislav, please move this earlier in the series, before patch 6.

Paolo

---
  arch/x86/include/asm/kvm-x86-ops.h |  1 +
  arch/x86/include/asm/kvm_host.h    |  1 +
  arch/x86/kvm/lapic.c               |  5 ++++-
  arch/x86/kvm/svm/nested.c          |  2 +-
  arch/x86/kvm/svm/sev.c             | 33 ++++++++++++++++++++++++++++++
  arch/x86/kvm/svm/svm.c             | 17 ++++++++++++---
  arch/x86/kvm/svm/svm.h             |  1 +
  7 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index f1505a5fa781..4ef2eca14287 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -136,6 +136,7 @@ KVM_X86_OP(vcpu_deliver_sipi_vector)
  KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
  KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
  KVM_X86_OP_OPTIONAL(gmem_invalidate)
+KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
#undef KVM_X86_OP
  #undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fa401cb1a552..a3983271ea28 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1763,6 +1763,7 @@ struct kvm_x86_ops {
int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
  	void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
+	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
  };
struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dcd60b39e794..631a554c0f48 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2810,7 +2810,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
vcpu->arch.apic = apic; - apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+	if (kvm_x86_ops.alloc_apic_backing_page)
+		apic->regs = static_call(kvm_x86_alloc_apic_backing_page)(vcpu);
+	else
+		apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
  	if (!apic->regs) {
  		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
  		       vcpu->vcpu_id);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index dd496c9e5f91..1f9a3f9eb985 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1194,7 +1194,7 @@ int svm_allocate_nested(struct vcpu_svm *svm)
  	if (svm->nested.initialized)
  		return 0;
- vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	vmcb02_page = snp_safe_alloc_page(&svm->vcpu);
  	if (!vmcb02_page)
  		return -ENOMEM;
  	svm->nested.vmcb02.ptr = page_address(vmcb02_page);
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 088b32657f46..1cfb9232fc74 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3211,3 +3211,36 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
  		break;
  	}
  }
+
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu)
+{
+	unsigned long pfn;
+	struct page *p;
+
+	if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
+		return alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+
+	/*
+	 * Allocate an SNP safe page to workaround the SNP erratum where
+	 * the CPU will incorrectly signal an RMP violation  #PF if a
+	 * hugepage (2mb or 1gb) collides with the RMP entry of VMCB, VMSA
+	 * or AVIC backing page. The recommeded workaround is to not use the
+	 * hugepage.
+	 *
+	 * Allocate one extra page, use a page which is not 2mb aligned
+	 * and free the other.
+	 */
+	p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1);
+	if (!p)
+		return NULL;
+
+	split_page(p, 1);
+
+	pfn = page_to_pfn(p);
+	if (IS_ALIGNED(pfn, PTRS_PER_PMD))
+		__free_page(p++);
+	else
+		__free_page(p + 1);
+
+	return p;
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 1e7fb1ea45f7..8e4ef0cd968a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -706,7 +706,7 @@ static int svm_cpu_init(int cpu)
  	int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
-	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	sd->save_area = snp_safe_alloc_page(NULL);
  	if (!sd->save_area)
  		return ret;
@@ -1425,7 +1425,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
  	svm = to_svm(vcpu);
err = -ENOMEM;
-	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	vmcb01_page = snp_safe_alloc_page(vcpu);
  	if (!vmcb01_page)
  		goto out;
@@ -1434,7 +1434,7 @@ static int svm_vcpu_create(struct kvm_vcpu *vcpu)
  		 * SEV-ES guests require a separate VMSA page used to contain
  		 * the encrypted register state of the guest.
  		 */
-		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+		vmsa_page = snp_safe_alloc_page(vcpu);
  		if (!vmsa_page)
  			goto error_free_vmcb_page;
@@ -4876,6 +4876,16 @@ static int svm_vm_init(struct kvm *kvm)
  	return 0;
  }
+static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu)
+{
+	struct page *page = snp_safe_alloc_page(vcpu);
+
+	if (!page)
+		return NULL;
+
+	return page_address(page);
+}
+
  static struct kvm_x86_ops svm_x86_ops __initdata = {
  	.name = KBUILD_MODNAME,
@@ -5007,6 +5017,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
  	.vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
+	.alloc_apic_backing_page = svm_alloc_apic_backing_page,
  };
/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index c13070d00910..b7b8bf73cbb9 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -694,6 +694,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm);
  void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
  void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
  void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
/* vmenter.S */





[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux