7. Add back the PRESENT bit, making the changed attribute visible
With this approach, load_unaligned_zeropad() just takes its normal
page-fault-based fixup path if it touches a page that is transitioning.
As a result, load_unaligned_zeropad() and CoCo VM page transitioning
are completely decoupled. CoCo VM page transitions can proceed
without needing to handle architecture-specific exceptions and fix
things up. This decoupling reduces the complexity of having separate
TDX and SEV-SNP fixup paths, and gives more freedom to revise and
introduce new capabilities in future versions of the TDX and SEV-SNP
architectures. Paravisor scenarios work properly without needing
to forward exceptions.
With this approach, the order of updating the guest PTEs and
notifying the hypervisor doesn't matter. As such, only a single
hypervisor callback is needed, rather than one before and one after
the PTE update. Simplify the code by eliminating the extra
hypervisor callback along with the TDX and SEV-SNP code that
handles the before and after cases. The TLB flush callback is
also no longer required and is removed.
Signed-off-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>
---
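For review convenience, here is a condensed sketch of the resulting
__set_memory_enc_pgtable() flow. This is not part of the patch:
enc_transition_sketch() is a made-up name, error paths are simplified,
the CPA helpers and x86_platform.guest callbacks are the ones used in
the set_memory.c hunk below, and the closing _PAGE_PRESENT restore is
shown schematically since that portion of the hunk is truncated in this
excerpt.

static int enc_transition_sketch(unsigned long addr, int numpages, bool enc)
{
        pgprot_t empty = __pgprot(0);
        struct cpa_data cpa = { .vaddr = &addr, .numpages = numpages,
                                .pgd = init_mm.pgd };
        int ret;

        /* Clear _PAGE_PRESENT and flip the encryption attribute in one pass */
        cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
        cpa.mask_clr = enc ? pgprot_decrypted(__pgprot(_PAGE_PRESENT)) :
                                pgprot_encrypted(__pgprot(_PAGE_PRESENT));

        if (x86_platform.guest.enc_cache_flush_required())
                cpa_flush(&cpa, 1);             /* flush caches if required */

        ret = __change_page_attr_set_clr(&cpa, 1);
        if (ret)
                return ret;

        cpa_flush(&cpa, 0);                     /* flush TLB, not caches */

        /*
         * Single hypervisor notification; because the pages are not
         * present, its ordering relative to the PTE update no longer
         * matters.
         */
        if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
                return -EIO;

        /*
         * Add _PAGE_PRESENT back, making the changed attribute visible
         * (item 7 above); shown here as a second CPA pass.
         */
        cpa.mask_set = __pgprot(_PAGE_PRESENT);
        cpa.mask_clr = empty;
        ret = __change_page_attr_set_clr(&cpa, 1);
        cpa_flush(&cpa, 0);
        return ret;
}
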
 arch/x86/coco/tdx/tdx.c       | 66 +------------------------------------------
 arch/x86/hyperv/ivm.c         |  6 ----
 arch/x86/kernel/x86_init.c    |  4 ---
 arch/x86/mm/mem_encrypt_amd.c | 27 ++++--------------
 arch/x86/mm/pat/set_memory.c  | 55 +++++++++++++++++++++++-------------
5 files changed, 43 insertions(+), 115 deletions(-)
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 3e6dbd2..1bb2fff 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -676,24 +676,6 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
return true;
}
-static bool tdx_tlb_flush_required(bool private)
-{
- /*
- * TDX guest is responsible for flushing TLB on private->shared
- * transition. VMM is responsible for flushing on shared->private.
- *
- * The VMM _can't_ flush private addresses as it can't generate PAs
- * with the guest's HKID. Shared memory isn't subject to integrity
- * checking, i.e. the VMM doesn't need to flush for its own protection.
- *
- * There's no need to flush when converting from shared to private,
- * as flushing is the VMM's responsibility in this case, e.g. it must
- * flush to avoid integrity failures in the face of a buggy or
- * malicious guest.
- */
- return !private;
-}
-
static bool tdx_cache_flush_required(void)
{
/*
@@ -776,30 +758,6 @@ static bool tdx_enc_status_changed(unsigned long vaddr, int numpages, bool enc)
return true;
}
-static bool tdx_enc_status_change_prepare(unsigned long vaddr, int numpages,
- bool enc)
-{
- /*
- * Only handle shared->private conversion here.
- * See the comment in tdx_early_init().
- */
- if (enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
-}
-
-static bool tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
- bool enc)
-{
- /*
- * Only handle private->shared conversion here.
- * See the comment in tdx_early_init().
- */
- if (!enc)
- return tdx_enc_status_changed(vaddr, numpages, enc);
- return true;
-}
-
void __init tdx_early_init(void)
{
struct tdx_module_args args = {
@@ -831,30 +789,8 @@ void __init tdx_early_init(void)
*/
physical_mask &= cc_mask - 1;
- /*
- * The kernel mapping should match the TDX metadata for the page.
- * load_unaligned_zeropad() can touch memory *adjacent* to that which is
- * owned by the caller and can catch even _momentary_ mismatches. Bad
- * things happen on mismatch:
- *
- * - Private mapping => Shared Page == Guest shutdown
- * - Shared mapping => Private Page == Recoverable #VE
- *
- * guest.enc_status_change_prepare() converts the page from
- * shared=>private before the mapping becomes private.
- *
- * guest.enc_status_change_finish() converts the page from
- * private=>shared after the mapping becomes private.
- *
- * In both cases there is a temporary shared mapping to a private page,
- * which can result in a #VE. But, there is never a private mapping to
- * a shared page.
- */
- x86_platform.guest.enc_status_change_prepare = tdx_enc_status_change_prepare;
- x86_platform.guest.enc_status_change_finish = tdx_enc_status_change_finish;
-
+ x86_platform.guest.enc_status_change_finish = tdx_enc_status_changed;
x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
- x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
/*
* TDX intercepts the RDMSR to read the X2APIC ID in the parallel
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index 084fab6..fbe2585 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -550,11 +550,6 @@ static bool hv_vtom_set_host_visibility(unsigned long kbuffer, int pagecount, bo
return result;
}
-static bool hv_vtom_tlb_flush_required(bool private)
-{
- return true;
-}
-
static bool hv_vtom_cache_flush_required(void)
{
return false;
@@ -614,7 +609,6 @@ void __init hv_vtom_init(void)
x86_platform.hyper.is_private_mmio = hv_is_private_mmio;
x86_platform.guest.enc_cache_flush_required = hv_vtom_cache_flush_required;
- x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
/* Set WB as the default cache mode. */
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a37ebd3..cf5179b 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -131,9 +131,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
-static bool enc_status_change_prepare_noop(unsigned long vaddr, int npages, bool enc) { return true; }
static bool enc_status_change_finish_noop(unsigned long vaddr, int npages, bool enc) { return true; }
-static bool enc_tlb_flush_required_noop(bool enc) { return false; }
static bool enc_cache_flush_required_noop(void) { return false; }
static bool is_private_mmio_noop(u64 addr) {return false; }
@@ -154,9 +152,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.hyper.is_private_mmio = is_private_mmio_noop,
.guest = {
- .enc_status_change_prepare = enc_status_change_prepare_noop,
.enc_status_change_finish = enc_status_change_finish_noop,
- .enc_tlb_flush_required = enc_tlb_flush_required_noop,
.enc_cache_flush_required = enc_cache_flush_required_noop,
},
};
diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
index 6faea41..06960ba 100644
--- a/arch/x86/mm/mem_encrypt_amd.c
+++ b/arch/x86/mm/mem_encrypt_amd.c
@@ -278,11 +278,6 @@ static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
return pfn;
}
-static bool amd_enc_tlb_flush_required(bool enc)
-{
- return true;
-}
-
static bool amd_enc_cache_flush_required(void)
{
return !cpu_feature_enabled(X86_FEATURE_SME_COHERENT);
@@ -318,18 +313,6 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
#endif
}
-static bool amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
-{
- /*
- * To maintain the security guarantees of SEV-SNP guests, make sure
- * to invalidate the memory before encryption attribute is cleared.
- */
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc)
- snp_set_memory_shared(vaddr, npages);
-
- return true;
-}
-
/* Return true unconditionally: return value doesn't matter for the SEV side */
static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc)
{
@@ -337,8 +320,12 @@ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool e
* After memory is mapped encrypted in the page table, validate it
* so that it is consistent with the page table updates.
*/
- if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc)
- snp_set_memory_private(vaddr, npages);
+ if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) {
+ if (enc)
+ snp_set_memory_private(vaddr, npages);
+ else
+ snp_set_memory_shared(vaddr, npages);
+ }
if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
@@ -498,9 +485,7 @@ void __init sme_early_init(void)
/* Update the protection map with memory encryption mask */
add_encrypt_protection_map();
- x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare;
x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish;
- x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required;
x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required;
/*
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index d7ef8d3..d062e01 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -2147,40 +2147,57 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
memset(&cpa, 0, sizeof(cpa));
cpa.vaddr = &addr;
cpa.numpages = numpages;
+
+ /*
+ * The caller must ensure that the memory being transitioned between
+ * encrypted and decrypted is not being accessed. But if
+ * load_unaligned_zeropad() touches the "next" page, it may generate a
+ * read access the caller has no control over. To ensure such accesses
+ * cause a normal page fault for the load_unaligned_zeropad() handler,
+ * mark the pages not present until the transition is complete. We
+ * don't want a #VE or #VC fault due to a mismatch in the memory
+ * encryption status, since paravisor configurations can't cleanly do
+ * the load_unaligned_zeropad() handling in the paravisor.
+ *
+ * There's no requirement to do so, but for efficiency we can clear
+ * _PAGE_PRESENT and set/clr encryption attr as a single operation.
+ */
cpa.mask_set = enc ? pgprot_encrypted(empty) : pgprot_decrypted(empty);
- cpa.mask_clr = enc ? pgprot_decrypted(empty) : pgprot_encrypted(empty);
+ cpa.mask_clr = enc ? pgprot_decrypted(__pgprot(_PAGE_PRESENT)) :
+ pgprot_encrypted(__pgprot(_PAGE_PRESENT));
cpa.pgd = init_mm.pgd;
/* Must avoid aliasing mappings in the highmem code */
kmap_flush_unused();
vm_unmap_aliases();
- /* Flush the caches as needed before changing the encryption attribute. */
- if (x86_platform.guest.enc_tlb_flush_required(enc))
- cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required());
-
- /* Notify hypervisor that we are about to set/clr encryption attribute. */
- if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc))
- return -EIO;
+ /* Flush the caches as needed before changing the encryption attr. */
+ if (x86_platform.guest.enc_cache_flush_required())
+ cpa_flush(&cpa, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
+ if (ret)
+ return ret;
/*
- * After changing the encryption attribute, we need to flush TLBs again
- * in case any speculative TLB caching occurred (but no need to flush
- * caches again). We could just use cpa_flush_all(), but in case TLB
- * flushing gets optimized in the cpa_flush() path use the same logic
- * as above.
+ * After clearing _PAGE_PRESENT and changing the encryption attribute,
+ * we need to flush TLBs to ensure no further accesses to the memory can
+ * be made with the old encryption attribute (but no need to flush caches
+ * again). We could just use cpa_flush_all(), but in case TLB flushing
+ * gets optimized in the cpa_flush() path use the same logic as above.
*/
cpa_flush(&cpa, 0);
- /* Notify hypervisor that we have successfully set/clr encryption attribute. */
- if (!ret) {
- if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
- ret = -EIO;
- }
+ /* Notify hypervisor that we have successfully set/clr encryption attr. */
+ if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc))
+ return -EIO;