[PATCH v1 8/8] sbm: x86: lazy TLB flushing

Petr Tesarik <petrtesarik@xxxxxxxxxxxxxxx> · Wed, 14 Feb 2024 12:35:16 +0100

From: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>

Implement lazy TLB flushing in sandbox mode and keep CR4.PGE enabled.

For the transition from sandbox mode to kernel mode:

1. All user page translations (sandbox code and data) are flushed from the
   TLB, because their page protection bits do not include _PAGE_GLOBAL.

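2. Kernel page translations remain valid after the transition, with one
   exception: in the sandbox page tables, the address of x86_sbm_state
   maps a different physical page (the sandbox's own state). Map that
   page without _PAGE_GLOBAL, so its translation is flushed by the CR3
   write like the user page translations.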

For the transition from kernel mode to sandbox mode:

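1. Global kernel page translations stay in the TLB and become stale.
   However, any access by code running in sandbox mode (with CPL 3)
   through such a translation causes a protection violation. Handle the
   spurious page faults from such accesses, lazily replacing entries in
   the TLB. The x86_sbm_state page is also accessed with CPL 0 (e.g. by
   the NMI entry code), where a stale translation does not fault;
   invalidate it explicitly with INVLPG right after loading the sandbox
   CR3.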

2. If the TLB contains any user page translations before the switch to
   sandbox mode, they are flushed, because their page protection bits do
   not include _PAGE_GLOBAL. This ensures that sandbox mode cannot access
   user mode pages.

Note that the TLB may keep kernel page translations for addresses which are
never accessed by sandbox mode. They remain valid after returning to kernel
mode.

Signed-off-by: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>
---
 arch/x86/entry/entry_64.S     |  17 +-----
 arch/x86/kernel/sbm/call_64.S |   5 +-
 arch/x86/kernel/sbm/core.c    | 100 +++++++++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e1364115408a..4ba3eea38102 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -632,10 +632,8 @@ SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
 	movq	PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
 	movq	TASK_sbm_state(%rcx), %rcx
 	movq	SBM_sbm_cr3(%rcx), %rcx
-	movq	%cr4, %rax
-	andb	$~X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
+	invlpg	x86_sbm_state
 	orb	$3, CS(%rsp)
 #endif
 
@@ -897,9 +895,6 @@ SYM_CODE_START(paranoid_entry)
 
 	movq	%cr3, %r14
 	andb	$~3, CS+8(%rsp)
-	movq	%cr4, %rax
-	orb	$X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
 	jmp	.Lparanoid_gsbase
 #endif
@@ -1073,9 +1068,6 @@ SYM_CODE_START(error_entry)
 	jrcxz	.Lerror_swapgs
 
 	andb	$~3, CS+8(%rsp)
-	movq	%cr4, %rax
-	orb	$X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%rcx, %cr3
 	jmp	.Lerror_entry_done_lfence
 #endif
@@ -1281,9 +1273,6 @@ SYM_CODE_START(asm_exc_nmi)
 	 * stack. The code is similar to NMI from user mode.
 	 */
 	andb	$~3, CS-RIP+8(%rsp)
-	movq	%cr4, %rdx
-	orb	$X86_CR4_PGE, %dl
-	movq	%rdx, %cr4
 	movq	x86_sbm_state + SBM_kernel_cr3, %rdx
 	movq	%rdx, %cr3
 
@@ -1533,10 +1522,8 @@ end_repeat_nmi:
 	movq	TASK_sbm_state(%rcx), %rcx
 	jrcxz	nmi_no_sbm
 
-	movq	%cr4, %rax
-	andb	$~X86_CR4_PGE, %al
-	movq	%rax, %cr4
 	movq	%r14, %cr3
+	invlpg	x86_sbm_state
 #endif
 
 nmi_no_sbm:
diff --git a/arch/x86/kernel/sbm/call_64.S b/arch/x86/kernel/sbm/call_64.S
index 8b2b524c5b46..21edce5666bc 100644
--- a/arch/x86/kernel/sbm/call_64.S
+++ b/arch/x86/kernel/sbm/call_64.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>
 #include <asm/nospec-branch.h>
 #include <asm/percpu.h>
-#include <asm/processor-flags.h>
 #include <asm/segment.h>
 
 .code64
@@ -75,12 +74,10 @@ SYM_FUNC_START(x86_sbm_exec)
 	 * The NMI handler takes extra care to restore CR3 and CR4.
 	 */
 	mov	SBM_sbm_cr3(%rdi), %r11
-	mov	%cr4, %rax
-	and	$~X86_CR4_PGE, %al
 	mov	%rdx, %rdi	/* args */
 	cli
-	mov	%rax, %cr4
 	mov	%r11, %cr3
+	invlpg	x86_sbm_state
 	iretq
 
 SYM_INNER_LABEL(x86_sbm_return, SYM_L_GLOBAL)
diff --git a/arch/x86/kernel/sbm/core.c b/arch/x86/kernel/sbm/core.c
index 0ea193550a83..296f1fde3c22 100644
--- a/arch/x86/kernel/sbm/core.c
+++ b/arch/x86/kernel/sbm/core.c
@@ -33,6 +33,11 @@ union {
 	char page[PAGE_SIZE];
 } x86_sbm_state __page_aligned_bss;
 
+static inline pgprot_t pgprot_nonglobal(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) & ~_PAGE_GLOBAL); /* same prot, G clear */
+}
+
 static inline phys_addr_t page_to_ptval(struct page *page)
 {
 	return PFN_PHYS(page_to_pfn(page)) | _PAGE_TABLE;
@@ -287,7 +292,7 @@ int arch_sbm_init(struct sbm *sbm)
 
 	BUILD_BUG_ON(sizeof(x86_sbm_state) != PAGE_SIZE);
 	err = map_page(state, (unsigned long)&x86_sbm_state,
-		       PHYS_PFN(__pa(state)), PAGE_KERNEL);
+		       PHYS_PFN(__pa(state)), pgprot_nonglobal(PAGE_KERNEL));
 	if (err < 0)
 		return err;
 
@@ -379,11 +384,104 @@ int arch_sbm_exec(struct sbm *sbm, sbm_func func, void *args)
 	return err;
 }
 
+/* Return true if entry @pte permits the CPL 3 access given by @error_code. */
+static bool spurious_sbm_fault_check(unsigned long error_code, pte_t *pte)
+{
+	/* Supervisor-only pages always fault at CPL 3; retrying would loop. */
+	if (!(pte_flags(*pte) & _PAGE_USER))
+		return false;
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
+		return false;
+	return !(error_code & X86_PF_INSTR) || pte_exec(*pte);
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * Global kernel translations are not flushed when switching to the
+ * sandbox page tables, so the TLB may still hold an entry with fewer
+ * permissions than the sandbox page table entry for the same address.
+ * Rather than flushing them eagerly on every switch, let the access
+ * fault and refresh the entry lazily.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permissions than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious; neither are
+ * faults that did not come from user mode (sandbox code runs at CPL 3).
+ *
+ * Walks the page tables rooted at @state->sbm_cr3.  Returns true if
+ * the entry for @address permits the access described by @error_code
+ * (the faulting instruction can simply be retried), false otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static bool
+spurious_sbm_fault(struct x86_sbm_state *state, unsigned long error_code,
+		   unsigned long address)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool ret;
+
+	if ((error_code & ~(X86_PF_WRITE | X86_PF_INSTR)) !=
+	    (X86_PF_USER | X86_PF_PROT))
+		return false;
+
+	pgd = pgd_offset_pgd(__va(state->sbm_cr3 & CR3_ADDR_MASK), address);
+	if (!pgd_present(*pgd))
+		return false;
+
+	p4d = p4d_offset(pgd, address);
+	if (!p4d_present(*p4d))
+		return false;
+
+	if (p4d_large(*p4d))
+		return spurious_sbm_fault_check(error_code, (pte_t *)p4d);
+
+	pud = pud_offset(p4d, address);
+	if (!pud_present(*pud))
+		return false;
+
+	if (pud_large(*pud))
+		return spurious_sbm_fault_check(error_code, (pte_t *)pud);
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return false;
+
+	if (pmd_large(*pmd))
+		return spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte))
+		return false;
+
+	ret = spurious_sbm_fault_check(error_code, pte);
+	if (!ret)
+		return false;
+
+	/*
+	 * Make sure we have permissions in PMD.
+	 * If not, then there's a bug in the page tables:
+	 */
+	ret = spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+	return ret;
+}
+
 void handle_sbm_fault(struct pt_regs *regs, unsigned long error_code,
 		      unsigned long address)
 {
 	struct x86_sbm_state *state = current_thread_info()->sbm_state;
 
+	if (spurious_sbm_fault(state, error_code, address))
+		return;
+
 	/*
 	 * Force -EFAULT unless the fault was due to a user-mode instruction
 	 * fetch from the designated return address.
-- 
2.34.1