From: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>

Implement lazy TLB flushing in sandbox mode and keep CR4.PGE enabled.

For the transition from sandbox mode to kernel mode:

1. All user page translations (sandbox code and data) are flushed from
   the TLB, because their page protection bits do not include
   _PAGE_GLOBAL.

2. Any kernel page translations remain valid after the transition. The
   SBM state page is an exception; map it without _PAGE_GLOBAL.

For the transition from kernel mode to sandbox mode:

1. Kernel page translations become stale. However, any access by code
   running in sandbox mode (with CPL 3) causes a protection violation.
   Handle the spurious page faults from such accesses, lazily replacing
   entries in the TLB.

2. If the TLB contains any user page translations before the switch to
   sandbox mode, they are flushed, because their page protection bits
   do not include _PAGE_GLOBAL. This ensures that sandbox mode cannot
   access user mode pages.

Note that the TLB may keep kernel page translations for addresses which
are never accessed by sandbox mode. They remain valid after returning
to kernel mode.

Signed-off-by: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>
---
 arch/x86/entry/entry_64.S     |  17 +-----
 arch/x86/kernel/sbm/call_64.S |   5 +-
 arch/x86/kernel/sbm/core.c    | 100 +++++++++++++++++++++++++++++++++-
 3 files changed, 102 insertions(+), 20 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e1364115408a..4ba3eea38102 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -632,10 +632,8 @@ SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
         movq    PER_CPU_VAR(pcpu_hot + X86_current_task), %rcx
         movq    TASK_sbm_state(%rcx), %rcx
         movq    SBM_sbm_cr3(%rcx), %rcx
-        movq    %cr4, %rax
-        andb    $~X86_CR4_PGE, %al
-        movq    %rax, %cr4
         movq    %rcx, %cr3
+        invlpg  x86_sbm_state
         orb     $3, CS(%rsp)
 #endif
 
@@ -897,9 +895,6 @@ SYM_CODE_START(paranoid_entry)
         movq    %cr3, %r14
         andb    $~3, CS+8(%rsp)
-        movq    %cr4, %rax
-        orb     $X86_CR4_PGE, %al
-        movq    %rax, %cr4
         movq    %rcx, %cr3
         jmp     .Lparanoid_gsbase
 #endif
 
@@ -1073,9 +1068,6 @@ SYM_CODE_START(error_entry)
         jrcxz   .Lerror_swapgs
         andb    $~3, CS+8(%rsp)
-        movq    %cr4, %rax
-        orb     $X86_CR4_PGE, %al
-        movq    %rax, %cr4
         movq    %rcx, %cr3
         jmp     .Lerror_entry_done_lfence
 #endif
 
@@ -1281,9 +1273,6 @@ SYM_CODE_START(asm_exc_nmi)
          * stack. The code is similar to NMI from user mode.
          */
         andb    $~3, CS-RIP+8(%rsp)
-        movq    %cr4, %rdx
-        orb     $X86_CR4_PGE, %dl
-        movq    %rdx, %cr4
         movq    x86_sbm_state + SBM_kernel_cr3, %rdx
         movq    %rdx, %cr3
 
@@ -1533,10 +1522,8 @@ end_repeat_nmi:
         movq    TASK_sbm_state(%rcx), %rcx
         jrcxz   nmi_no_sbm
 
-        movq    %cr4, %rax
-        andb    $~X86_CR4_PGE, %al
-        movq    %rax, %cr4
         movq    %r14, %cr3
+        invlpg  x86_sbm_state
 #endif
 
 nmi_no_sbm:
diff --git a/arch/x86/kernel/sbm/call_64.S b/arch/x86/kernel/sbm/call_64.S
index 8b2b524c5b46..21edce5666bc 100644
--- a/arch/x86/kernel/sbm/call_64.S
+++ b/arch/x86/kernel/sbm/call_64.S
@@ -10,7 +10,6 @@
 #include <linux/linkage.h>
 #include <asm/nospec-branch.h>
 #include <asm/percpu.h>
-#include <asm/processor-flags.h>
 #include <asm/segment.h>
 
 .code64
@@ -75,12 +74,10 @@ SYM_FUNC_START(x86_sbm_exec)
          * The NMI handler takes extra care to restore CR3 and CR4.
          */
         mov     SBM_sbm_cr3(%rdi), %r11
-        mov     %cr4, %rax
-        and     $~X86_CR4_PGE, %al
         mov     %rdx, %rdi      /* args */
         cli
-        mov     %rax, %cr4
         mov     %r11, %cr3
+        invlpg  x86_sbm_state
         iretq
 
 SYM_INNER_LABEL(x86_sbm_return, SYM_L_GLOBAL)
diff --git a/arch/x86/kernel/sbm/core.c b/arch/x86/kernel/sbm/core.c
index 0ea193550a83..296f1fde3c22 100644
--- a/arch/x86/kernel/sbm/core.c
+++ b/arch/x86/kernel/sbm/core.c
@@ -33,6 +33,11 @@ union {
         char page[PAGE_SIZE];
 } x86_sbm_state __page_aligned_bss;
 
+static inline pgprot_t pgprot_nonglobal(pgprot_t prot)
+{
+        return __pgprot(pgprot_val(prot) & ~_PAGE_GLOBAL);
+}
+
 static inline phys_addr_t page_to_ptval(struct page *page)
 {
         return PFN_PHYS(page_to_pfn(page)) | _PAGE_TABLE;
@@ -287,7 +292,7 @@ int arch_sbm_init(struct sbm *sbm)
         BUILD_BUG_ON(sizeof(x86_sbm_state) != PAGE_SIZE);
 
         err = map_page(state, (unsigned long)&x86_sbm_state,
-                       PHYS_PFN(__pa(state)), PAGE_KERNEL);
+                       PHYS_PFN(__pa(state)), pgprot_nonglobal(PAGE_KERNEL));
         if (err < 0)
                 return err;
 
@@ -379,11 +384,104 @@ int arch_sbm_exec(struct sbm *sbm, sbm_func func, void *args)
         return err;
 }
 
+static bool spurious_sbm_fault_check(unsigned long error_code, pte_t *pte)
+{
+        if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
+                return false;
+
+        if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
+                return false;
+
+        return true;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns true if a spurious fault was handled, false otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static bool
+spurious_sbm_fault(struct x86_sbm_state *state, unsigned long error_code,
+                   unsigned long address)
+{
+        pgd_t *pgd;
+        p4d_t *p4d;
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte;
+        bool ret;
+
+        if ((error_code & ~(X86_PF_WRITE | X86_PF_INSTR)) !=
+            (X86_PF_USER | X86_PF_PROT))
+                return false;
+
+        pgd = __va(state->sbm_cr3 & CR3_ADDR_MASK) + pgd_index(address);
+        if (!pgd_present(*pgd))
+                return false;
+
+        p4d = p4d_offset(pgd, address);
+        if (!p4d_present(*p4d))
+                return false;
+
+        if (p4d_large(*p4d))
+                return spurious_sbm_fault_check(error_code, (pte_t *)p4d);
+
+        pud = pud_offset(p4d, address);
+        if (!pud_present(*pud))
+                return false;
+
+        if (pud_large(*pud))
+                return spurious_sbm_fault_check(error_code, (pte_t *)pud);
+
+        pmd = pmd_offset(pud, address);
+        if (!pmd_present(*pmd))
+                return false;
+
+        if (pmd_large(*pmd))
+                return spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+
+        pte = pte_offset_kernel(pmd, address);
+        if (!pte_present(*pte))
+                return false;
+
+        ret = spurious_sbm_fault_check(error_code, pte);
+        if (!ret)
+                return false;
+
+        /*
+         * Make sure we have permissions in PMD.
+         * If not, then there's a bug in the page tables:
+         */
+        ret = spurious_sbm_fault_check(error_code, (pte_t *)pmd);
+        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+        return ret;
+}
+
 void handle_sbm_fault(struct pt_regs *regs, unsigned long error_code,
                       unsigned long address)
 {
         struct x86_sbm_state *state = current_thread_info()->sbm_state;
 
+        if (spurious_sbm_fault(state, error_code, address))
+                return;
+
         /*
          * Force -EFAULT unless the fault was due to a user-mode instruction
          * fetch from the designated return address.
--
2.34.1
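P.S.: For anyone who wants to poke at the spurious-fault criterion outside
the kernel, here is a minimal stand-alone user-space sketch. It is not part
of the patch; PF_WRITE/PF_INSTR merely mirror the usual x86 page-fault
error-code bit layout, and fake_pte/fault_is_spurious are made-up names for
illustration. It only demonstrates the rule applied by
spurious_sbm_fault_check(): a protection fault is spurious exactly when the
current page table entry already grants the access that faulted, i.e. the
TLB entry was stale and can simply be refreshed.

#include <stdbool.h>
#include <stdio.h>

/* Bit values chosen to match the usual x86 page fault error code. */
#define PF_WRITE  0x02   /* fault was caused by a write access */
#define PF_INSTR  0x10   /* fault was caused by an instruction fetch */

/* Toy stand-in for the permission bits of a page table entry. */
struct fake_pte {
        bool write;      /* page is writable (_PAGE_RW set) */
        bool exec;       /* page is executable (NX clear) */
};

/*
 * Same decision as spurious_sbm_fault_check(): the fault is spurious
 * (stale TLB) only if the entry now permits the faulting access.
 */
static bool fault_is_spurious(unsigned long error_code, struct fake_pte pte)
{
        if ((error_code & PF_WRITE) && !pte.write)
                return false;
        if ((error_code & PF_INSTR) && !pte.exec)
                return false;
        return true;
}

int main(void)
{
        struct fake_pte rw_nx = { .write = true, .exec = false };

        /* Write fault on a page that is now writable: stale TLB entry. */
        printf("%d\n", fault_is_spurious(PF_WRITE, rw_nx));
        /* Instruction fetch from a non-executable page: real violation. */
        printf("%d\n", fault_is_spurious(PF_INSTR, rw_nx));
        return 0;
}

Built with any C compiler, it prints 1 and then 0: the first case is the
lazy-flush path handled by the spurious fault handler, the second is a
genuine protection violation that must still be reported.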