On Tue, 6 Sept 2022 at 12:42, Evgeniy Baskov <baskov@xxxxxxxxx> wrote: > > Use previously added code to use 4KB pages for mapping. Map compressed > and uncompressed kernel with appropriate memory protection attributes. > For compressed kernel set them up manually. For uncompressed kernel > used flags specified in ELF header. > > Signed-off-by: Evgeniy Baskov <baskov@xxxxxxxxx> > > delete mode 100644 arch/x86/boot/compressed/pgtable.h > create mode 100644 arch/x86/include/asm/shared/pgtable.h > --- > arch/x86/boot/compressed/head_64.S | 25 ++++++- > arch/x86/boot/compressed/ident_map_64.c | 96 ++++++++++++++++--------- > arch/x86/boot/compressed/misc.c | 63 ++++++++++++++-- > arch/x86/boot/compressed/misc.h | 16 ++++- > arch/x86/boot/compressed/pgtable.h | 20 ------ > arch/x86/boot/compressed/pgtable_64.c | 2 +- > arch/x86/boot/compressed/sev.c | 6 +- > arch/x86/include/asm/shared/pgtable.h | 29 ++++++++ > 8 files changed, 193 insertions(+), 64 deletions(-) > delete mode 100644 arch/x86/boot/compressed/pgtable.h > create mode 100644 arch/x86/include/asm/shared/pgtable.h > > diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S > index 5273367283b7..889ca7176aa7 100644 > --- a/arch/x86/boot/compressed/head_64.S > +++ b/arch/x86/boot/compressed/head_64.S > @@ -35,7 +35,7 @@ > #include <asm/bootparam.h> > #include <asm/desc_defs.h> > #include <asm/trapnr.h> > -#include "pgtable.h" > +#include <asm/shared/pgtable.h> > > /* > * Locally defined symbols should be marked hidden: > @@ -578,6 +578,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) > pushq %rsi > call load_stage2_idt > > + call startup32_enable_nx_if_supported > /* Pass boot_params to initialize_identity_maps() */ > movq (%rsp), %rdi > call initialize_identity_maps > @@ -602,6 +603,28 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) > jmp *%rax > SYM_FUNC_END(.Lrelocated) > > +SYM_FUNC_START_LOCAL_NOALIGN(startup32_enable_nx_if_supported) Why the startup32_ prefix for this function name? > + pushq %rbx > + > + leaq has_nx(%rip), %rcx > + > + mov $0x80000001, %eax > + cpuid > + btl $20, %edx > + jnc .Lnonx > + > + movl $1, (%rcx) > + > + movl $MSR_EFER, %ecx > + rdmsr > + btsl $_EFER_NX, %eax > + wrmsr > + > +.Lnonx: > + popq %rbx > + RET > +SYM_FUNC_END(startup32_enable_nx_if_supported) > + > .code32 > /* > * This is the 32-bit trampoline that will be copied over to low memory. > diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c > index d4a314cc50d6..880e08293023 100644 > --- a/arch/x86/boot/compressed/ident_map_64.c > +++ b/arch/x86/boot/compressed/ident_map_64.c > @@ -28,6 +28,7 @@ > #include <asm/trap_pf.h> > #include <asm/trapnr.h> > #include <asm/init.h> > +#include <asm/shared/pgtable.h> > /* Use the static base for this part of the boot process */ > #undef __PAGE_OFFSET > #define __PAGE_OFFSET __PAGE_OFFSET_BASE > @@ -86,24 +87,46 @@ phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; > * Due to relocation, pointers must be assigned at run time not build time. > */ > static struct x86_mapping_info mapping_info; > +int has_nx; > > /* > * Adds the specified range to the identity mappings. > */ > -void kernel_add_identity_map(unsigned long start, unsigned long end) > +unsigned long kernel_add_identity_map(unsigned long start, > + unsigned long end, > + unsigned int flags) > { > int ret; > > /* Align boundary to 2M. */ > - start = round_down(start, PMD_SIZE); > - end = round_up(end, PMD_SIZE); > + start = round_down(start, PAGE_SIZE); > + end = round_up(end, PAGE_SIZE); > if (start >= end) > - return; > + return start; > + > + /* Enforce W^X -- just stop booting with error on violation. */ > + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && > + (flags & (MAP_EXEC | MAP_WRITE)) == (MAP_EXEC | MAP_WRITE)) > + error("Error: W^X violation\n"); > + Do we need to add a new failure mode here? > + bool nx = !(flags & MAP_EXEC) && has_nx; > + bool ro = !(flags & MAP_WRITE); > + > + mapping_info.page_flag = sme_me_mask | (nx ? > + (ro ? __PAGE_KERNEL_RO : __PAGE_KERNEL) : > + (ro ? __PAGE_KERNEL_ROX : __PAGE_KERNEL_EXEC)); > > /* Build the mapping. */ > - ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end); > + ret = kernel_ident_mapping_init(&mapping_info, > + (pgd_t *)top_level_pgt, > + start, end); > if (ret) > error("Error: kernel_ident_mapping_init() failed\n"); > + > + if (!(flags & MAP_NOFLUSH)) > + write_cr3(top_level_pgt); > + > + return start; > } > > /* Locates and clears a region for a new top level page table. */ > @@ -112,14 +135,17 @@ void initialize_identity_maps(void *rmode) > unsigned long cmdline; > struct setup_data *sd; > > + boot_params = rmode; > + > /* Exclude the encryption mask from __PHYSICAL_MASK */ > physical_mask &= ~sme_me_mask; > > /* Init mapping_info with run-time function/buffer pointers. */ > mapping_info.alloc_pgt_page = alloc_pgt_page; > mapping_info.context = &pgt_data; > - mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask; > + mapping_info.page_flag = __PAGE_KERNEL_EXEC | sme_me_mask; > mapping_info.kernpg_flag = _KERNPG_TABLE; > + mapping_info.allow_4kpages = 1; > > /* > * It should be impossible for this not to already be true, > @@ -154,15 +180,34 @@ void initialize_identity_maps(void *rmode) > /* > * New page-table is set up - map the kernel image, boot_params and the > * command line. The uncompressed kernel requires boot_params and the > - * command line to be mapped in the identity mapping. Map them > - * explicitly here in case the compressed kernel does not touch them, > - * or does not touch all the pages covering them. > + * command line to be mapped in the identity mapping. > + * Every other accessed memory region is mapped later, if required. > */ > - kernel_add_identity_map((unsigned long)_head, (unsigned long)_end); > - boot_params = rmode; > - kernel_add_identity_map((unsigned long)boot_params, (unsigned long)(boot_params + 1)); > + extern char _head[], _ehead[]; Please move these extern declarations out of the function scope (at the very least) > + kernel_add_identity_map((unsigned long)_head, > + (unsigned long)_ehead, MAP_EXEC | MAP_NOFLUSH); > + > + extern char _compressed[], _ecompressed[]; > + kernel_add_identity_map((unsigned long)_compressed, > + (unsigned long)_ecompressed, MAP_WRITE | MAP_NOFLUSH); > + > + extern char _text[], _etext[]; > + kernel_add_identity_map((unsigned long)_text, > + (unsigned long)_etext, MAP_EXEC | MAP_NOFLUSH); > + > + extern char _rodata[], _erodata[]; > + kernel_add_identity_map((unsigned long)_rodata, > + (unsigned long)_erodata, MAP_NOFLUSH); > + Same question as before: do we really need three different regions for rodata+text here? > + extern char _data[], _end[]; > + kernel_add_identity_map((unsigned long)_data, > + (unsigned long)_end, MAP_WRITE | MAP_NOFLUSH); > + > + kernel_add_identity_map((unsigned long)boot_params, > + (unsigned long)(boot_params + 1), MAP_WRITE | MAP_NOFLUSH); > + > cmdline = get_cmd_line_ptr(); > - kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE); > + kernel_add_identity_map(cmdline, cmdline + COMMAND_LINE_SIZE, MAP_NOFLUSH); > > /* > * Also map the setup_data entries passed via boot_params in case they > @@ -172,7 +217,7 @@ void initialize_identity_maps(void *rmode) > while (sd) { > unsigned long sd_addr = (unsigned long)sd; > > - kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len); > + kernel_add_identity_map(sd_addr, sd_addr + sizeof(*sd) + sd->len, MAP_NOFLUSH); > sd = (struct setup_data *)sd->next; > } > > @@ -185,26 +230,11 @@ void initialize_identity_maps(void *rmode) > static pte_t *split_large_pmd(struct x86_mapping_info *info, > pmd_t *pmdp, unsigned long __address) > { > - unsigned long page_flags; > - unsigned long address; > - pte_t *pte; > - pmd_t pmd; > - int i; > - > - pte = (pte_t *)info->alloc_pgt_page(info->context); > + unsigned long address = __address & PMD_MASK; > + pte_t *pte = ident_split_large_pmd(info, pmdp, address); > if (!pte) > return NULL; > > - address = __address & PMD_MASK; > - /* No large page - clear PSE flag */ > - page_flags = info->page_flag & ~_PAGE_PSE; > - > - /* Populate the PTEs */ > - for (i = 0; i < PTRS_PER_PMD; i++) { > - set_pte(&pte[i], __pte(address | page_flags)); > - address += PAGE_SIZE; > - } > - > /* > * Ideally we need to clear the large PMD first and do a TLB > * flush before we write the new PMD. But the 2M range of the > @@ -214,7 +244,7 @@ static pte_t *split_large_pmd(struct x86_mapping_info *info, > * also the only user of the page-table, so there is no chance > * of a TLB multihit. > */ > - pmd = __pmd((unsigned long)pte | info->kernpg_flag); > + pmd_t pmd = __pmd((unsigned long)pte | info->kernpg_flag); > set_pmd(pmdp, pmd); > /* Flush TLB to establish the new PMD */ > write_cr3(top_level_pgt); > @@ -377,5 +407,5 @@ void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code) > * Error code is sane - now identity map the 2M region around > * the faulting address. > */ > - kernel_add_identity_map(address, end); > + kernel_add_identity_map(address, end, MAP_WRITE); > } > diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c > index cf690d8712f4..d377e434c4e3 100644 > --- a/arch/x86/boot/compressed/misc.c > +++ b/arch/x86/boot/compressed/misc.c > @@ -14,10 +14,10 @@ > > #include "misc.h" > #include "error.h" > -#include "pgtable.h" > #include "../string.h" > #include "../voffset.h" > #include <asm/bootparam_utils.h> > +#include <asm/shared/pgtable.h> > > /* > * WARNING!! > @@ -277,7 +277,8 @@ static inline void handle_relocations(void *output, unsigned long output_len, > { } > #endif > > -static void parse_elf(void *output) > +static void parse_elf(void *output, unsigned long output_len, > + unsigned long virt_addr) > { > #ifdef CONFIG_X86_64 > Elf64_Ehdr ehdr; > @@ -287,6 +288,7 @@ static void parse_elf(void *output) > Elf32_Phdr *phdrs, *phdr; > #endif > void *dest; > + unsigned long addr; > int i; > > memcpy(&ehdr, output, sizeof(ehdr)); > @@ -323,10 +325,50 @@ static void parse_elf(void *output) > #endif > memmove(dest, output + phdr->p_offset, phdr->p_filesz); > break; > - default: /* Ignore other PT_* */ break; > + default: > + /* Ignore other PT_* */ > + break; > + } > + } > + > + handle_relocations(output, output_len, virt_addr); > + > + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) > + goto skip_protect; > + > + for (i = 0; i < ehdr.e_phnum; i++) { > + phdr = &phdrs[i]; > + > + switch (phdr->p_type) { > + case PT_LOAD: > +#ifdef CONFIG_RELOCATABLE > + addr = (unsigned long)output; > + addr += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); > +#else > + addr = phdr->p_paddr; > +#endif > + /* > + * Simultaneously readable and writable segments are > + * violating W^X, and should not be present in vmlinux image. > + */ > + if ((phdr->p_flags & (PF_X | PF_W)) == (PF_X | PF_W)) > + error("W^X violation for ELF segment"); > + Can we catch this at build time instead? > + unsigned int flags = MAP_PROTECT; > + if (phdr->p_flags & PF_X) > + flags |= MAP_EXEC; > + if (phdr->p_flags & PF_W) > + flags |= MAP_WRITE; > + > + kernel_add_identity_map(addr, addr + phdr->p_memsz, flags); > + break; > + default: > + /* Ignore other PT_* */ > + break; > } > } > > +skip_protect: > free(phdrs); > } > > @@ -434,6 +476,18 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, > needed_size, > &virt_addr); > > + unsigned long phys_addr = (unsigned long)output; > + > + /* > + * If KASLR is disabled input and output regions may overlap. > + * In this case we need to map region excutable as well. > + */ > + unsigned long map_flags = MAP_ALLOC | MAP_WRITE | > + (IS_ENABLED(CONFIG_RANDOMIZE_BASE) ? 0 : MAP_EXEC); > + output = (unsigned char *)kernel_add_identity_map(phys_addr, > + phys_addr + needed_size, > + map_flags); > + > /* Validate memory location choices. */ > if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) > error("Destination physical address inappropriately aligned"); > @@ -456,8 +510,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, > debug_putstr("\nDecompressing Linux... "); > __decompress(input_data, input_len, NULL, NULL, output, output_len, > NULL, error); > - parse_elf(output); > - handle_relocations(output, output_len, virt_addr); > + parse_elf(output, output_len, virt_addr); > debug_putstr("done.\nBooting the kernel.\n"); > > /* Disable exception handling before booting the kernel */ > diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h > index 62208ec04ca4..a4f99516f310 100644 > --- a/arch/x86/boot/compressed/misc.h > +++ b/arch/x86/boot/compressed/misc.h > @@ -171,8 +171,20 @@ static inline int count_immovable_mem_regions(void) { return 0; } > #ifdef CONFIG_X86_5LEVEL > extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d; > #endif > -extern void kernel_add_identity_map(unsigned long start, unsigned long end); > - > +#ifdef CONFIG_X86_64 > +extern unsigned long kernel_add_identity_map(unsigned long start, > + unsigned long end, > + unsigned int flags); > +#else > +static inline unsigned long kernel_add_identity_map(unsigned long start, > + unsigned long end, > + unsigned int flags) > +{ > + (void)flags; > + (void)end; Why these (void) casts? Can we just drop them? > + return start; > +} > +#endif > /* Used by PAGE_KERN* macros: */ > extern pteval_t __default_kernel_pte_mask; > > diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h > deleted file mode 100644 > index cc9b2529a086..000000000000 > --- a/arch/x86/boot/compressed/pgtable.h > +++ /dev/null > @@ -1,20 +0,0 @@ > -#ifndef BOOT_COMPRESSED_PAGETABLE_H > -#define BOOT_COMPRESSED_PAGETABLE_H > - > -#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE) > - > -#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 > - > -#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE > -#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 > - > -#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE > - > -#ifndef __ASSEMBLER__ > - > -extern unsigned long *trampoline_32bit; > - > -extern void trampoline_32bit_src(void *return_ptr); > - > -#endif /* __ASSEMBLER__ */ > -#endif /* BOOT_COMPRESSED_PAGETABLE_H */ > diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c > index 2ac12ff4111b..c7cf5a1059a8 100644 > --- a/arch/x86/boot/compressed/pgtable_64.c > +++ b/arch/x86/boot/compressed/pgtable_64.c > @@ -2,7 +2,7 @@ > #include "misc.h" > #include <asm/e820/types.h> > #include <asm/processor.h> > -#include "pgtable.h" > +#include <asm/shared/pgtable.h> > #include "../string.h" > #include "efi.h" > > diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c > index c93930d5ccbd..99f3ad0b30f3 100644 > --- a/arch/x86/boot/compressed/sev.c > +++ b/arch/x86/boot/compressed/sev.c > @@ -13,6 +13,7 @@ > #include "misc.h" > > #include <asm/pgtable_types.h> > +#include <asm/shared/pgtable.h> > #include <asm/sev.h> > #include <asm/trapnr.h> > #include <asm/trap_pf.h> > @@ -435,10 +436,11 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) > unsigned long cc_info_pa = boot_params->cc_blob_address; > struct cc_blob_sev_info *cc_info; > > - kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info)); > + kernel_add_identity_map(cc_info_pa, cc_info_pa + sizeof(*cc_info), MAP_NOFLUSH); > > cc_info = (struct cc_blob_sev_info *)cc_info_pa; > - kernel_add_identity_map(cc_info->cpuid_phys, cc_info->cpuid_phys + cc_info->cpuid_len); > + kernel_add_identity_map(cc_info->cpuid_phys, > + cc_info->cpuid_phys + cc_info->cpuid_len, MAP_NOFLUSH); > } > > sev_verify_cbit(top_level_pgt); > diff --git a/arch/x86/include/asm/shared/pgtable.h b/arch/x86/include/asm/shared/pgtable.h > new file mode 100644 > index 000000000000..6527dadf39d6 > --- /dev/null > +++ b/arch/x86/include/asm/shared/pgtable.h > @@ -0,0 +1,29 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef ASM_SHARED_PAGETABLE_H > +#define ASM_SHARED_PAGETABLE_H > + > +#define MAP_WRITE 0x02 /* Writable memory */ > +#define MAP_EXEC 0x04 /* Executable memory */ > +#define MAP_ALLOC 0x10 /* Range needs to be allocated */ > +#define MAP_PROTECT 0x20 /* Set exact memory attributes for memory range */ > +#define MAP_NOFLUSH 0x40 /* Avoid flushing TLB */ > + > +#define TRAMPOLINE_32BIT_SIZE (3 * PAGE_SIZE) > + > +#define TRAMPOLINE_32BIT_PLACEMENT_MAX (0xA0000) > + > +#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0 > + > +#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE > +#define TRAMPOLINE_32BIT_CODE_SIZE 0x80 > + > +#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE > + > +#ifndef __ASSEMBLER__ > + > +extern unsigned long *trampoline_32bit; > + > +extern void trampoline_32bit_src(void *return_ptr); > + > +#endif /* __ASSEMBLER__ */ > +#endif /* ASM_SHARED_PAGETABLE_H */ > -- > 2.35.1 >