On Sun, Feb 10, 2008 at 08:43:32PM -0500, Kyle McMartin wrote: > x86-merge has kind of thrown a spanner in the execshield patchset. I've > merged it up so it looks like it works, but I'd like to get some input > from others to make sure I didn't brown-paper-bag it. > > The randomization bits seem to have been merged upstream, but I deferred > to the execshield implementation since they had some differences. > > cheers, > Kyle > [linux-2.6-execshield.patch is attached.] i appear to have noviced it and truncated the patch. i remerged it against rc1... seems to work. i hope... cheers, kyle
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f86a3c4..4c5f70d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -478,6 +478,13 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * we do "generic changes." */ + if (exec_shield != 0) { +#ifdef CONFIG_X86_PAE + if (!test_bit(X86_FEATURE_NX, c->x86_capability)) +#endif + clear_bit(X86_FEATURE_SEP, c->x86_capability); + } + /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a7d50a5..83f7b4e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -677,7 +677,8 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ __unlazy_fpu(prev_p); - + if (next_p->mm) + load_user_cs_desc(cpu, next_p->mm); /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter > 5) @@ -842,8 +843,58 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ~0xf; } -unsigned long arch_randomize_brk(struct mm_struct *mm) +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) +{ + if (limit > mm->context.exec_limit) { + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + preempt_disable(); + load_user_cs_desc(smp_processor_id(), mm); + preempt_enable(); + } + } +} + +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) { - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? 
: mm->brk; + struct vm_area_struct *vma; + unsigned long limit = PAGE_SIZE; + + if (old_end == mm->context.exec_limit) { + for (vma = mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + preempt_disable(); + load_user_cs_desc(smp_processor_id(), mm); + preempt_enable(); + } + } +} + +void arch_flush_exec_range(struct mm_struct *mm) +{ + mm->context.exec_limit = 0; + set_user_cs(&mm->context.user_cs, 0); +} + +/* + * Generate random brk address between 128MB and 196MB. (if the layout + * allows it.) + */ +void randomize_brk(unsigned long old_brk) +{ + unsigned long new_brk, range_start, range_end; + + range_start = 0x08000000; + if (current->mm->brk >= range_start) + range_start = current->mm->brk; + range_end = range_start + 0x02000000; + new_brk = randomize_range(range_start, range_end, 0); + if (new_brk) + current->mm->brk = new_brk; } diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 309366f..8a940dc 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -45,46 +45,6 @@ EXPORT_SYMBOL_GPL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32 = 0; - -/* noexec32=on|off -Control non executable heap for 32bit processes. 
-To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - /* * Copy data used in early init routines from the initial arrays to the * per cpu data areas. These arrays then become expendable and the diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index dc0cde9..cca75b4 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -22,6 +22,7 @@ #include <asm/mtrr.h> #include <asm/tlbflush.h> +#include <asm/desc.h> #include <asm/mmu_context.h> #include <mach_apic.h> @@ -316,6 +317,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs) unsigned long cpu; cpu = get_cpu(); + if (current->active_mm) + load_user_cs_desc(cpu, current->active_mm); if (!cpu_isset(cpu, flush_cpumask)) goto out; diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index b22c01e..7e196e4 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -592,7 +592,91 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer <solar at openwall.com>. Thanks! 
+ */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) +{ + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(&current->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + vma = get_gate_vma(current); + if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(&current->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(&current->mm->context.user_cs, limit); + + desc1 = &current->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (desc1->a != desc2->a || desc1->b != desc2->b) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. + */ + if (print_fatal_signals >= 2) { + printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", + error_code, error_code/8, regs->ip, smp_processor_id()); + printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", + current->mm->context.exec_limit, desc1->a, desc1->b, desc2->a, desc2->b); + } + load_user_cs_desc(cpu, current->mm); + return 1; + } + + return 0; +} + +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and error code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + * + * NOTE: Because of the final "1" in the macro we need to enable interrupts. + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. 
+ */ +fastcall void do_iret_error(struct pt_regs *regs, long error_code) +{ + int ok; + local_irq_enable(); + ok = check_lazy_exec_limit(get_cpu(), regs, error_code); + put_cpu(); + if (!ok && notify_die(DIE_TRAP, "iret exception", regs, + error_code, 32, SIGSEGV) != NOTIFY_STOP) { + siginfo_t info; + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + do_trap(32, SIGSEGV, "iret exception", 0, regs, error_code, + &info); + } +} + void __kprobes do_general_protection(struct pt_regs * regs, long error_code) @@ -600,6 +684,7 @@ void __kprobes do_general_protection(struct pt_regs * regs, int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); struct thread_struct *thread = &current->thread; + int ok; /* * Perform the lazy TSS's I/O bitmap copy. If the TSS has an @@ -626,7 +711,6 @@ void __kprobes do_general_protection(struct pt_regs * regs, put_cpu(); return; } - put_cpu(); if (regs->flags & VM_MASK) goto gp_in_vm86; @@ -634,6 +718,22 @@ void __kprobes do_general_protection(struct pt_regs * regs, if (!user_mode(regs)) goto gp_in_kernel; + ok = check_lazy_exec_limit(cpu, regs, error_code); + + put_cpu(); + + if (ok) + return; + + if (print_fatal_signals) { + printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", error_code, + error_code/8, regs->ip, smp_processor_id()); + printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", + current->mm->context.exec_limit, + current->mm->context.user_cs.a, + current->mm->context.user_cs.b); + } + current->thread.error_code = error_code; current->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && @@ -650,11 +750,13 @@ void __kprobes do_general_protection(struct pt_regs * regs, return; gp_in_vm86: + put_cpu(); local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; gp_in_kernel: + put_cpu(); if (!fixup_exception(regs)) { current->thread.error_code = error_code; current->thread.trap_no = 13; diff 
--git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8106bba..23932be 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -473,7 +473,7 @@ static int disable_nx __initdata; * Control non executable mappings. * * on Enable - * off Disable + * off Disable (disables exec-shield too) */ static int __init noexec_setup(char *str) { @@ -482,14 +482,12 @@ static int __init noexec_setup(char *str) __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } + } else if (!strcmp(str,"off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + exec_shield = 0; + } else + return -EINVAL; return 0; } @@ -527,6 +525,10 @@ void __init paging_init(void) if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #endif + if (exec_shield) + printk(KERN_INFO "Using x86 segment limits to approximate " + "NX protection\n"); + pagetable_init(); load_cr3(swapper_pg_dir); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 56fe712..ec932ae 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -111,13 +111,15 @@ static unsigned long mmap_legacy_base(void) */ void arch_pick_mmap_layout(struct mm_struct *mm) { - if (mmap_is_legacy()) { + if (!(2 & exec_shield) && mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; + if (!(current->personality & READ_IMPLIES_EXEC)) + mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; mm->unmap_area = arch_unmap_area_topdown; } } diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 348f134..0bd5f44 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -336,7 +336,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) if 
(compat) addr = VDSO_HIGH_BASE; else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; diff --git a/drivers/char/random.c b/drivers/char/random.c index f43c89f..4ee8491 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1638,13 +1638,19 @@ EXPORT_SYMBOL(secure_dccp_sequence_number); */ unsigned int get_random_int(void) { + unsigned int val = 0; + +#ifdef CONFIG_X86_HAS_TSC + rdtscl(val); +#endif + /* * Use IP's RNG. It suits our purpose perfectly: it re-keys itself * every second, from the entropy pool (and thus creates a limited * drain on it), and uses halfMD4Transform within the second. We * also mix it with jiffies and the PID: */ - return secure_ip_id((__force __be32)(current->pid + jiffies)); + return secure_ip_id((__force __be32)(current->pid + jiffies + (int)val)); } /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 41a958a..22cbd06 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -81,7 +81,7 @@ static struct linux_binfmt elf_format = { .hasvdso = 1 }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) IS_ERR_VALUE(x) static int set_brk(unsigned long start, unsigned long end) { @@ -544,7 +544,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc = 0; struct files_struct *files; - int executable_stack = EXSTACK_DEFAULT; + int executable_stack; unsigned long def_flags = 0; struct { struct elfhdr elf_ex; @@ -611,6 +611,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) fd_install(elf_exec_fileno = retval, bprm->file); elf_ppnt = elf_phdata; + executable_stack = EXSTACK_DEFAULT; + elf_bss = 0; elf_brk = 0; @@ -709,6 +711,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) break; } + if (current->personality == PER_LINUX 
&& (exec_shield & 2)) { + executable_stack = EXSTACK_DISABLE_X; + current->flags |= PF_RANDOMIZE; + } + /* Some simple consistency checks for the interpreter */ if (elf_interpreter) { retval = -ELIBBAD; @@ -728,6 +735,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) goto out_free_dentry; +#ifdef CONFIG_X86_32 + /* + * Turn off the CS limit completely if exec-shield disabled or + * NX active: + */ + if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled) + arch_add_exec_range(current->mm, -1); +#endif + /* Discard our unneeded old files struct */ if (files) { put_files_struct(files); @@ -741,7 +757,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY(loc->elf_ex, 0); - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + if (!(exec_shield & 2) && + elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) @@ -906,7 +923,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) interpreter, &interp_map_addr, load_bias); - if (!IS_ERR((void *)elf_entry)) { + if (!BAD_ADDR(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment @@ -914,12 +931,14 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) interp_load_addr = elf_entry; elf_entry += loc->interp_elf_ex.e_entry; } + if (BAD_ADDR(elf_entry)) { force_sig(SIGSEGV, current); retval = IS_ERR((void *)elf_entry) ? 
- (int)elf_entry : -EINVAL; + (int)elf_entry : -EINVAL; goto out_free_dentry; } + reloc_func_desc = interp_load_addr; allow_write_access(interpreter); @@ -934,20 +953,21 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) } } - kfree(elf_phdata); - sys_close(elf_exec_fileno); set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = arch_setup_additional_pages(bprm, executable_stack); + if (retval < 0) { send_sig(SIGKILL, current, 0); - goto out; + goto out_free_fh; } #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ + kfree(elf_phdata); + compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; retval = create_elf_tables(bprm, &loc->elf_ex, @@ -963,10 +983,9 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) current->mm->end_data = end_data; current->mm->start_stack = bprm->p; -#ifdef arch_randomize_brk - if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) - current->mm->brk = current->mm->start_brk = - arch_randomize_brk(current->mm); +#ifdef __HAVE_ARCH_RANDOMIZE_BRK + if (current->flags & PF_RANDOMIZE) + randomize_brk(elf_brk); #endif if (current->personality & MMAP_PAGE_ZERO) { diff --git a/fs/proc/array.c b/fs/proc/array.c index 07d6c48..46adc3b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -471,8 +471,12 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } rcu_read_unlock(); - if (!whole || num_threads < 2) - wchan = get_wchan(task); + if (!whole || num_threads < 2) { + wchan = 0; + if (current->uid == task->uid || current->euid == task->uid || + capable(CAP_SYS_NICE)) + wchan = get_wchan(task); + } if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; diff --git a/fs/proc/base.c b/fs/proc/base.c index 7c6b4ec..877d0a2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2290,7 +2290,7 @@ static const struct pid_entry tgid_base_stuff[] = { INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tgid_stat), ONE("statm", S_IRUGO, pid_statm), - 
REG("maps", S_IRUGO, maps), + REG("maps", S_IRUSR, maps), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, numa_maps), #endif @@ -2302,7 +2302,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("mountstats", S_IRUSR, mountstats), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), + REG("smaps", S_IRUSR, smaps), REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY @@ -2621,7 +2621,7 @@ static const struct pid_entry tid_base_stuff[] = { INF("cmdline", S_IRUGO, pid_cmdline), ONE("stat", S_IRUGO, tid_stat), ONE("statm", S_IRUGO, pid_statm), - REG("maps", S_IRUGO, maps), + REG("maps", S_IRUSR, maps), #ifdef CONFIG_NUMA REG("numa_maps", S_IRUGO, numa_maps), #endif @@ -2632,7 +2632,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("mounts", S_IRUGO, mounts), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), + REG("smaps", S_IRUSR, smaps), REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ae4d3f2..9a838b8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -48,7 +48,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "StaBrk:\t%08lx kB\n" + "Brk:\t%08lx kB\n" + "StaStk:\t%08lx kB\n" + , hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), @@ -56,7 +60,12 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + mm->start_brk, mm->brk, mm->start_stack); +#ifdef CONFIG_X86_32 + if (!nx_enabled) + seq_printf(m, "ExecLim:\t%08lx\n", mm->context.exec_limit); +#endif } unsigned long 
task_vsize(struct mm_struct *mm) @@ -238,6 +247,9 @@ static int show_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; +#ifdef CONFIG_X86_32 + struct mm_struct *tmm = get_task_mm(task); +#endif struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -260,10 +272,20 @@ static int show_map(struct seq_file *m, void *v) vma->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', + (flags & VM_EXEC +#ifdef CONFIG_X86_32 + || (!nx_enabled && tmm && + (vma->vm_start < tmm->context.exec_limit)) +#endif + ) + ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', vma->vm_pgoff << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); +#ifdef CONFIG_X86_32 + if (tmm) + mmput(tmm); +#endif /* * Print the dentry name for named mappings, and a diff --git a/include/asm-ia64/pgalloc.h b/include/asm-ia64/pgalloc.h index b9ac1a6..24f183a 100644 --- a/include/asm-ia64/pgalloc.h +++ b/include/asm-ia64/pgalloc.h @@ -1,6 +1,10 @@ #ifndef _ASM_IA64_PGALLOC_H #define _ASM_IA64_PGALLOC_H +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { ; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + /* * This file contains the functions and defines necessary to allocate * page tables. 
diff --git a/include/asm-powerpc/pgalloc.h b/include/asm-powerpc/pgalloc.h index b4505ed..25068ae 100644 --- a/include/asm-powerpc/pgalloc.h +++ b/include/asm-powerpc/pgalloc.h @@ -2,6 +2,11 @@ #define _ASM_POWERPC_PGALLOC_H #ifdef __KERNEL__ +/* Dummy functions since we don't support execshield on ppc */ +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { ; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + #ifdef CONFIG_PPC64 #include <asm/pgalloc-64.h> #else diff --git a/include/asm-ppc/pgalloc.h b/include/asm-ppc/pgalloc.h index fd4d1d7..de55ad5 100644 --- a/include/asm-ppc/pgalloc.h +++ b/include/asm-ppc/pgalloc.h @@ -41,5 +41,10 @@ extern void pte_free(struct mm_struct *mm, pgtable_t pte); #define check_pgt_cache() do { } while (0) +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { ; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + + #endif /* _PPC_PGALLOC_H */ #endif /* __KERNEL__ */ diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h index f5b2bf3..a43f0ec 100644 --- a/include/asm-s390/pgalloc.h +++ b/include/asm-s390/pgalloc.h @@ -17,6 +17,10 @@ #include <linux/gfp.h> #include <linux/mm.h> +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { ; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + #define check_pgt_cache() do {} while (0) unsigned long *crst_table_alloc(struct mm_struct *, int); diff --git a/include/asm-sparc/pgalloc.h b/include/asm-sparc/pgalloc.h index 6292cd0..2632807 100644 --- a/include/asm-sparc/pgalloc.h +++ b/include/asm-sparc/pgalloc.h @@ -66,4 +66,8 @@ BTFIXUPDEF_CALL(void, pte_free, pgtable_t ) #define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) #define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { 
; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + #endif /* _SPARC_PGALLOC_H */ diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h index 3ee2d40..c998da2 100644 --- a/include/asm-sparc64/pgalloc.h +++ b/include/asm-sparc64/pgalloc.h @@ -79,4 +79,8 @@ static inline void check_pgt_cache(void) quicklist_trim(0, NULL, 25, 16); } +#define arch_add_exec_range(mm, limit) do { ; } while (0) +#define arch_flush_exec_range(mm) do { ; } while (0) +#define arch_remove_exec_range(mm, limit) do { ; } while (0) + #endif /* _SPARC64_PGALLOC_H */ diff --git a/include/asm-x86/desc.h b/include/asm-x86/desc.h index 5b6a05d..7ad80b9 100644 --- a/include/asm-x86/desc.h +++ b/include/asm-x86/desc.h @@ -353,6 +353,22 @@ static inline void set_system_gate_ist(int n, void *addr, unsigned ist) _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) +{ + limit = (limit - 1) / PAGE_SIZE; + desc->a = limit & 0xffff; + desc->b = (limit & 0xf0000) | 0x00c0fb00; +} + +#define load_user_cs_desc(cpu, mm) \ + get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs + +#ifdef CONFIG_X86_32 +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_flush_exec_range(struct mm_struct *mm); +#endif /* CONFIG_X86_32 */ + #else /* * GET_DESC_BASE reads the descriptor base of the specified segment. 
diff --git a/include/asm-x86/elf.h b/include/asm-x86/elf.h index fb62f99..516a9da 100644 --- a/include/asm-x86/elf.h +++ b/include/asm-x86/elf.h @@ -185,7 +185,6 @@ static inline void elf_common_init(struct thread_struct *t, clear_thread_flag(TIF_ABI_PENDING); \ else \ set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ } while (0) #define COMPAT_ELF_PLATFORM ("i686") @@ -230,7 +229,6 @@ static inline void elf_common_init(struct thread_struct *t, #define ELF_PLATFORM ("x86_64") extern void set_personality_64bit(void); extern unsigned int sysctl_vsyscall32; -extern int force_personality32; #endif /* !CONFIG_X86_32 */ @@ -317,7 +315,9 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, extern int syscall32_setup_pages(struct linux_binprm *, int exstack); #define compat_arch_setup_additional_pages syscall32_setup_pages -extern unsigned long arch_randomize_brk(struct mm_struct *mm); -#define arch_randomize_brk arch_randomize_brk +#ifdef CONFIG_X86_32 +#define __HAVE_ARCH_RANDOMIZE_BRK +extern void randomize_brk(unsigned long old_brk); +#endif #endif diff --git a/include/asm-x86/mmu.h b/include/asm-x86/mmu.h index efa962c..db9b109 100644 --- a/include/asm-x86/mmu.h +++ b/include/asm-x86/mmu.h @@ -9,6 +9,8 @@ * we put the segment information here. * * cpu_vm_mask is used to optimize ldt flushing. + * exec_limit is used to track the range PROT_EXEC + * mappings span. 
*/ typedef struct { void *ldt; @@ -18,6 +20,10 @@ typedef struct { int size; struct mutex lock; void *vdso; +#ifdef CONFIG_X86_32 + struct desc_struct user_cs; + unsigned long exec_limit; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h index 8d67223..982e1cb 100644 --- a/include/asm-x86/pgalloc_64.h +++ b/include/asm-x86/pgalloc_64.h @@ -5,6 +5,13 @@ #include <linux/threads.h> #include <linux/mm.h> +#define arch_add_exec_range(mm, limit) \ + do { (void)(mm), (void)(limit); } while (0) +#define arch_flush_exec_range(mm) \ + do { (void)(mm); } while (0) +#define arch_remove_exec_range(mm, limit) \ + do { (void)(mm), (void)(limit); } while (0) + #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pud_populate(mm, pud, pmd) \ diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 149920d..79c8ad8 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -679,6 +679,9 @@ extern int bootloader_type; extern char ignore_fpu_irq; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) +#define __HAVE_ARCH_ALIGN_STACK +extern unsigned long arch_align_stack(unsigned long sp); + #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH @@ -756,6 +759,9 @@ static inline void prefetchw(const void *x) regs->cs = __USER_CS; \ regs->ip = new_eip; \ regs->sp = new_esp; \ + preempt_disable(); \ + load_user_cs_desc(smp_processor_id(), current->mm); \ + preempt_enable(); \ } while (0) diff --git a/include/linux/mm.h b/include/linux/mm.h index e8abb38..1483fc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1041,7 +1041,13 @@ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 
+extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int); + +static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) +{ + return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0); +} extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bfee0bd..30ec32f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -166,6 +166,9 @@ struct mm_struct { unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); + unsigned long (*get_unmapped_exec_area) (struct file *filp, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags); void (*unmap_area) (struct mm_struct *mm, unsigned long addr); unsigned long mmap_base; /* base of mmap area */ unsigned long task_size; /* size of task vm space */ diff --git a/include/linux/resource.h b/include/linux/resource.h index ae13db7..14757af 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -54,8 +54,11 @@ struct rlimit { /* * Limit the stack by to some sane default: root can always * increase this limit if needed.. 8MB seems reasonable. + * + * (2MB more to cover randomization effects.) 
*/ -#define _STK_LIM (8*1024*1024) +#define _STK_LIM (10*1024*1024) +#define EXEC_STACK_BIAS (2*1024*1024) /* * GPG wants 32kB of mlocked memory, to make sure pass phrases diff --git a/include/linux/sched.h b/include/linux/sched.h index 00e1441..c1c8a7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -98,6 +98,9 @@ struct futex_pi_state; struct robust_list_head; struct bio; +extern int exec_shield; +extern int print_fatal_signals; + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. @@ -342,6 +345,10 @@ extern int sysctl_max_map_count; extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long +arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long, + unsigned long, unsigned long); extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d41ef6b..5304704 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -81,6 +81,26 @@ extern int maps_protect; extern int sysctl_stat_interval; extern int latencytop_enabled; +int exec_shield = (1<<0); +/* exec_shield is a bitmask: + * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE + * (1<<0) 1: on [also on if !=0] + * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK + * The old settings + * (1<<2) 4: vdso just below .text of main (unless too low) + * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low) + * are ignored because the vdso is placed completely randomly + */ + +static int __init setup_exec_shield(char *str) +{ + get_option(&str, &exec_shield); + + return 1; +} + +__setup("exec-shield=", setup_exec_shield); + /* Constants used for minimum and maximum */ #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) static int one = 1; @@ -383,6 +403,14 @@ static struct ctl_table kern_table[] = { 
.proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "exec-shield", + .data = &exec_shield, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = KERN_CORE_USES_PID, .procname = "core_uses_pid", .data = &core_uses_pid, diff --git a/mm/mmap.c b/mm/mmap.c index a32d28c..7634038 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -26,6 +26,7 @@ #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -377,6 +378,8 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + if (vma->vm_flags & VM_EXEC) + arch_add_exec_range(mm, vma->vm_end); if (prev) { vma->vm_next = prev->vm_next; prev->vm_next = vma; @@ -480,6 +483,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; + if (vma->vm_flags & VM_EXEC) + arch_remove_exec_range(mm, vma->vm_end); } /* @@ -785,6 +790,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } else /* cases 2, 5, 7 */ vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (prev->vm_flags & VM_EXEC) + arch_add_exec_range(mm, prev->vm_end); return prev; } @@ -940,7 +947,8 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. 
*/ - addr = get_unmapped_area(file, addr, len, pgoff, flags); + addr = get_unmapped_area_prot(file, addr, len, pgoff, flags, + prot & PROT_EXEC); if (addr & ~PAGE_MASK) return addr; @@ -1410,13 +1418,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) } unsigned long -get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) +get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags, int exec) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - get_area = current->mm->get_unmapped_area; + if (exec && current->mm->get_unmapped_exec_area) + get_area = current->mm->get_unmapped_exec_area; + else + get_area = current->mm->get_unmapped_area; + if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; addr = get_area(file, addr, len, pgoff, flags); @@ -1430,8 +1442,75 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return arch_rebalance_pgtables(addr, len); } +EXPORT_SYMBOL(get_unmapped_area_prot); + +#define SHLIB_BASE 0x00110000 + +unsigned long arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0, + unsigned long len0, unsigned long pgoff, unsigned long flags) +{ + unsigned long addr = addr0, len = len0; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long tmp; + + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (!addr) { + addr = randomize_range(SHLIB_BASE, 0x01000000, len); + } else { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) { + return addr; + } + } + + addr = SHLIB_BASE; + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { + /* At this point: (!vma || addr < vma->vm_end). 
*/ + if (TASK_SIZE - len < addr) + return -ENOMEM; + + if (!vma || addr + len <= vma->vm_start) { + /* + * Must not let a PROT_EXEC mapping get into the + * brk area: + */ + if (addr + len > mm->brk) + goto failed; + + /* + * Up until the brk area we randomize addresses + * as much as possible: + */ + if (addr >= 0x01000000) { + tmp = randomize_range(0x01000000, + PAGE_ALIGN(max(mm->start_brk, + (unsigned long)0x08000000)), len); + vma = find_vma(mm, tmp); + if (TASK_SIZE - len >= tmp && + (!vma || tmp + len <= vma->vm_start)) + return tmp; + } + /* + * Ok, randomization didnt work out - return + * the result of the linear search: + */ + return addr; + } + addr = vma->vm_end; + } + +failed: + return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags); +} -EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) @@ -1506,6 +1585,14 @@ out: return prev ? prev->vm_next : vma; } +static int over_stack_limit(unsigned long sz) +{ + if (sz < EXEC_STACK_BIAS) + return 0; + return (sz - EXEC_STACK_BIAS) > + current->signal->rlim[RLIMIT_STACK].rlim_cur; +} + /* * Verify that the stack growth is acceptable and * update accounting. 
This is shared with both the @@ -1522,7 +1609,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un return -ENOMEM; /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) + if (over_stack_limit(size)) return -ENOMEM; /* mlock limit tests */ @@ -1826,10 +1913,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (new_below) + if (new_below) { + unsigned long old_end = vma->vm_end; + vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); - else + if (vma->vm_flags & VM_EXEC) + arch_remove_exec_range(mm, old_end); + } else vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); return 0; @@ -2048,6 +2139,7 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); + arch_flush_exec_range(mm); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mprotect.c b/mm/mprotect.c index 4de5468..6d822ad 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,8 +23,12 @@ #include <linux/swapops.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#ifdef CONFIG_X86 +#include <asm/desc.h> +#endif static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, @@ -134,7 +138,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; - unsigned long charged = 0; + unsigned long charged = 0, old_end = vma->vm_end; pgoff_t pgoff; int error; int dirty_accountable = 0; @@ -198,6 +202,9 @@ success: dirty_accountable = 1; } + if (oldflags & VM_EXEC) + arch_remove_exec_range(current->mm, old_end); + if (is_vm_hugetlb_page(vma)) 
hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f..101f885 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -392,8 +392,8 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff, map_flags); + new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len, + vma->vm_pgoff, map_flags, vma->vm_flags & VM_EXEC); if (new_addr & ~PAGE_MASK) { ret = new_addr; goto out;
_______________________________________________ Fedora-kernel-list mailing list Fedora-kernel-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/fedora-kernel-list