This patch ports the x86-specific atomic overflow handling from PaX's PAX_REFCOUNT to the upstream refcount_t API. This is an updated version from PaX that eliminates the saturation race condition by resetting the atomic counter back to the INT_MAX saturation value on both overflow and underflow. To win a race, a system would have to have INT_MAX threads simultaneously overflow before the saturation handler runs. With this, the commonly used inc/dec_and_test usage patterns present in performance-sensitive areas of the kernel (mm, net, block) will use the regular inline atomic operations with only a single overflow test instruction added to the fast path. Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx> --- arch/Kconfig | 19 ++++++ arch/x86/Kconfig | 1 + arch/x86/entry/entry_32.S | 9 +++ arch/x86/entry/entry_64.S | 3 + arch/x86/include/asm/irq_vectors.h | 3 + arch/x86/include/asm/refcount.h | 123 +++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/traps.h | 5 ++ arch/x86/kernel/traps.c | 38 ++++++++++++ drivers/misc/lkdtm_bugs.c | 19 ++++-- include/asm-generic/sections.h | 4 ++ include/asm-generic/vmlinux.lds.h | 9 +++ include/linux/kernel.h | 2 + include/linux/refcount.h | 4 ++ kernel/panic.c | 23 +++++++ lib/refcount.c | 6 +- 15 files changed, 263 insertions(+), 5 deletions(-) create mode 100644 arch/x86/include/asm/refcount.h diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..2cd150f03175 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -847,4 +847,23 @@ config STRICT_MODULE_RWX config ARCH_WANT_RELAX_ORDER bool +config ARCH_HAS_FAST_REFCOUNT + bool + help + An architecture selects this when it has implemented refcount_t + using primitives that provide a faster runtime at the expense + of some refcount state checks. The refcount overflow condition, + however, must be retained. Catching overflows is the primary + security concern for protecting against bugs in reference counts. 
+ +config FAST_REFCOUNT + bool "Speed up reference counting at the expense of full validation" + depends on ARCH_HAS_FAST_REFCOUNT + help + The regular reference counting infrastructure in the kernel checks + many error conditions. If this option is selected, refcounting + is made faster using architecture-specific implementations that may + only check for reference count overflows (which is the primary + way reference counting bugs are turned into security exploits). + source "kernel/gcov/Kconfig" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..a13db97e0d71 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,7 @@ config X86 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER + select ARCH_HAS_FAST_REFCOUNT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MMIO_FLUSH diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 57f7ec35216e..9e8d9e2d70bf 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -792,6 +792,15 @@ ENTRY(spurious_interrupt_bug) jmp common_exception END(spurious_interrupt_bug) +#ifdef CONFIG_FAST_REFCOUNT +ENTRY(refcount_error) + ASM_CLAC + pushl $0 + pushl $do_refcount_error + jmp error_code +ENDPROC(refcount_error) +#endif + #ifdef CONFIG_XEN ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18ebc43c..a736b882ec76 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -858,6 +858,9 @@ idtentry coprocessor_error do_coprocessor_error has_error_code=0 idtentry alignment_check do_alignment_check has_error_code=1 idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 +#ifdef CONFIG_FAST_REFCOUNT +idtentry refcount_error do_refcount_error has_error_code=0 +#endif /* * Reload gs selector with exception handling diff --git a/arch/x86/include/asm/irq_vectors.h 
b/arch/x86/include/asm/irq_vectors.h index 6ca9fd6234e1..64ca4dcc29ec 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -48,6 +48,9 @@ #define IA32_SYSCALL_VECTOR 0x80 +/* Refcount Overflow or Underflow Exception. */ +#define X86_REFCOUNT_VECTOR 0x81 + /* * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h new file mode 100644 index 000000000000..79e35981e42f --- /dev/null +++ b/arch/x86/include/asm/refcount.h @@ -0,0 +1,123 @@ +#ifndef __ASM_X86_REFCOUNT_H +#define __ASM_X86_REFCOUNT_H +/* + * x86-specific implementation of refcount_t. Ported from PAX_REFCOUNT in + * PaX/grsecurity. + */ +#include <linux/refcount.h> +#include <asm/irq_vectors.h> + +#define __REFCOUNT_CHECK(size) \ + "jo 111f\n" \ + ".if "__stringify(size)" == 4\n\t" \ + ".pushsection .text.refcount_overflow\n" \ + ".elseif "__stringify(size)" == -4\n\t" \ + ".pushsection .text.refcount_underflow\n" \ + ".else\n" \ + ".error \"invalid size\"\n" \ + ".endif\n" \ + "111:\tlea %[counter],%%"_ASM_CX"\n\t" \ + "int $"__stringify(X86_REFCOUNT_VECTOR)"\n" \ + "222:\n\t" \ + ".popsection\n" \ + "333:\n" \ + _ASM_EXTABLE(222b, 333b) + +#define REFCOUNT_CHECK_OVERFLOW(size) __REFCOUNT_CHECK(size) +#define REFCOUNT_CHECK_UNDERFLOW(size) __REFCOUNT_CHECK(-(size)) + +#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) +/* Use asm goto */ +#define __GEN_CHECKED_RMWcc(fullop, var, size, cc, ...) 
\ +do { \ + asm_volatile_goto(fullop \ + "\n\t"__REFCOUNT_CHECK(size) \ + ";j" #cc " %l[cc_label]" \ + : : [counter] "m" (var), ## __VA_ARGS__ \ + : "memory", "cc", "cx" : cc_label); \ + return 0; \ +cc_label: \ + return 1; \ +} while (0) + +#define GEN_BINARY_CHECKED_RMWcc(op, var, size, vcon, val, arg0, cc) \ + __GEN_CHECKED_RMWcc(op " %1, " arg0, var, size, cc, vcon (val)) + +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + +#define __GEN_CHECKED_RMWcc(fullop, var, size, cc, ...) \ +do { \ + bool c; \ + asm volatile (fullop \ + "\n\t"__REFCOUNT_CHECK(size) \ + ";" CC_SET(cc) \ + : [counter] "+m" (var), CC_OUT(cc) (c) \ + : __VA_ARGS__ : "memory", "cc", "cx"); \ + return c != 0; \ +} while (0) + +#define GEN_BINARY_CHECKED_RMWcc(op, var, size, vcon, val, arg0, cc) \ + __GEN_CHECKED_RMWcc(op " %2, " arg0, var, size, cc, vcon (val)) + +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + +#define GEN_UNARY_CHECKED_RMWcc(op, var, size, arg0, cc) \ + __GEN_CHECKED_RMWcc(op " " arg0, var, size, cc) + +static __always_inline void refcount_add(unsigned int i, refcount_t *r) +{ + asm volatile(LOCK_PREFIX "addl %1,%0\n\t" + REFCOUNT_CHECK_OVERFLOW(4) + : [counter] "+m" (r->refs.counter) + : "ir" (i) + : "cc", "cx"); +} + +static __always_inline void refcount_inc(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "incl %0\n\t" + REFCOUNT_CHECK_OVERFLOW(4) + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline void refcount_dec(refcount_t *r) +{ + asm volatile(LOCK_PREFIX "decl %0\n\t" + REFCOUNT_CHECK_UNDERFLOW(4) + : [counter] "+m" (r->refs.counter) + : : "cc", "cx"); +} + +static __always_inline __must_check +bool refcount_sub_and_test(unsigned int i, refcount_t *r) +{ + GEN_BINARY_CHECKED_RMWcc(LOCK_PREFIX "subl", r->refs.counter, + -4, "er", i, "%0", e); +} + +static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r) +{ + GEN_UNARY_CHECKED_RMWcc(LOCK_PREFIX "decl", 
r->refs.counter, + -4, "%0", e); +} + +static __always_inline __must_check bool refcount_inc_not_zero(refcount_t *r) +{ + const int a = 1; + const int u = 0; + int c, old; + + c = atomic_read(&(r->refs)); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg(&(r->refs), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != u; +} + +#endif diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 01fd0a7f48cd..e4d8db75d85e 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -38,6 +38,10 @@ asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +#ifdef CONFIG_FAST_REFCOUNT +asmlinkage void refcount_error(void); +#endif + #ifdef CONFIG_TRACING asmlinkage void trace_page_fault(void); #define trace_stack_segment stack_segment @@ -54,6 +58,7 @@ asmlinkage void trace_page_fault(void); #define trace_alignment_check alignment_check #define trace_simd_coprocessor_error simd_coprocessor_error #define trace_async_page_fault async_page_fault +#define trace_refcount_error refcount_error #endif dotraplinkage void do_divide_error(struct pt_regs *, long); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4e496379a871..999d324119c0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -192,6 +192,13 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, tsk->thread.trap_nr = trapnr; die(str, regs, error_code); } + +#ifdef CONFIG_FAST_REFCOUNT + if (trapnr == X86_REFCOUNT_VECTOR) { + regs->ip -= 2; /* sizeof(int $xx) */ + refcount_error_report(regs, str); + } +#endif return 0; } @@ -308,6 +315,32 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif +#ifdef CONFIG_FAST_REFCOUNT + +dotraplinkage void do_refcount_error(struct pt_regs *regs, long error_code) +{ + const char *str = NULL; + + BUG_ON(!(regs->flags & X86_EFLAGS_OF)); + +#define range_check(size, direction, 
type, value) \ + if ((unsigned long)__##size##_##direction##_start <= regs->ip && \ + regs->ip < (unsigned long)__##size##_##direction##_end) { \ + *(type *)regs->cx = value; \ + str = #size " " #direction; \ + } + + range_check(refcount, overflow, int, INT_MAX) + range_check(refcount, underflow, int, INT_MIN) + +#undef range_check + + BUG_ON(!str); + do_error_trap(regs, error_code, (char *)str, X86_REFCOUNT_VECTOR, + SIGILL); +} +#endif + #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -983,6 +1016,11 @@ void __init trap_init(void) set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif +#ifdef CONFIG_FAST_REFCOUNT + set_intr_gate(X86_REFCOUNT_VECTOR, refcount_error); + set_bit(X86_REFCOUNT_VECTOR, used_vectors); +#endif + /* * Set the IDT descriptor to a fixed read-only location, so that the * "sidt" instruction will not leak the location of the kernel, and diff --git a/drivers/misc/lkdtm_bugs.c b/drivers/misc/lkdtm_bugs.c index e3f4cd8876b5..1bdafb29b802 100644 --- a/drivers/misc/lkdtm_bugs.c +++ b/drivers/misc/lkdtm_bugs.c @@ -135,9 +135,15 @@ void lkdtm_HUNG_TASK(void) schedule(); } +#ifdef CONFIG_FAST_REFCOUNT +#define REFCOUNT_MAX INT_MAX +#else +#define REFCOUNT_MAX UINT_MAX +#endif + void lkdtm_REFCOUNT_SATURATE_INC(void) { - refcount_t over = REFCOUNT_INIT(UINT_MAX - 1); + refcount_t over = REFCOUNT_INIT(REFCOUNT_MAX - 1); pr_info("attempting good refcount decrement\n"); refcount_dec(&over); @@ -146,7 +152,7 @@ void lkdtm_REFCOUNT_SATURATE_INC(void) pr_info("attempting bad refcount inc overflow\n"); refcount_inc(&over); refcount_inc(&over); - if (refcount_read(&over) == UINT_MAX) + if (refcount_read(&over) == REFCOUNT_MAX) pr_err("Correctly stayed saturated, but no BUG?!\n"); else pr_err("Fail: refcount wrapped\n"); @@ -154,7 +160,7 @@ void lkdtm_REFCOUNT_SATURATE_INC(void) void lkdtm_REFCOUNT_SATURATE_ADD(void) { - refcount_t over = REFCOUNT_INIT(UINT_MAX - 1); + refcount_t over = 
REFCOUNT_INIT(REFCOUNT_MAX - 1); pr_info("attempting good refcount decrement\n"); refcount_dec(&over); @@ -162,7 +168,7 @@ void lkdtm_REFCOUNT_SATURATE_ADD(void) pr_info("attempting bad refcount add overflow\n"); refcount_add(2, &over); - if (refcount_read(&over) == UINT_MAX) + if (refcount_read(&over) == REFCOUNT_MAX) pr_err("Correctly stayed saturated, but no BUG?!\n"); else pr_err("Fail: refcount wrapped\n"); @@ -178,6 +184,11 @@ void lkdtm_REFCOUNT_ZERO_DEC(void) pr_err("Stayed at zero, but no BUG?!\n"); else pr_err("Fail: refcount went crazy\n"); + + pr_info("attempting bad refcount decrement past INT_MIN\n"); + atomic_set(&zero.refs, INT_MIN); + refcount_dec(&zero); + pr_err("Fail: wrap not detected\n"); } void lkdtm_REFCOUNT_ZERO_SUB(void) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 532372c6cf15..0590f384f234 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -20,6 +20,8 @@ * may be out of this range on some architectures. * [_sinittext, _einittext]: contains .init.text.* sections * [__bss_start, __bss_stop]: contains BSS sections + * [__refcount_overflow/underflow_start, ..._end]: contains .text sections + * for refcount error handling. * * Following global variables are optional and may be unavailable on some * architectures and/or kernel configurations. @@ -39,6 +41,8 @@ extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __entry_text_start[], __entry_text_end[]; extern char __start_rodata[], __end_rodata[]; +extern char __refcount_overflow_start[], __refcount_overflow_end[]; +extern char __refcount_underflow_start[], __refcount_underflow_end[]; /* Start and end of .ctors section - used for constructor calls. 
*/ extern char __ctors_start[], __ctors_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 143db9c523e2..a04aae39e820 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -448,9 +448,18 @@ ALIGN_FUNCTION(); \ *(.text.hot .text .text.fixup .text.unlikely) \ *(.ref.text) \ + REFCOUNT_TEXT \ MEM_KEEP(init.text) \ MEM_KEEP(exit.text) \ +#define __REFCOUNT_TEXT(section) \ + VMLINUX_SYMBOL(__##section##_start) = .; \ + *(.text.##section) \ + VMLINUX_SYMBOL(__##section##_end) = .; + +#define REFCOUNT_TEXT \ + __REFCOUNT_TEXT(refcount_overflow) \ + __REFCOUNT_TEXT(refcount_underflow) /* sched.text is aling to function alignment to secure we have same * address even at second ld pass when generating System.map */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c26dc3a8295..bc15822b24eb 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -275,6 +275,8 @@ extern int oops_may_print(void); void do_exit(long error_code) __noreturn; void complete_and_exit(struct completion *, long) __noreturn; +void refcount_error_report(struct pt_regs *regs, const char *kind); + /* Internal, do not use. 
*/ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); int __must_check _kstrtol(const char *s, unsigned int base, long *res); diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0023fee4bbbc..fdb82bcaf975 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -22,6 +22,9 @@ static inline unsigned int refcount_read(const refcount_t *r) return atomic_read(&r->refs); } +#ifdef CONFIG_FAST_REFCOUNT +#include <asm/refcount.h> +#else extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r); extern void refcount_add(unsigned int i, refcount_t *r); @@ -33,6 +36,7 @@ extern void refcount_sub(unsigned int i, refcount_t *r); extern __must_check bool refcount_dec_and_test(refcount_t *r); extern void refcount_dec(refcount_t *r); +#endif extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); diff --git a/kernel/panic.c b/kernel/panic.c index a58932b41700..a1745b60cc36 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -26,6 +26,7 @@ #include <linux/nmi.h> #include <linux/console.h> #include <linux/bug.h> +#include <linux/ratelimit.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -601,6 +602,28 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif +#ifdef CONFIG_FAST_REFCOUNT +static DEFINE_RATELIMIT_STATE(refcount_ratelimit, 15 * HZ, 3); + +void refcount_error_report(struct pt_regs *regs, const char *kind) +{ + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, current, true); + + if (!__ratelimit(&refcount_ratelimit)) + return; + + pr_emerg("%s detected in: %s:%d, uid/euid: %u/%u\n", + kind ? 
kind : "refcount error", + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid()), + from_kuid_munged(&init_user_ns, current_euid())); + print_symbol(KERN_EMERG "refcount error occurred at: %s\n", + instruction_pointer(regs)); + BUG(); +} +EXPORT_SYMBOL(refcount_error_report); +#endif + core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); diff --git a/lib/refcount.c b/lib/refcount.c index aa09ad3c30b0..903a59557893 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -37,6 +37,9 @@ #include <linux/refcount.h> #include <linux/bug.h> +/* Leave out architecture-specific implementations. */ +#ifndef CONFIG_FAST_REFCOUNT + bool refcount_add_not_zero(unsigned int i, refcount_t *r) { unsigned int old, new, val = atomic_read(&r->refs); @@ -168,6 +171,8 @@ void refcount_dec(refcount_t *r) } EXPORT_SYMBOL_GPL(refcount_dec); +#endif /* CONFIG_FAST_REFCOUNT */ + /* * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. @@ -264,4 +269,3 @@ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) return true; } EXPORT_SYMBOL_GPL(refcount_dec_and_lock); - -- 2.7.4 -- Kees Cook Pixel Security