In container environments, eBPF helpers can be used maliciously to leak
kernel information, mount denial-of-service attacks, or even escape from
the container. CONFIG_BPF_HELPER_STRICT mitigates this by letting each
task opt in to only the restricted helpers it actually needs.

Link: https://rolandorange.zone/report.html
Signed-off-by: Roland <kernel.pwn@xxxxxxxxxxx>
---
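Note for reviewers: below is a minimal userspace sketch (not part of the
patch) showing how a container runtime could restrict a task to a single
helper before handing control to untrusted code. It assumes the uapi
additions in this patch; the bit value is mirrored by hand because
BPF_PROBE_READ_BIT lives in the kernel-internal include/linux/kernel.h,
and allow_probe_read_only() is an illustrative name:

  #include <linux/bpf.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Mirrors BPF_PROBE_READ_BIT from include/linux/kernel.h */
  #define BPF_PROBE_READ_BIT 2

  /* Allow only bpf_probe_read() for the calling task; probe write,
   * send_signal and override_return stay disabled for it and for any
   * child forked without CLONE_BITFIELD.
   */
  static int allow_probe_read_only(void)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.security_helper_bits = 1U << BPF_PROBE_READ_BIT;

          return syscall(__NR_bpf, BPF_HELPER_BITS_SET, &attr, sizeof(attr));
  }

With the bits left at zero, all four restricted helpers fail with -EPERM
for the task.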
 include/linux/kernel.h     |  8 ++++++++
 include/linux/sched.h      |  4 +++-
 include/uapi/linux/bpf.h   |  5 +++++
 include/uapi/linux/sched.h |  5 +++++
 kernel/bpf/Kconfig         | 11 +++++++++++
 kernel/bpf/syscall.c       | 49 ++++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c              | 13 +++++++++++++
 kernel/trace/bpf_trace.c   | 25 ++++++++++++++++++++-
 8 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index fe6efb24d..61f7fcbc9 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -509,3 +509,11 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 	 BUILD_BUG_ON_ZERO((perms) & 2) +				\
 	 (perms))
 #endif
+
+#ifdef CONFIG_BPF_HELPER_STRICT
+/* Bit positions in task_struct::bpf_helper_bitfield */
+# define BPF_PROBE_WRITE_BIT		1
+# define BPF_PROBE_READ_BIT		2
+# define BPF_SEND_SIGNAL_BIT		3
+# define BPF_OVERRIDE_RETURN_BIT	4
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffb6eb55c..a3ec52056 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -760,7 +760,9 @@ struct task_struct {
 	/* Per task flags (PF_*), defined further below: */
 	unsigned int			flags;
 	unsigned int			ptrace;
-
+#ifdef CONFIG_BPF_HELPER_STRICT
+	unsigned int			bpf_helper_bitfield;
+#endif
 #ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 51b9aa640..99a90d0f5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -900,6 +900,7 @@ enum bpf_cmd {
 	BPF_ITER_CREATE,
 	BPF_LINK_DETACH,
 	BPF_PROG_BIND_MAP,
+	BPF_HELPER_BITS_SET,
 };
 
 enum bpf_map_type {
@@ -1326,6 +1327,10 @@ union bpf_attr {
 		 * to using 5 hash functions).
 		 */
 		__u64	map_extra;
 	};
 
+	struct { /* anonymous struct used by BPF_HELPER_BITS_SET command */
+		__u32	security_helper_bits;
+	};
+
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ce..c2fd463be 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -43,6 +43,11 @@
  */
 #define CLONE_NEWTIME	0x00000080	/* New time namespace */
 
+/* Share bpf_helper_bitfield with the child. The value overlaps CSIGNAL,
+ * so, like CLONE_NEWTIME above, it can only be passed via clone3().
+ */
+#define CLONE_BITFIELD	0x00000040
+
 #ifndef __ASSEMBLY__
 /**
  * struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 2dfe1079f..c2c2fcf76 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -99,4 +99,15 @@ config BPF_LSM
 
 	  If you are unsure how to answer this question, answer N.
 
+config BPF_HELPER_STRICT
+	bool "Enable BPF helper check bits"
+	depends on BPF_SYSCALL
+	help
+	  Enable per-task check bits that restrict what the more dangerous
+	  BPF helpers are allowed to do. Currently the override return,
+	  probe read, probe write and send signal helpers can be
+	  restricted.
+
+	  If you are unsure how to answer this question, answer N.
+
 endmenu # "BPF subsystem"
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7b373a5e8..2f05ea50f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -68,6 +68,50 @@ static const struct bpf_map_ops * const bpf_map_types[] = {
 #undef BPF_LINK_TYPE
 };
 
+#ifdef CONFIG_BPF_HELPER_STRICT
+static __always_inline bool helper_write_requested(unsigned int bits)
+{
+	return bits & (1U << BPF_PROBE_WRITE_BIT);
+}
+
+static __always_inline bool helper_read_requested(unsigned int bits)
+{
+	return bits & (1U << BPF_PROBE_READ_BIT);
+}
+
+static __always_inline bool helper_send_signal_requested(unsigned int bits)
+{
+	return bits & (1U << BPF_SEND_SIGNAL_BIT);
+}
+
+static __always_inline bool helper_override_return_requested(unsigned int bits)
+{
+	return bits & (1U << BPF_OVERRIDE_RETURN_BIT);
+}
+
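+/* BPF_HELPER_BITS_SET: build the calling task's bpf_helper_bitfield
+ * from the mask in attr->security_helper_bits. Helpers whose bit is
+ * clear return -EPERM for this task (see kernel/trace/bpf_trace.c).
+ */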
+static int bpf_set_security_helper_bits(union bpf_attr *attr)
+{
+	unsigned int bits_to_set = attr->security_helper_bits;
+	unsigned int bitfield = 0;
+
+	if (helper_write_requested(bits_to_set))
+		bitfield |= 1U << BPF_PROBE_WRITE_BIT;
+	if (helper_read_requested(bits_to_set))
+		bitfield |= 1U << BPF_PROBE_READ_BIT;
+	if (helper_send_signal_requested(bits_to_set))
+		bitfield |= 1U << BPF_SEND_SIGNAL_BIT;
+	if (helper_override_return_requested(bits_to_set))
+		bitfield |= 1U << BPF_OVERRIDE_RETURN_BIT;
+
+	current->bpf_helper_bitfield = bitfield;
+	return 0;
+}
+#endif
+
 /*
  * If we're handed a bigger struct than we know of, ensure all the unknown bits
  * are 0 - i.e. new user-space does not rely on any kernel feature extensions
@@ -4925,7 +4969,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	 * and other operations.
 	 */
 	if (!capable &&
-	    (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD))
+	    (cmd == BPF_MAP_CREATE || cmd == BPF_PROG_LOAD || cmd == BPF_HELPER_BITS_SET))
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
@@ -5056,6 +5100,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_PROG_BIND_MAP:
 		err = bpf_prog_bind_map(&attr);
 		break;
+	case BPF_HELPER_BITS_SET:
+		err = bpf_set_security_helper_bits(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa..6168da0b8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1998,6 +1998,10 @@ static __latent_entropy struct task_struct *copy_process(
 	const u64 clone_flags = args->flags;
 	struct nsproxy *nsp = current->nsproxy;
 
+#ifdef CONFIG_BPF_HELPER_STRICT
+	unsigned int bitfield = current->bpf_helper_bitfield;
+#endif
+
 	/*
 	 * Don't allow sharing the root directory with processes in a different
 	 * namespace
@@ -2490,6 +2494,15 @@ static __latent_entropy struct task_struct *copy_process(
 
 	copy_oom_score_adj(clone_flags, p);
 
+#ifdef CONFIG_BPF_HELPER_STRICT
+	/* The child inherits the bitfield only if CLONE_BITFIELD was
+	 * explicitly set or the forking task has both CAP_BPF and
+	 * CAP_SYS_ADMIN; otherwise it starts with all helpers disabled.
+	 */
+	p->bpf_helper_bitfield = (clone_flags & CLONE_BITFIELD) ? bitfield : 0;
+	if (capable(CAP_BPF) && capable(CAP_SYS_ADMIN))
+		p->bpf_helper_bitfield = bitfield;
+#endif
 	return p;
 
 bad_fork_cancel_cgroup:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1ed08967f..5c4232d45 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -39,6 +39,14 @@
 #define bpf_event_rcu_dereference(p)					\
 	rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
 
+#ifdef CONFIG_BPF_HELPER_STRICT
+/* Return true if @bit is enabled in the current task's helper bitfield */
+static bool check_bpf_bitfield(unsigned int bit)
+{
+	return current->bpf_helper_bitfield & (1U << bit);
+}
+#endif
+
 #ifdef CONFIG_MODULES
 struct bpf_trace_module {
 	struct module *module;
@@ -145,6 +153,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
 BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
 {
+#ifdef CONFIG_BPF_HELPER_STRICT
+	if (unlikely(!check_bpf_bitfield(BPF_OVERRIDE_RETURN_BIT)))
+		return -EPERM;
+#endif
 	regs_set_return_value(regs, rc);
 	override_function_with_return(regs);
 	return 0;
@@ -287,6 +295,10 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
 BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
 	   const void *, unsafe_ptr)
 {
+#ifdef CONFIG_BPF_HELPER_STRICT
+	if (unlikely(!check_bpf_bitfield(BPF_PROBE_READ_BIT)))
+		return -EPERM;
+#endif
 	if ((unsigned long)unsafe_ptr < TASK_SIZE) {
 		return bpf_probe_read_user_common(dst, size,
 				(__force void __user *)unsafe_ptr);
@@ -338,7 +346,10 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
 	 * state, when the task or mm are switched. This is specifically
 	 * required to prevent the use of temporary mm.
 	 */
-
+#ifdef CONFIG_BPF_HELPER_STRICT
+	if (unlikely(!check_bpf_bitfield(BPF_PROBE_WRITE_BIT)))
+		return -EPERM;
+#endif
 	if (unlikely(in_interrupt() ||
 		     current->flags & (PF_KTHREAD | PF_EXITING)))
 		return -EPERM;
@@ -843,6 +862,10 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
 	 * permitted in order to send signal to the current
 	 * task.
 	 */
+#ifdef CONFIG_BPF_HELPER_STRICT
+	if (unlikely(!check_bpf_bitfield(BPF_SEND_SIGNAL_BIT)))
+		return -EPERM;
+#endif
 	if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
 		return -EPERM;
 	if (unlikely(!nmi_uaccess_okay()))
-- 
2.25.1