From: Kui-Feng Lee <kuifeng@xxxxxxxx>

Enable sleepable cgroup/{get,set}sockopt hooks.

A sleepable BPF program attached to the cgroup/{get,set}sockopt hooks
may receive a pointer to the optval in user space instead of a kernel
copy. ctx->user_optval and ctx->user_optval_end point to the beginning
and end of the user space buffer when such a buffer is passed in, while
ctx->optval and ctx->optval_end hold a kernel copy when a kernel space
buffer is passed. A program receives a user space buffer if
ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER is set; otherwise, it
receives a kernel space buffer. BPF programs should not read from or
write to a user space buffer directly; they should access it through
bpf_copy_from_user() and bpf_copy_to_user(), provided in the following
patches.

Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxxxx>
---
 include/linux/filter.h         |   3 +
 include/uapi/linux/bpf.h       |   9 ++
 kernel/bpf/cgroup.c            | 189 ++++++++++++++++++++++++++-------
 kernel/bpf/verifier.c          |   7 +-
 tools/include/uapi/linux/bpf.h |   9 ++
 tools/lib/bpf/libbpf.c         |   2 +
 6 files changed, 176 insertions(+), 43 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index f69114083ec7..301dd1ba0de1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1345,6 +1345,9 @@ struct bpf_sockopt_kern {
 	s32		level;
 	s32		optname;
 	s32		optlen;
+	u32		flags;
+	u8		*user_optval;
+	u8		*user_optval_end;
 	/* for retval in struct bpf_cg_run_ctx */
 	struct task_struct *current_task;
 	/* Temporary "register" for indirect stores to ppos. */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 739c15906a65..b2f81193f97b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7135,6 +7135,15 @@ struct bpf_sockopt {
 	__s32	optname;
 	__s32	optlen;
 	__s32	retval;
+
+	__bpf_md_ptr(void *, user_optval);
+	__bpf_md_ptr(void *, user_optval_end);
+	__u32	flags;
+};
+
+enum bpf_sockopt_flags {
+	/* optval is a pointer to user space memory */
+	BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
 };
 
 struct bpf_pidns_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..b268bbfa6c53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 	const struct bpf_prog_array *array;
 	struct bpf_run_ctx *old_run_ctx;
 	struct bpf_cg_run_ctx run_ctx;
+	bool do_sleepable;
 	u32 func_ret;
 
+	do_sleepable =
+		atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
+
 	run_ctx.retval = retval;
 	migrate_disable();
-	rcu_read_lock();
+	if (do_sleepable) {
+		might_fault();
+		rcu_read_lock_trace();
+	} else
+		rcu_read_lock();
 	array = rcu_dereference(cgrp->effective[atype]);
 	item = &array->items[0];
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	while ((prog = READ_ONCE(item->prog))) {
+		if (do_sleepable && !prog->aux->sleepable)
+			rcu_read_lock();
+
 		run_ctx.prog_item = item;
 		func_ret = run_prog(prog, ctx);
 		if (ret_flags) {
@@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
 			run_ctx.retval = -EPERM;
 		item++;
+
+		if (do_sleepable && !prog->aux->sleepable)
+			rcu_read_unlock();
 	}
 	bpf_reset_run_ctx(old_run_ctx);
-	rcu_read_unlock();
+	if (do_sleepable)
+		rcu_read_unlock_trace();
+	else
+		rcu_read_unlock();
 	migrate_enable();
 	return run_ctx.retval;
 }
 
+static __always_inline bool
+has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
+			   enum cgroup_bpf_attach_type atype)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	int cnt = 0;
+	bool ret = true;
+
+	rcu_read_lock();
+	item = &rcu_dereference(cgrp->effective[atype])->items[0];
+	while (ret && (prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable)
+			ret = false;
+		item++;
+		cnt++;
+	}
+	rcu_read_unlock();
+	if (cnt == 0)
+		ret = false;
+
+	return ret;
+}
+
 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
 				       const struct bpf_insn *insn)
 {
@@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
 			     struct bpf_sockopt_buf *buf)
 {
-	if (ctx->optval == buf->data)
+	if (ctx->optval == buf->data ||
+	    ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
 		return;
 	kfree(ctx->optval);
 }
@@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
 				  struct bpf_sockopt_buf *buf)
 {
-	return ctx->optval != buf->data;
+	return ctx->optval != buf->data &&
+		!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		.optname = *optname,
 	};
 	int ret, max_optlen;
+	bool alloc_mem;
+
+	alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);
+	if (!alloc_mem) {
+		max_optlen = *optlen;
+		ctx.optlen = *optlen;
+		ctx.user_optval = optval;
+		ctx.user_optval_end = optval + *optlen;
+		ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		/* Allocate a bit more than the initial user buffer for
+		 * BPF program. The canonical use case is overriding
+		 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+		 */
+		max_optlen = max_t(int, 16, *optlen);
+		max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+		if (max_optlen < 0)
+			return max_optlen;
 
-	/* Allocate a bit more than the initial user buffer for
-	 * BPF program. The canonical use case is overriding
-	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
-	 */
-	max_optlen = max_t(int, 16, *optlen);
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
-	if (max_optlen < 0)
-		return max_optlen;
-
-	ctx.optlen = *optlen;
+		ctx.optlen = *optlen;
 
-	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
-		ret = -EFAULT;
-		goto out;
+		if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
 	}
 
 	lock_sock(sk);
@@ -1824,7 +1877,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	if (ctx.optlen == -1) {
 		/* optlen set to -1, bypass kernel */
 		ret = 1;
-	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
+	} else if (alloc_mem &&
+		   (ctx.optlen > max_optlen || ctx.optlen < -1)) {
 		/* optlen is out of bounds */
 		if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
 			pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
@@ -1846,6 +1900,8 @@
 	 */
 	if (ctx.optlen != 0) {
 		*optlen = ctx.optlen;
+		if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
+			return 0;
 		/* We've used bpf_sockopt_kern->buf as an intermediary
 		 * storage, but the BPF program indicates that we need
 		 * to pass this data to the kernel setsockopt handler.
@@ -1892,33 +1948,59 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	orig_optlen = max_optlen;
 	ctx.optlen = max_optlen;
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
-	if (max_optlen < 0)
-		return max_optlen;
+	if (has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_GETSOCKOPT)) {
+		if (!retval) {
+			/* If kernel getsockopt finished successfully,
+			 * copy whatever was returned to the user back
+			 * into our temporary buffer. Set optlen to the
+			 * one that kernel returned as well to let
+			 * BPF programs inspect the value.
+			 */
+			if (get_user(ctx.optlen, optlen)) {
+				ret = -EFAULT;
+				goto out;
+			}
 
-	if (!retval) {
-		/* If kernel getsockopt finished successfully,
-		 * copy whatever was returned to the user back
-		 * into our temporary buffer. Set optlen to the
-		 * one that kernel returned as well to let
-		 * BPF programs inspect the value.
-		 */
-
-		if (get_user(ctx.optlen, optlen)) {
-			ret = -EFAULT;
-			goto out;
+			if (ctx.optlen < 0) {
+				ret = -EFAULT;
+				goto out;
+			}
+			orig_optlen = ctx.optlen;
 		}
 
-		if (ctx.optlen < 0) {
-			ret = -EFAULT;
-			goto out;
-		}
-		orig_optlen = ctx.optlen;
+		ctx.user_optval = optval;
+		ctx.user_optval_end = optval + ctx.optlen;
+		ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+		if (max_optlen < 0)
+			return max_optlen;
+
+		if (!retval) {
+			/* If kernel getsockopt finished successfully,
+			 * copy whatever was returned to the user back
+			 * into our temporary buffer. Set optlen to the
+			 * one that kernel returned as well to let
+			 * BPF programs inspect the value.
+			 */
+			if (get_user(ctx.optlen, optlen)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			if (ctx.optlen < 0) {
+				ret = -EFAULT;
+				goto out;
+			}
+			orig_optlen = ctx.optlen;
 
-		if (copy_from_user(ctx.optval, optval,
-				   min(ctx.optlen, max_optlen)) != 0) {
-			ret = -EFAULT;
-			goto out;
+			if (copy_from_user(ctx.optval, optval,
+					   min(ctx.optlen, max_optlen)) != 0) {
+				ret = -EFAULT;
+				goto out;
+			}
 		}
 	}
 
@@ -1942,7 +2024,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	if (ctx.optlen != 0) {
-		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+		if (optval &&
+		    !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
+		    copy_to_user(optval, ctx.optval, ctx.optlen)) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -2388,6 +2472,20 @@ static bool cg_sockopt_is_valid_access(int off, int size,
 		if (size != size_default)
 			return false;
 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	case offsetof(struct bpf_sockopt, user_optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, user_optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, flags):
+		if (size != sizeof(__u32))
+			return false;
+		break;
 	default:
 		if (size != size_default)
 			return false;
@@ -2481,6 +2579,15 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sockopt, optval_end):
 		*insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
 		break;
+	case offsetof(struct bpf_sockopt, user_optval):
+		*insn++ = CG_SOCKOPT_READ_FIELD(user_optval);
+		break;
+	case offsetof(struct bpf_sockopt, user_optval_end):
+		*insn++ = CG_SOCKOPT_READ_FIELD(user_optval_end);
+		break;
+	case offsetof(struct bpf_sockopt, flags):
+		*insn++ = CG_SOCKOPT_READ_FIELD(flags);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 803b91135ca0..8cb25a775119 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19299,9 +19299,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 			return false;
 		}
 	}
+
 	return prog->type == BPF_PROG_TYPE_LSM ||
 	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
-	       prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+	       prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+	       prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
 }
 
 static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19323,7 +19325,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 
 	if (prog->aux->sleepable && !can_be_sleepable(prog)) {
-		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, "
+			"cgroup, and struct_ops programs can be sleepable\n");
 		return -EINVAL;
 	}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 739c15906a65..b2f81193f97b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7135,6 +7135,15 @@ struct bpf_sockopt {
 	__s32	optname;
 	__s32	optlen;
 	__s32	retval;
+
+	__bpf_md_ptr(void *, user_optval);
+	__bpf_md_ptr(void *, user_optval_end);
+	__u32	flags;
+};
+
+enum bpf_sockopt_flags {
+	/* optval is a pointer to user space memory */
+	BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
 };
 
 struct bpf_pidns_info {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 17883f5a44b9..3be9270bbc33 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8766,7 +8766,9 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("cgroup/getsockname6",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/sysctl",	CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/getsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE),
+	SEC_DEF("cgroup/getsockopt.s",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
 	SEC_DEF("cgroup/setsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
+	SEC_DEF("cgroup/setsockopt.s",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
 	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
 	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
-- 
2.34.1
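
A minimal usage sketch, not part of the patch, of how a sleepable
setsockopt program could consume the new fields. It assumes the uapi
changes above, the "cgroup/setsockopt.s" section added to libbpf here,
and that bpf_copy_from_user() is callable from this attach type once
the rest of the series lands; the program name, the 16-byte buffer,
and the reject-on-fault policy are illustrative choices, and the
write-back path is only a comment because bpf_copy_to_user() arrives
in a later patch:

/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative only; not part of this patch. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt.s")
int sleepable_setsockopt(struct bpf_sockopt *ctx)
{
	char buf[16] = {};
	int len = ctx->optlen;

	/* Kernel copy: ctx->optval/ctx->optval_end work as before. */
	if (!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER))
		return 1;

	if (len <= 0 || len > (int)sizeof(buf))
		return 1;

	/* Do not dereference ctx->user_optval directly; pull the
	 * option value in with the sleepable helper instead.
	 */
	if (bpf_copy_from_user(buf, len, ctx->user_optval))
		return 0;	/* fault: reject the setsockopt() */

	/* Inspect buf here. Rewriting the value in place would use
	 * bpf_copy_to_user(), added later in this series.
	 */
	return 1;
}

char _license[] SEC("license") = "GPL";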