From: Martin KaFai Lau <martin.lau@xxxxxxxxxx>

When a bad bpf prog '.init' calls
bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop:

.init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ...
... => .init => bpf_setsockopt(tcp_cc).

This was prevented by the prog->active counter before, but the
prog->active detection cannot be used in struct_ops, as explained in
an earlier patch of this set.

In this patch, the second bpf_setsockopt(tcp_cc) is not allowed, in
order to break the loop. This is done by checking whether the
previous bpf_run_ctx has saved the same sk pointer in its bpf_cookie.

Note that this essentially means only the first '.init' can call
bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. when the
peer does not support ECN); the second '.init' cannot fall back to
another cc. This applies even if the second
bpf_setsockopt(TCP_CONGESTION) would not cause a loop.

Signed-off-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
---
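Notes: not part of the patch, but for reviewers, a minimal
selftests-style sketch of the bad '.init' pattern that the new check
rejects (the "bad_cc" prog and names are hypothetical). With this
patch, the nested bpf_setsockopt() on the same sk returns -EBUSY
instead of recursing back into .init:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* uapi values, normally pulled in via bpf_tcp_helpers.h */
	#define SOL_TCP		6
	#define TCP_CONGESTION	13

	char _license[] SEC("license") = "GPL";

	/* A bpf-tcp-cc whose .init re-selects itself, triggering
	 * .init => bpf_setsockopt(tcp_cc) => .init => ...
	 */
	SEC("struct_ops/bad_cc_init")
	void BPF_PROG(bad_cc_init, struct sock *sk)
	{
		/* The second nested call on the same sk now fails
		 * with -EBUSY instead of looping.
		 */
		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
			       "bad_cc", sizeof("bad_cc"));
	}

	SEC(".struct_ops")
	struct tcp_congestion_ops bad_cc = {
		.init = (void *)bad_cc_init,
		.name = "bad_cc",
	};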
 include/linux/filter.h |  3 +++
 net/core/filter.c      |  4 ++--
 net/ipv4/bpf_tcp_ca.c  | 54 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 98e28126c24b..9942ecc68a45 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -911,6 +911,9 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+int _bpf_setsockopt(struct sock *sk, int level, int optname,
+		    char *optval, int optlen);
+
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
diff --git a/net/core/filter.c b/net/core/filter.c
index f4cea3ff994a..e56a1ebcf1bc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5244,8 +5244,8 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
 	return -EINVAL;
 }
 
-static int _bpf_setsockopt(struct sock *sk, int level, int optname,
-			   char *optval, int optlen)
+int _bpf_setsockopt(struct sock *sk, int level, int optname,
+		    char *optval, int optlen)
 {
 	if (sk_fullsock(sk))
 		sock_owned_by_me(sk);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 6da16ae6a962..a9f2cab5ffbc 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -144,6 +144,57 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_5(bpf_init_ops_setsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	struct bpf_tramp_run_ctx *run_ctx, *saved_run_ctx;
+	int ret;
+
+	if (optname != TCP_CONGESTION)
+		return _bpf_setsockopt(sk, level, optname, optval, optlen);
+
+	run_ctx = (struct bpf_tramp_run_ctx *)current->bpf_ctx;
+	if (unlikely(run_ctx->saved_run_ctx &&
+		     run_ctx->saved_run_ctx->type == BPF_RUN_CTX_TYPE_STRUCT_OPS)) {
+		saved_run_ctx = (struct bpf_tramp_run_ctx *)run_ctx->saved_run_ctx;
+		/* This stops the following loop:
+		 *
+		 * .init => bpf_setsockopt(tcp_cc) => .init =>
+		 * bpf_setsockopt(tcp_cc) => .init => ....
+		 *
+		 * The second bpf_setsockopt(tcp_cc) is not allowed,
+		 * in order to break the loop when both .init
+		 * are the same bpf prog.
+		 *
+		 * This applies even if the second bpf_setsockopt(tcp_cc)
+		 * does not cause a loop. It limits only the first
+		 * '.init' to calling bpf_setsockopt(TCP_CONGESTION) to
+		 * pick a fallback cc (eg. the peer does not support ECN)
+		 * and the second '.init' cannot fall back to
+		 * another cc.
+		 */
+		if (saved_run_ctx->bpf_cookie == (uintptr_t)sk)
+			return -EBUSY;
+	}
+
+	run_ctx->bpf_cookie = (uintptr_t)sk;
+	ret = _bpf_setsockopt(sk, level, optname, optval, optlen);
+	run_ctx->bpf_cookie = 0;
+
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_init_ops_setsockopt_proto = {
+	.func		= bpf_init_ops_setsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 static u32 prog_ops_moff(const struct bpf_prog *prog)
 {
 	const struct btf_member *m;
@@ -169,6 +220,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
 	case BPF_FUNC_setsockopt:
+		if (prog_ops_moff(prog) ==
+		    offsetof(struct tcp_congestion_ops, init))
+			return &bpf_init_ops_setsockopt_proto;
 		/* Does not allow release() to call setsockopt.
 		 * release() is called when the current bpf-tcp-cc
 		 * is retiring. It is not allowed to call
-- 
2.30.2