From: Martin KaFai Lau <martin.lau@xxxxxxxxxx>

When a bad bpf prog '.init' calls
bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop:

.init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ...
... => .init => bpf_setsockopt(tcp_cc).

This was prevented by the prog->active counter before, but the
prog->active detection cannot be used in struct_ops, as explained in
an earlier patch of this set.

In this patch, the second bpf_setsockopt(tcp_cc) is not allowed, in
order to break the loop. This is done by checking whether the
previous bpf_run_ctx has saved the same sk pointer in its bpf_cookie.

Note that this essentially means only the first '.init' can call
bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. when the
peer does not support ECN); the second '.init' cannot fall back to
another cc. This applies even if the second
bpf_setsockopt(TCP_CONGESTION) would not cause a loop.

Signed-off-by: Martin KaFai Lau <martin.lau@xxxxxxxxxx>
---
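Notes: not part of the patch, but for reviewers, a minimal
selftests-style sketch of the bad '.init' pattern that the new check
rejects (the "bad_cc" prog and names are hypothetical). With this
patch, the nested bpf_setsockopt() on the same sk returns -EBUSY
instead of recursing back into .init:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* uapi values, normally pulled in via bpf_tcp_helpers.h */
	#define SOL_TCP		6
	#define TCP_CONGESTION	13

	char _license[] SEC("license") = "GPL";

	/* A bpf-tcp-cc whose .init re-selects itself, triggering
	 * .init => bpf_setsockopt(tcp_cc) => .init => ...
	 */
	SEC("struct_ops/bad_cc_init")
	void BPF_PROG(bad_cc_init, struct sock *sk)
	{
		/* The second nested call on the same sk now fails
		 * with -EBUSY instead of looping.
		 */
		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
			       "bad_cc", sizeof("bad_cc"));
	}

	SEC(".struct_ops")
	struct tcp_congestion_ops bad_cc = {
		.init = (void *)bad_cc_init,
		.name = "bad_cc",
	};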
 include/linux/filter.h |  3 +++
 net/core/filter.c      |  4 ++--
 net/ipv4/bpf_tcp_ca.c  | 54 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 98e28126c24b..9942ecc68a45 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -911,6 +911,9 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+int _bpf_setsockopt(struct sock *sk, int level, int optname,
+		    char *optval, int optlen);
+
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
diff --git a/net/core/filter.c b/net/core/filter.c
index f4cea3ff994a..e56a1ebcf1bc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5244,8 +5244,8 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
 	return -EINVAL;
 }
 
-static int _bpf_setsockopt(struct sock *sk, int level, int optname,
-			   char *optval, int optlen)
+int _bpf_setsockopt(struct sock *sk, int level, int optname,
+		    char *optval, int optlen)
 {
 	if (sk_fullsock(sk))
 		sock_owned_by_me(sk);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 6da16ae6a962..a9f2cab5ffbc 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -144,6 +144,57 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_5(bpf_init_ops_setsockopt, struct sock *, sk, int, level,
+	   int, optname, char *, optval, int, optlen)
+{
+	struct bpf_tramp_run_ctx *run_ctx, *saved_run_ctx;
+	int ret;
+
+	if (optname != TCP_CONGESTION)
+		return _bpf_setsockopt(sk, level, optname, optval, optlen);
+
+	run_ctx = (struct bpf_tramp_run_ctx *)current->bpf_ctx;
+	if (unlikely(run_ctx->saved_run_ctx &&
+		     run_ctx->saved_run_ctx->type == BPF_RUN_CTX_TYPE_STRUCT_OPS)) {
+		saved_run_ctx = (struct bpf_tramp_run_ctx *)run_ctx->saved_run_ctx;
+		/* This stops the following loop:
+		 *
+		 * .init => bpf_setsockopt(tcp_cc) => .init =>
+		 * bpf_setsockopt(tcp_cc) => .init => ....
+		 *
+		 * The second bpf_setsockopt(tcp_cc) is not allowed,
+		 * in order to break the loop when both .init
+		 * are the same bpf prog.
+		 *
+		 * This applies even if the second bpf_setsockopt(tcp_cc)
+		 * does not cause a loop. It limits only the first
+		 * '.init' to calling bpf_setsockopt(TCP_CONGESTION) to
+		 * pick a fallback cc (eg. the peer does not support ECN)
+		 * and the second '.init' cannot fall back to
+		 * another cc.
+		 */
+		if (saved_run_ctx->bpf_cookie == (uintptr_t)sk)
+			return -EBUSY;
+	}
+
+	run_ctx->bpf_cookie = (uintptr_t)sk;
+	ret = _bpf_setsockopt(sk, level, optname, optval, optlen);
+	run_ctx->bpf_cookie = 0;
+
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_init_ops_setsockopt_proto = {
+	.func		= bpf_init_ops_setsockopt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
+	.arg5_type	= ARG_CONST_SIZE,
+};
+
 static u32 prog_ops_moff(const struct bpf_prog *prog)
 {
 	const struct btf_member *m;
@@ -169,6 +220,9 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
 	case BPF_FUNC_sk_storage_delete:
 		return &bpf_sk_storage_delete_proto;
 	case BPF_FUNC_setsockopt:
+		if (prog_ops_moff(prog) ==
+		    offsetof(struct tcp_congestion_ops, init))
+			return &bpf_init_ops_setsockopt_proto;
 		/* Does not allow release() to call setsockopt.
 		 * release() is called when the current bpf-tcp-cc
 		 * is retiring. It is not allowed to call
-- 
2.30.2