From: Kui-Feng Lee <kuifeng@xxxxxxxx>

Enable sleepable cgroup/{get,set}sockopt hooks.

A sleepable BPF program attached to the cgroup/{get,set}sockopt hooks
may receive a pointer to the optval in user space instead of a kernel
copy. ctx->user_optval and ctx->user_optval_end point to the beginning
and end of the user space buffer when such a buffer is passed in, while
ctx->optval and ctx->optval_end hold a kernel copy when a kernel space
buffer is passed. A program receives a user space buffer if
ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER is set; otherwise, it
receives a kernel space buffer. BPF programs should not read from or
write to a user space buffer directly; they should access it through
bpf_copy_from_user() and bpf_copy_to_user(), provided in the following
patches.

Signed-off-by: Kui-Feng Lee <kuifeng@xxxxxxxx>
---
 include/linux/filter.h         |   3 +
 include/uapi/linux/bpf.h       |   9 ++
 kernel/bpf/cgroup.c            | 189 ++++++++++++++++++++++++++-------
 kernel/bpf/verifier.c          |   7 +-
 tools/include/uapi/linux/bpf.h |   9 ++
 tools/lib/bpf/libbpf.c         |   2 +
 6 files changed, 176 insertions(+), 43 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index f69114083ec7..301dd1ba0de1 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1345,6 +1345,9 @@ struct bpf_sockopt_kern {
 	s32		level;
 	s32		optname;
 	s32		optlen;
+	u32		flags;
+	u8		*user_optval;
+	u8		*user_optval_end;
 	/* for retval in struct bpf_cg_run_ctx */
 	struct task_struct *current_task;
 	/* Temporary "register" for indirect stores to ppos. */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 739c15906a65..b2f81193f97b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7135,6 +7135,15 @@ struct bpf_sockopt {
 	__s32	optname;
 	__s32	optlen;
 	__s32	retval;
+
+	__bpf_md_ptr(void *, user_optval);
+	__bpf_md_ptr(void *, user_optval_end);
+	__u32	flags;
+};
+
+enum bpf_sockopt_flags {
+	/* optval is a pointer to user space memory */
+	BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
 };
 
 struct bpf_pidns_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..b268bbfa6c53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 	const struct bpf_prog_array *array;
 	struct bpf_run_ctx *old_run_ctx;
 	struct bpf_cg_run_ctx run_ctx;
+	bool do_sleepable;
 	u32 func_ret;
 
+	do_sleepable =
+		atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
+
 	run_ctx.retval = retval;
 	migrate_disable();
-	rcu_read_lock();
+	if (do_sleepable) {
+		might_fault();
+		rcu_read_lock_trace();
+	} else
+		rcu_read_lock();
 	array = rcu_dereference(cgrp->effective[atype]);
 	item = &array->items[0];
 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
 	while ((prog = READ_ONCE(item->prog))) {
+		if (do_sleepable && !prog->aux->sleepable)
+			rcu_read_lock();
+
 		run_ctx.prog_item = item;
 		func_ret = run_prog(prog, ctx);
 		if (ret_flags) {
@@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
 			run_ctx.retval = -EPERM;
 		item++;
+
+		if (do_sleepable && !prog->aux->sleepable)
+			rcu_read_unlock();
 	}
 	bpf_reset_run_ctx(old_run_ctx);
-	rcu_read_unlock();
+	if (do_sleepable)
+		rcu_read_unlock_trace();
+	else
+		rcu_read_unlock();
 	migrate_enable();
 	return run_ctx.retval;
 }
 
+static __always_inline bool
+has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
+			   enum cgroup_bpf_attach_type atype)
+{
+	const struct bpf_prog_array_item *item;
+	const struct bpf_prog *prog;
+	int cnt = 0;
+	bool ret = true;
+
+	rcu_read_lock();
+	item = &rcu_dereference(cgrp->effective[atype])->items[0];
+	while (ret && (prog = READ_ONCE(item->prog))) {
+		if (!prog->aux->sleepable)
+			ret = false;
+		item++;
+		cnt++;
+	}
+	rcu_read_unlock();
+	if (cnt == 0)
+		ret = false;
+
+	return ret;
+}
+
 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
 				       const struct bpf_insn *insn)
 {
@@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
 			     struct bpf_sockopt_buf *buf)
 {
-	if (ctx->optval == buf->data)
+	if (ctx->optval == buf->data ||
+	    ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
 		return;
 	kfree(ctx->optval);
 }
@@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
 				  struct bpf_sockopt_buf *buf)
 {
-	return ctx->optval != buf->data;
+	return ctx->optval != buf->data &&
+		!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
 }
 
 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 		.optname = *optname,
 	};
 	int ret, max_optlen;
+	bool alloc_mem;
+
+	alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);
+	if (!alloc_mem) {
+		max_optlen = *optlen;
+		ctx.optlen = *optlen;
+		ctx.user_optval = optval;
+		ctx.user_optval_end = optval + *optlen;
+		ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		/* Allocate a bit more than the initial user buffer for
+		 * BPF program. The canonical use case is overriding
+		 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+		 */
+		max_optlen = max_t(int, 16, *optlen);
+		max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+		if (max_optlen < 0)
+			return max_optlen;
 
-	/* Allocate a bit more than the initial user buffer for
-	 * BPF program. The canonical use case is overriding
-	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
-	 */
-	max_optlen = max_t(int, 16, *optlen);
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
-	if (max_optlen < 0)
-		return max_optlen;
-
-	ctx.optlen = *optlen;
+		ctx.optlen = *optlen;
 
-	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
-		ret = -EFAULT;
-		goto out;
+		if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
 	}
 
 	lock_sock(sk);
@@ -1824,7 +1877,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	if (ctx.optlen == -1) {
 		/* optlen set to -1, bypass kernel */
 		ret = 1;
-	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
+	} else if (alloc_mem &&
+		   (ctx.optlen > max_optlen || ctx.optlen < -1)) {
 		/* optlen is out of bounds */
 		if (*optlen > PAGE_SIZE && ctx.optlen >= 0) {
 			pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n",
@@ -1846,6 +1900,8 @@
 	 */
 	if (ctx.optlen != 0) {
 		*optlen = ctx.optlen;
+		if (ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
+			return 0;
 		/* We've used bpf_sockopt_kern->buf as an intermediary
 		 * storage, but the BPF program indicates that we need
 		 * to pass this data to the kernel setsockopt handler.
@@ -1892,33 +1948,59 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	orig_optlen = max_optlen;
 	ctx.optlen = max_optlen;
 
-	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
-	if (max_optlen < 0)
-		return max_optlen;
+	if (has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_GETSOCKOPT)) {
+		if (!retval) {
+			/* If kernel getsockopt finished successfully,
+			 * copy whatever was returned to the user back
+			 * into our temporary buffer. Set optlen to the
+			 * one that kernel returned as well to let
+			 * BPF programs inspect the value.
+			 */
+			if (get_user(ctx.optlen, optlen)) {
+				ret = -EFAULT;
+				goto out;
+			}
 
-	if (!retval) {
-		/* If kernel getsockopt finished successfully,
-		 * copy whatever was returned to the user back
-		 * into our temporary buffer. Set optlen to the
-		 * one that kernel returned as well to let
-		 * BPF programs inspect the value.
-		 */
-
-		if (get_user(ctx.optlen, optlen)) {
-			ret = -EFAULT;
-			goto out;
+			if (ctx.optlen < 0) {
+				ret = -EFAULT;
+				goto out;
+			}
+			orig_optlen = ctx.optlen;
 		}
 
-		if (ctx.optlen < 0) {
-			ret = -EFAULT;
-			goto out;
-		}
-		orig_optlen = ctx.optlen;
+		ctx.user_optval = optval;
+		ctx.user_optval_end = optval + ctx.optlen;
+		ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+		if (max_optlen < 0)
+			return max_optlen;
+
+		if (!retval) {
+			/* If kernel getsockopt finished successfully,
+			 * copy whatever was returned to the user back
+			 * into our temporary buffer. Set optlen to the
+			 * one that kernel returned as well to let
+			 * BPF programs inspect the value.
+			 */
+			if (get_user(ctx.optlen, optlen)) {
+				ret = -EFAULT;
+				goto out;
+			}
+
+			if (ctx.optlen < 0) {
+				ret = -EFAULT;
+				goto out;
+			}
+			orig_optlen = ctx.optlen;
 
-		if (copy_from_user(ctx.optval, optval,
-				   min(ctx.optlen, max_optlen)) != 0) {
-			ret = -EFAULT;
-			goto out;
+			if (copy_from_user(ctx.optval, optval,
+					   min(ctx.optlen, max_optlen)) != 0) {
+				ret = -EFAULT;
+				goto out;
+			}
 		}
 	}
 
@@ -1942,7 +2024,9 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
 	}
 
 	if (ctx.optlen != 0) {
-		if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+		if (optval &&
+		    !(ctx.flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
+		    copy_to_user(optval, ctx.optval, ctx.optlen)) {
 			ret = -EFAULT;
 			goto out;
 		}
@@ -2388,6 +2472,20 @@ static bool cg_sockopt_is_valid_access(int off, int size,
 		if (size != size_default)
 			return false;
 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	case offsetof(struct bpf_sockopt, user_optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, user_optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, flags):
+		if (size != sizeof(__u32))
+			return false;
+		break;
 	default:
 		if (size != size_default)
 			return false;
@@ -2481,6 +2579,15 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sockopt, optval_end):
 		*insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
 		break;
+	case offsetof(struct bpf_sockopt, user_optval):
+		*insn++ = CG_SOCKOPT_READ_FIELD(user_optval);
+		break;
+	case offsetof(struct bpf_sockopt, user_optval_end):
+		*insn++ = CG_SOCKOPT_READ_FIELD(user_optval_end);
+		break;
+	case offsetof(struct bpf_sockopt, flags):
+		*insn++ = CG_SOCKOPT_READ_FIELD(flags);
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 803b91135ca0..8cb25a775119 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19299,9 +19299,11 @@ static bool can_be_sleepable(struct bpf_prog *prog)
 			return false;
 		}
 	}
+
 	return prog->type == BPF_PROG_TYPE_LSM ||
 	       prog->type == BPF_PROG_TYPE_KPROBE /* only for uprobes */ ||
-	       prog->type == BPF_PROG_TYPE_STRUCT_OPS;
+	       prog->type == BPF_PROG_TYPE_STRUCT_OPS ||
+	       prog->type == BPF_PROG_TYPE_CGROUP_SOCKOPT;
 }
 
 static int check_attach_btf_id(struct bpf_verifier_env *env)
@@ -19323,7 +19325,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	}
 
 	if (prog->aux->sleepable && !can_be_sleepable(prog)) {
-		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable\n");
+		verbose(env, "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, "
+			"cgroup, and struct_ops programs can be sleepable\n");
 		return -EINVAL;
 	}
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 739c15906a65..b2f81193f97b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7135,6 +7135,15 @@ struct bpf_sockopt {
 	__s32	optname;
 	__s32	optlen;
 	__s32	retval;
+
+	__bpf_md_ptr(void *, user_optval);
+	__bpf_md_ptr(void *, user_optval_end);
+	__u32	flags;
+};
+
+enum bpf_sockopt_flags {
+	/* optval is a pointer to user space memory */
+	BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
 };
 
 struct bpf_pidns_info {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 17883f5a44b9..3be9270bbc33 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8766,7 +8766,9 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("cgroup/getsockname6",	CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/sysctl",	CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE),
 	SEC_DEF("cgroup/getsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE),
+	SEC_DEF("cgroup/getsockopt.s",	CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
 	SEC_DEF("cgroup/setsockopt",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE),
+	SEC_DEF("cgroup/setsockopt.s",	CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLEEPABLE),
 	SEC_DEF("cgroup/dev",		CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT),
 	SEC_DEF("struct_ops+",		STRUCT_OPS, 0, SEC_NONE),
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
-- 
2.34.1
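
A minimal usage sketch, not part of the patch, of how a sleepable
setsockopt program could consume the new fields. It assumes the uapi
changes above, the "cgroup/setsockopt.s" section added to libbpf here,
and that bpf_copy_from_user() is callable from this attach type once
the rest of the series lands; the program name, the 16-byte buffer,
and the reject-on-fault policy are illustrative choices, and the
write-back path is only a comment because bpf_copy_to_user() arrives
in a later patch:

/* SPDX-License-Identifier: GPL-2.0 */
/* Illustrative only; not part of this patch. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt.s")
int sleepable_setsockopt(struct bpf_sockopt *ctx)
{
	char buf[16] = {};
	int len = ctx->optlen;

	/* Kernel copy: ctx->optval/ctx->optval_end work as before. */
	if (!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER))
		return 1;

	if (len <= 0 || len > (int)sizeof(buf))
		return 1;

	/* Do not dereference ctx->user_optval directly; pull the
	 * option value in with the sleepable helper instead.
	 */
	if (bpf_copy_from_user(buf, len, ctx->user_optval))
		return 0;	/* fault: reject the setsockopt() */

	/* Inspect buf here. Rewriting the value in place would use
	 * bpf_copy_to_user(), added later in this series.
	 */
	return 1;
}

char _license[] SEC("license") = "GPL";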