On 7/21/23 10:22 PM, kuifeng@xxxxxxxx wrote:
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 739c15906a65..b2f81193f97b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7135,6 +7135,15 @@ struct bpf_sockopt {
__s32 optname;
__s32 optlen;
__s32 retval;
+
+ __bpf_md_ptr(void *, user_optval);
+ __bpf_md_ptr(void *, user_optval_end);
+ __u32 flags;
I will follow up on the UAPI discussion in the other thread.
+};
+
+enum bpf_sockopt_flags {
+ /* optval is a pointer to user space memory */
+ BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
};
struct bpf_pidns_info {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 5b2741aa0d9b..b268bbfa6c53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -38,15 +38,26 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
+ bool do_sleepable;
u32 func_ret;
+ do_sleepable =
+ atype == CGROUP_SETSOCKOPT || atype == CGROUP_GETSOCKOPT;
+
run_ctx.retval = retval;
migrate_disable();
- rcu_read_lock();
+ if (do_sleepable) {
+ might_fault();
+ rcu_read_lock_trace();
+ } else
+ rcu_read_lock();
array = rcu_dereference(cgrp->effective[atype]);
The array is now under rcu_read_lock_trace() for the "do_sleepable" case.
The array free side should also be changed to wait for the tasks_trace gp.
Please check if any bpf_prog_array_free() usages in cgroup.c should be
replaced with bpf_prog_array_free_sleepable().
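Roughly, on the update side (illustrative sketch only; each call site
needs to be checked individually):

	/* Once readers may walk the array under rcu_read_lock_trace(),
	 * the old copy must also wait for the tasks_trace gp (in
	 * addition to the regular RCU gp) before it is freed.
	 */
	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], new_array,
					lockdep_is_held(&cgroup_mutex));
	/* was: bpf_prog_array_free(old_array); */
	bpf_prog_array_free_sleepable(old_array);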
Another high level comment: other cgroup hooks may get sleepable support
in the future, in particular SEC("lsm_cgroup"), considering that other lsm
hooks in SEC("lsm.s") already have sleepable support. Just bringing it up
here for awareness; this set can focus on get/setsockopt for now.
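For reference, sleepable lsm progs are already written like this on the
libbpf side, so a sleepable SEC("lsm_cgroup") would presumably follow the
same pattern (the hook and prog names here are just for illustration):

	SEC("lsm.s/file_open")
	int BPF_PROG(check_file_open, struct file *file)
	{
		/* sleepable context: may call sleepable helpers/kfuncs */
		return 0;
	}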
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
+ if (do_sleepable && !prog->aux->sleepable)
+ rcu_read_lock();
+
run_ctx.prog_item = item;
func_ret = run_prog(prog, ctx);
if (ret_flags) {
@@ -56,13 +67,43 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
item++;
+
+ if (do_sleepable && !prog->aux->sleepable)
+ rcu_read_unlock();
}
bpf_reset_run_ctx(old_run_ctx);
- rcu_read_unlock();
+ if (do_sleepable)
+ rcu_read_unlock_trace();
+ else
+ rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}
+static __always_inline bool
+has_only_sleepable_prog_cg(const struct cgroup_bpf *cgrp,
+ enum cgroup_bpf_attach_type atype)
+{
+ const struct bpf_prog_array_item *item;
+ const struct bpf_prog *prog;
+ int cnt = 0;
+ bool ret = true;
+
+ rcu_read_lock();
+ item = &rcu_dereference(cgrp->effective[atype])->items[0];
+ while (ret && (prog = READ_ONCE(item->prog))) {
+ if (!prog->aux->sleepable)
+ ret = false;
+ item++;
+ cnt++;
+ }
+ rcu_read_unlock();
+ if (cnt == 0)
+ ret = false;
+
+ return ret;
+}
+
unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
const struct bpf_insn *insn)
{
@@ -1773,7 +1814,8 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
struct bpf_sockopt_buf *buf)
{
- if (ctx->optval == buf->data)
+ if (ctx->optval == buf->data ||
+ ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
return;
kfree(ctx->optval);
}
@@ -1781,7 +1823,8 @@ static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
struct bpf_sockopt_buf *buf)
{
- return ctx->optval != buf->data;
+ return ctx->optval != buf->data &&
+ !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
}
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
@@ -1796,21 +1839,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
.optname = *optname,
};
int ret, max_optlen;
+ bool alloc_mem;
+
+ alloc_mem = !has_only_sleepable_prog_cg(&cgrp->bpf, CGROUP_SETSOCKOPT);
hmm... I am not sure this would work. The cgrp->effective[atype] array
could have changed after this has_only_sleepable_prog_cg() check. For
example, a non-sleepable prog could be attached right after the check, and
the later bpf_prog_run_array_cg() would then run with an incorrect ctx.
A quick thought: should this alloc decision be postponed into
bpf_prog_run_array_cg()? It may be better to have a separate
bpf_prog_run_array_cg() for set/getsockopt here, not sure.
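e.g. something like this inside bpf_prog_run_array_cg() (rough sketch
only; all_progs_sleepable() is a made-up helper for illustration):

	/* Make the decision under the same lock that dereferences the
	 * array, so the check and the run cannot race with an
	 * attach/detach in between.
	 */
	rcu_read_lock_trace();
	array = rcu_dereference(cgrp->effective[atype]);
	use_user_optval = all_progs_sleepable(array);	/* made-up helper */
	if (!use_user_optval) {
		/* fall back to the kernel buffer here, i.e.
		 * sockopt_alloc_buf() + copy_from_user()
		 */
	}
	/* ... run the progs against this same 'array' ... */
	rcu_read_unlock_trace();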
+ if (!alloc_mem) {
+ max_optlen = *optlen;
+ ctx.optlen = *optlen;
+ ctx.user_optval = optval;
+ ctx.user_optval_end = optval + *optlen;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ } else {
+ /* Allocate a bit more than the initial user buffer for
+ * BPF program. The canonical use case is overriding
+ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+ */
+ max_optlen = max_t(int, 16, *optlen);
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+ if (max_optlen < 0)
+ return max_optlen;
- /* Allocate a bit more than the initial user buffer for
- * BPF program. The canonical use case is overriding
- * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
- */
- max_optlen = max_t(int, 16, *optlen);
- max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
- if (max_optlen < 0)
- return max_optlen;
-
- ctx.optlen = *optlen;
+ ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
- ret = -EFAULT;
- goto out;
+ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
}