[RFC bpf-next v4 5/6] bpf: Add a new dynptr type for CGROUP_SOCKOPT

From: Kui-Feng Lee <thinker.li@xxxxxxxxx>

The new dynptr type (BPF_DYNPTR_TYPE_CGROUP_SOCKOPT) will be used by BPF
programs to create a buffer that can be installed on the ctx to replace
the existing optval or user_optval.

Installation is only allowed if ctx->flags &
BPF_SOCKOPT_FLAG_OPTVAL_REPLACE is set. It is enabled only for sleepable
programs on the cgroup/setsockopt hook.  BPF programs can install a new
buffer held by a dynptr to increase the size of the optval passed to
setsockopt().

Installation is not enabled for cgroup/getsockopt since a buffer created
by the user program cannot be enlarged to return data from
getsockopt().
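
For illustration, a sleepable cgroup/setsockopt program could use the new
kfuncs roughly as below. This is a minimal sketch: the __ksym declarations,
the "cgroup/setsockopt.s" section name and the option value are assumptions
made for the example, and installing a replacement buffer with
bpf_sockopt_dynptr_install() (added elsewhere in this series) is not shown.

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern int bpf_dynptr_from_sockopt(struct bpf_sockopt *sopt,
                                     struct bpf_dynptr *ptr__uninit) __ksym;
  extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt,
                                        struct bpf_dynptr *ptr) __ksym;
  extern int bpf_sockopt_grow_to(struct bpf_sockopt *sopt, __u32 newsize) __ksym;

  char _license[] SEC("license") = "GPL";

  SEC("cgroup/setsockopt.s")
  int grow_and_fill(struct bpf_sockopt *ctx)
  {
          struct bpf_dynptr optval;
          __u32 val = 1;  /* example option value */
          int size;

          /* Grow the optval buffer so it can hold the value we want to
           * pass down to setsockopt().
           */
          if (bpf_sockopt_grow_to(ctx, sizeof(val)))
                  return 1;

          /* Get a dynptr over the (possibly user-backed) optval buffer;
           * it returns the buffer size on success.
           */
          size = bpf_dynptr_from_sockopt(ctx, &optval);
          if (size >= (int)sizeof(val))
                  bpf_dynptr_write(&optval, 0, &val, sizeof(val), 0);

          /* The dynptr is refcounted; release it before returning. */
          bpf_sockopt_dynptr_release(ctx, &optval);
          return 1;
  }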

Signed-off-by: Kui-Feng Lee <thinker.li@xxxxxxxxx>
---
 include/linux/bpf.h    |   7 ++-
 include/linux/filter.h |   4 ++
 kernel/bpf/btf.c       |   3 +
 kernel/bpf/cgroup.c    |   5 +-
 kernel/bpf/helpers.c   | 140 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c  |  38 ++++++++++-
 6 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 40a3d392b7f1..aad34298bfd3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -663,12 +663,15 @@ enum bpf_type_flag {
 	/* DYNPTR points to xdp_buff */
 	DYNPTR_TYPE_XDP		= BIT(16 + BPF_BASE_TYPE_BITS),
 
+	/* DYNPTR points to optval buffer of bpf_sockopt */
+	DYNPTR_TYPE_CGROUP_SOCKOPT = BIT(17 + BPF_BASE_TYPE_BITS),
+
 	__BPF_TYPE_FLAG_MAX,
 	__BPF_TYPE_LAST_FLAG	= __BPF_TYPE_FLAG_MAX - 1,
 };
 
 #define DYNPTR_TYPE_FLAG_MASK	(DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
-				 | DYNPTR_TYPE_XDP)
+				 | DYNPTR_TYPE_XDP | DYNPTR_TYPE_CGROUP_SOCKOPT)
 
 /* Max number of base types. */
 #define BPF_BASE_TYPE_LIMIT	(1UL << BPF_BASE_TYPE_BITS)
@@ -1208,6 +1211,8 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_SKB,
 	/* Underlying data is a xdp_buff */
 	BPF_DYNPTR_TYPE_XDP,
+	/* Underlying data is for the optval of a cgroup sock */
+	BPF_DYNPTR_TYPE_CGROUP_SOCKOPT,
 };
 
 int bpf_dynptr_check_size(u32 size);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2aa2a96526de..df12fddd2f21 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1347,6 +1347,10 @@ struct bpf_sockopt_kern {
 enum bpf_sockopt_kern_flags {
 	/* optval is a pointer to user space memory */
 	BPF_SOCKOPT_FLAG_OPTVAL_USER    = (1U << 0),
+	/* able to install new optval */
+	BPF_SOCKOPT_FLAG_OPTVAL_REPLACE	= (1U << 1),
+	/* optval is referenced by a dynptr */
+	BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR	= (1U << 2),
 };
 
 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 249657c466dd..6d6a040688be 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_SOCKET_FILTER,
 	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_NETFILTER,
+	BTF_KFUNC_HOOK_CGROUP_SOCKOPT,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -7846,6 +7847,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_LWT;
 	case BPF_PROG_TYPE_NETFILTER:
 		return BTF_KFUNC_HOOK_NETFILTER;
+	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+		return BTF_KFUNC_HOOK_CGROUP_SOCKOPT;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 1b2006dac4d5..b27a4fbc6ffe 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1866,6 +1866,8 @@ static int filter_setsockopt_progs_cb(void *arg,
 	if (max_optlen < 0)
 		return max_optlen;
 
+	ctx->flags = BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
+
 	if (copy_from_user(ctx->optval, optval,
 			   min(ctx->optlen, max_optlen)) != 0)
 		return -EFAULT;
@@ -1894,7 +1896,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
 	ctx.optlen = *optlen;
 	ctx.optval = optval;
 	ctx.optval_end = optval + *optlen;
-	ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER |
+		BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
 
 	lock_sock(sk);
 	ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index eb91cae0612a..5be1fd9e64f3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1519,6 +1519,51 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
 	.arg4_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
 };
 
+static int __bpf_sockopt_store_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+				     void *src, u32 len)
+{
+	int buf_len, err;
+	void *buf;
+
+	if (!src)
+		return 0;
+
+	if (sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		if (!(sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_REPLACE))
+			return copy_to_user(sopt->optval + offset, src, len) ?
+				-EFAULT : 0;
+		buf_len = sopt->optval_end - sopt->optval;
+		buf = kmalloc(buf_len, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		err = copy_from_user(buf, sopt->optval, buf_len) ? -EFAULT : 0;
+		if (err < 0) {
+			kfree(buf);
+			return err;
+		}
+		sopt->optval = buf;
+	sopt->optval_end = buf + buf_len;
+		sopt->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+		memcpy(buf + offset, src, len);
+	}
+
+	memcpy(sopt->optval + offset, src, len);
+
+	return 0;
+}
+
+static int __bpf_sockopt_load_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+				    void *dst, u32 len)
+{
+	if (sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER)
+		return copy_from_user(dst, sopt->optval + offset, len) ?
+			-EFAULT : 0;
+
+	memcpy(dst, sopt->optval + offset, len);
+
+	return 0;
+}
+
 BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
 	   u32, offset, u64, flags)
 {
@@ -1547,6 +1592,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
 		return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
 	case BPF_DYNPTR_TYPE_XDP:
 		return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return __bpf_sockopt_load_bytes(src->data, src->offset + offset, dst, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1597,6 +1644,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
 		if (flags)
 			return -EINVAL;
 		return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return __bpf_sockopt_store_bytes(dst->data,
+						 dst->offset + offset,
+						 src, len);
 	default:
 		WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
 		return -EFAULT;
@@ -1634,6 +1685,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
 	switch (type) {
 	case BPF_DYNPTR_TYPE_LOCAL:
 	case BPF_DYNPTR_TYPE_RINGBUF:
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
 		return (unsigned long)(ptr->data + ptr->offset + offset);
 	case BPF_DYNPTR_TYPE_SKB:
 	case BPF_DYNPTR_TYPE_XDP:
@@ -2278,6 +2330,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
 		bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
 		return buffer__opt;
 	}
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return NULL;
 	default:
 		WARN_ONCE(true, "unknown dynptr type %d\n", type);
 		return NULL;
@@ -2429,6 +2483,80 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
 	rcu_read_unlock();
 }
 
+__bpf_kfunc int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt,
+					   struct bpf_dynptr_kern *ptr)
+{
+	bpf_dynptr_set_null(ptr);
+	return 0;
+}
+
+/* Initialize a sockopt dynptr from a user or installed optval pointer.
+ *
+ * sopt->optval can be a user pointer or a kernel pointer. A kernel pointer
+ * can be a buffer allocated by the caller of the BPF program or a buffer
+ * installed by other BPF programs through bpf_sockopt_dynptr_install().
+ *
+ * At most one dynptr created by this function may exist at any moment, or
+ * it will return -EINVAL. Another dynptr can be created by this function
+ * after releasing the previous one with bpf_sockopt_dynptr_release().
+ *
+ * A dynptr initialized while optval is a user pointer is an exception.
+ * In that case, the dynptr points to a kernel buffer holding the same
+ * content as the user buffer. To keep the code simple, users should
+ * always make sure that only one dynptr initialized by this function
+ * exists at any moment.
+ */
+__bpf_kfunc int bpf_dynptr_from_sockopt(struct bpf_sockopt *sopt,
+					struct bpf_dynptr_kern *ptr__uninit)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+	unsigned int size;
+
+	size = sopt_kern->optval_end - sopt_kern->optval;
+
+	bpf_dynptr_init(ptr__uninit, sopt,
+			BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0,
+			size);
+
+	return size;
+}
+
+__bpf_kfunc int bpf_sockopt_grow_to(struct bpf_sockopt *sopt,
+				    u32 newsize)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+	void *newoptval;
+	int err;
+
+	if (newsize > DYNPTR_MAX_SIZE)
+		return -EINVAL;
+
+	if (newsize <= sopt_kern->optlen)
+		return 0;
+
+	if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		newoptval = kmalloc(newsize, GFP_KERNEL);
+		if (!newoptval)
+			return -ENOMEM;
+		err = copy_from_user(newoptval, sopt_kern->optval,
+				     sopt_kern->optval_end - sopt_kern->optval) ? -EFAULT : 0;
+		if (err < 0) {
+			kfree(newoptval);
+			return err;
+		}
+		sopt_kern->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		newoptval = krealloc(sopt_kern->optval, newsize, GFP_KERNEL);
+		if (!newoptval)
+			return -ENOMEM;
+	}
+
+	sopt_kern->optval = newoptval;
+	sopt_kern->optval_end = newoptval + newsize;
+
+	return 0;
+}
+
 __diag_pop();
 
 BTF_SET8_START(generic_btf_ids)
@@ -2494,6 +2622,17 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
 	.set   = &common_btf_ids,
 };
 
+BTF_SET8_START(cgroup_common_btf_ids)
+BTF_ID_FLAGS(func, bpf_sockopt_dynptr_release, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_sockopt, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_sockopt_grow_to, KF_SLEEPABLE)
+BTF_SET8_END(cgroup_common_btf_ids)
+
+static const struct btf_kfunc_id_set cgroup_kfunc_set = {
+	.owner	= THIS_MODULE,
+	.set	= &cgroup_common_btf_ids,
+};
+
 static int __init kfunc_init(void)
 {
 	int ret;
@@ -2513,6 +2652,7 @@ static int __init kfunc_init(void)
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCKOPT, &cgroup_kfunc_set);
 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
 						  ARRAY_SIZE(generic_dtors),
 						  THIS_MODULE);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83731e998b09..15119ff90bff 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -755,6 +755,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
 		return "skb";
 	case BPF_DYNPTR_TYPE_XDP:
 		return "xdp";
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return "cgroup_sockopt";
 	case BPF_DYNPTR_TYPE_INVALID:
 		return "<invalid>";
 	default:
@@ -836,6 +838,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
 		return BPF_DYNPTR_TYPE_SKB;
 	case DYNPTR_TYPE_XDP:
 		return BPF_DYNPTR_TYPE_XDP;
+	case DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
 	default:
 		return BPF_DYNPTR_TYPE_INVALID;
 	}
@@ -852,6 +856,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
 		return DYNPTR_TYPE_SKB;
 	case BPF_DYNPTR_TYPE_XDP:
 		return DYNPTR_TYPE_XDP;
+	case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+		return DYNPTR_TYPE_CGROUP_SOCKOPT;
 	default:
 		return 0;
 	}
@@ -859,7 +865,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
 
 static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
 {
-	return type == BPF_DYNPTR_TYPE_RINGBUF;
+	return type == BPF_DYNPTR_TYPE_RINGBUF ||
+		type == BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
 }
 
 static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -10300,6 +10307,8 @@ enum special_kfunc_type {
 	KF_bpf_dynptr_slice,
 	KF_bpf_dynptr_slice_rdwr,
 	KF_bpf_dynptr_clone,
+	KF_bpf_sockopt_dynptr_release,
+	KF_bpf_dynptr_from_sockopt,
 };
 
 BTF_SET_START(special_kfunc_set)
@@ -10320,6 +10329,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
 BTF_SET_END(special_kfunc_set)
 
 BTF_ID_LIST(special_kfunc_list)
@@ -10342,6 +10353,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
 BTF_ID(func, bpf_dynptr_slice)
 BTF_ID(func, bpf_dynptr_slice_rdwr)
 BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
 
 static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -10995,6 +11008,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 			arg_type |= OBJ_RELEASE;
 			break;
 		case KF_ARG_PTR_TO_DYNPTR:
+			if (meta->func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release]) {
+				int ref_obj_id = dynptr_ref_obj_id(env, reg);
+
+				if (ref_obj_id < 0) {
+					verbose(env, "R%d is not a valid dynptr\n", regno);
+					return -EINVAL;
+				}
+
+				/* Required by check_func_arg_reg_off() */
+				arg_type |= ARG_PTR_TO_DYNPTR | OBJ_RELEASE;
+				meta->release_regno = regno;
+			}
+			break;
 		case KF_ARG_PTR_TO_ITER:
 		case KF_ARG_PTR_TO_LIST_HEAD:
 		case KF_ARG_PTR_TO_LIST_NODE:
@@ -11082,6 +11108,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
 					verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
 					return -EFAULT;
 				}
+			} else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_sockopt] &&
+				   (dynptr_arg_type & MEM_UNINIT)) {
+				dynptr_arg_type |= DYNPTR_TYPE_CGROUP_SOCKOPT;
 			}
 
 			ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
@@ -11390,7 +11419,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
 	if (meta.release_regno) {
-		err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+		verbose(env, "release refcounted PTR_TO_BTF_ID %s\n",
+			meta.func_name);
+		if (meta.func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release])
+			err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+		else
+			err = release_reference(env, regs[meta.release_regno].ref_obj_id);
 		if (err) {
 			verbose(env, "kfunc %s#%d reference has not been acquired before\n",
 				func_name, meta.func_id);
-- 
2.34.1




