[RFC PATCH 01/16] bpf: Introduce BPF_PROG_TYPE_CRIB

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch adds a new BPF program type CRIB (Checkpoint/Restore In eBPF)
for checkpointing/restoring processes through eBPF.

CRIB BPF programs are not attached to any hook. They are executed via
BPF_PROG_RUN and serve as eBPF APIs that userspace programs call to
dump/restore process information.

CRIB BPF programs dump/restore process information through CRIB
kfunc APIs.

Signed-off-by: Juntong Deng <juntong.deng@xxxxxxxxxxx>
---
 include/linux/bpf_crib.h         |  16 +++++
 include/linux/bpf_types.h        |   4 ++
 include/uapi/linux/bpf.h         |   1 +
 kernel/bpf/Kconfig               |   2 +
 kernel/bpf/Makefile              |   2 +
 kernel/bpf/btf.c                 |   4 ++
 kernel/bpf/crib/Kconfig          |  14 ++++
 kernel/bpf/crib/Makefile         |   3 +
 kernel/bpf/crib/bpf_checkpoint.c |  13 ++++
 kernel/bpf/crib/bpf_crib.c       | 109 +++++++++++++++++++++++++++++++
 kernel/bpf/crib/bpf_restore.c    |  13 ++++
 kernel/bpf/helpers.c             |   1 +
 kernel/bpf/syscall.c             |   1 +
 tools/include/uapi/linux/bpf.h   |   1 +
 tools/lib/bpf/libbpf.c           |   2 +
 tools/lib/bpf/libbpf_probes.c    |   1 +
 16 files changed, 187 insertions(+)
 create mode 100644 include/linux/bpf_crib.h
 create mode 100644 kernel/bpf/crib/Kconfig
 create mode 100644 kernel/bpf/crib/Makefile
 create mode 100644 kernel/bpf/crib/bpf_checkpoint.c
 create mode 100644 kernel/bpf/crib/bpf_crib.c
 create mode 100644 kernel/bpf/crib/bpf_restore.c

diff --git a/include/linux/bpf_crib.h b/include/linux/bpf_crib.h
new file mode 100644
index 000000000000..f667b740fcc2
--- /dev/null
+++ b/include/linux/bpf_crib.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Checkpoint/Restore In eBPF (CRIB)
+ *
+ * Author:
+ *	Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+#ifndef _BPF_CRIB_H
+#define _BPF_CRIB_H
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+
+#endif /* _BPF_CRIB_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 9f2a6b83b49e..a6feddfd17e2 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
 BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
 	      struct bpf_nf_ctx, struct bpf_nf_ctx)
 #endif
+#ifdef CONFIG_BPF_CRIB
+BPF_PROG_TYPE(BPF_PROG_TYPE_CRIB, bpf_crib,
+	      void *, void *)
+#endif /* CONFIG_BPF_CRIB */
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 35bcf52dbc65..cb67a9cad8c6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1055,6 +1055,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_CRIB,
 	__MAX_BPF_PROG_TYPE
 };
 
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 17067dcb4386..a129677a03e3 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -101,4 +101,6 @@ config BPF_LSM
 
 	  If you are unsure how to answer this question, answer N.
 
+source "kernel/bpf/crib/Kconfig"
+
 endmenu # "BPF subsystem"
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0291eef9ce92..8c350d159d3c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -58,3 +58,5 @@ vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf
 
 $(obj)/%.o: %.c FORCE
 	$(call if_changed_rule,cc_o_c)
+
+obj-$(CONFIG_BPF_CRIB) += crib/
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4ff11779699e..306349ee3d6a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -219,6 +219,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_NETFILTER,
 	BTF_KFUNC_HOOK_KPROBE,
+	BTF_KFUNC_HOOK_CRIB,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -6037,6 +6038,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
 	case BPF_PROG_TYPE_TRACEPOINT:
 	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_CRIB:
 		return 0; /* anything goes */
 	default:
 		break;
@@ -8326,6 +8328,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_NETFILTER;
 	case BPF_PROG_TYPE_KPROBE:
 		return BTF_KFUNC_HOOK_KPROBE;
+	case BPF_PROG_TYPE_CRIB:
+		return BTF_KFUNC_HOOK_CRIB;
 	default:
 		return BTF_KFUNC_HOOK_MAX;
 	}
diff --git a/kernel/bpf/crib/Kconfig b/kernel/bpf/crib/Kconfig
new file mode 100644
index 000000000000..346304f65db6
--- /dev/null
+++ b/kernel/bpf/crib/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config BPF_CRIB
+	bool "Checkpoint/Restore In eBPF (CRIB)"
+	depends on BPF_SYSCALL
+	depends on BPF_JIT
+	depends on DEBUG_INFO_BTF
+	help
+	  Enable CRIB (Checkpoint/Restore In eBPF), which allows
+	  checkpointing/restoring of processes through BPF programs.
+
+	  Compared to procfs and system call interfaces, CRIB achieves
+	  higher performance and supports dumping/restoring more
+	  comprehensive process status information.
diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
new file mode 100644
index 000000000000..abd43c76140b
--- /dev/null
+++ b/kernel/bpf/crib/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_BPF_CRIB) += bpf_crib.o bpf_checkpoint.o bpf_restore.o
diff --git a/kernel/bpf/crib/bpf_checkpoint.c b/kernel/bpf/crib/bpf_checkpoint.c
new file mode 100644
index 000000000000..efaca6bcdfe4
--- /dev/null
+++ b/kernel/bpf/crib/bpf_checkpoint.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Checkpoint
+ *
+ * Author:
+ *	Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/crib/bpf_crib.c b/kernel/bpf/crib/bpf_crib.c
new file mode 100644
index 000000000000..9ef2d61955bf
--- /dev/null
+++ b/kernel/bpf/crib/bpf_crib.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Common
+ *
+ * Author:
+ *	Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+#include <linux/init.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_crib_kfuncs)
+/* No CRIB kfunc IDs are listed in this set yet. */
+BTF_KFUNCS_END(bpf_crib_kfuncs)
+
+/*
+ * BPF_PROG_RUN handler for CRIB programs: copies in the optional context,
+ * runs the program once, and stores its result in uattr->test.retval.
+ */
+static int bpf_prog_run_crib(struct bpf_prog *prog,
+			      const union bpf_attr *kattr,
+			      union bpf_attr __user *uattr)
+{
+	void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+	__u32 ctx_size_in = kattr->test.ctx_size_in;
+	void *ctx = NULL;
+	u32 retval;
+	int err = 0;
+
+	/* data_in/out, ctx_out, duration, repeat, flags, batch_size unsupported */
+	if (kattr->test.data_in || kattr->test.data_out ||
+	    kattr->test.ctx_out || kattr->test.duration ||
+	    kattr->test.repeat || kattr->test.flags ||
+	    kattr->test.batch_size)
+		return -EINVAL;
+
+	if (ctx_size_in < prog->aux->max_ctx_offset ||
+	    ctx_size_in > U16_MAX)
+		return -EINVAL;
+
+	if (ctx_size_in) {
+		ctx = memdup_user(ctx_in, ctx_size_in);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+	}
+
+	rcu_read_lock_trace();
+	retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+	rcu_read_unlock_trace();
+
+	if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+		err = -EFAULT;
+
+	kfree(ctx);	/* ctx stays NULL when no context was passed; kfree(NULL) is a no-op */
+	return err;
+}
+
+/*
+ * No CRIB-specific helpers: defer every func_id to bpf_base_func_proto().
+ */
+static const struct bpf_func_proto *
+bpf_crib_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+/*
+ * Verifier hook: allow only reads of the context at a non-negative offset
+ * below U16_MAX that is a multiple of the access size.
+ */
+static bool bpf_crib_is_valid_access(int off, int size,
+					 enum bpf_access_type type,
+					 const struct bpf_prog *prog,
+					 struct bpf_insn_access_aux *info)
+{
+	/*
+	 * Changing the context is not allowed, and all dumped data
+	 * is returned to userspace via ringbuf.
+	 */
+	if (type != BPF_READ || off < 0 || off >= U16_MAX)
+		return false;
+
+	return off % size == 0;
+}
+
+const struct bpf_prog_ops bpf_crib_prog_ops = {
+	.test_run = bpf_prog_run_crib,	/* entry point for BPF_PROG_RUN */
+};
+
+const struct bpf_verifier_ops bpf_crib_verifier_ops = {
+	.get_func_proto		= bpf_crib_func_proto,	/* base helpers only */
+	.is_valid_access	= bpf_crib_is_valid_access, /* read-only ctx */
+};
+
+static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_crib_kfuncs,	/* IDs between BTF_KFUNCS_START/END above */
+};
+
+static int __init bpf_crib_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &bpf_crib_kfunc_set);
+}
+
+late_initcall(bpf_crib_init);	/* registers bpf_crib_kfunc_set for BPF_PROG_TYPE_CRIB */
diff --git a/kernel/bpf/crib/bpf_restore.c b/kernel/bpf/crib/bpf_restore.c
new file mode 100644
index 000000000000..6bbb4b01e34b
--- /dev/null
+++ b/kernel/bpf/crib/bpf_restore.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Restore
+ *
+ * Author:
+ *	Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5241ba671c5a..bcd3ce9da00c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2986,6 +2986,7 @@ static int __init kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &generic_kfunc_set);
 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
 						  ARRAY_SIZE(generic_dtors),
 						  THIS_MODULE);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0719192a3482..faf99e53d706 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2633,6 +2633,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		return -EINVAL;
 	case BPF_PROG_TYPE_SYSCALL:
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_CRIB:
 		if (expected_attach_type)
 			return -EINVAL;
 		fallthrough;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 35bcf52dbc65..cb67a9cad8c6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1055,6 +1055,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
 	BPF_PROG_TYPE_NETFILTER,
+	BPF_PROG_TYPE_CRIB,
 	__MAX_BPF_PROG_TYPE
 };
 
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 30f121754d83..4e1451901b7d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -224,6 +224,7 @@ static const char * const prog_type_name[] = {
 	[BPF_PROG_TYPE_SK_LOOKUP]		= "sk_lookup",
 	[BPF_PROG_TYPE_SYSCALL]			= "syscall",
 	[BPF_PROG_TYPE_NETFILTER]		= "netfilter",
+	[BPF_PROG_TYPE_CRIB]			= "crib",
 };
 
 static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -9449,6 +9450,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("struct_ops.s+",	STRUCT_OPS, 0, SEC_SLEEPABLE),
 	SEC_DEF("sk_lookup",		SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
 	SEC_DEF("netfilter",		NETFILTER, BPF_NETFILTER, SEC_NONE),
+	SEC_DEF("crib",			CRIB, 0, SEC_NONE),
 };
 
 int libbpf_register_prog_handler(const char *sec,
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 9dfbe7750f56..2e087280c5f0 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -180,6 +180,7 @@ static int probe_prog_load(enum bpf_prog_type prog_type,
 	case BPF_PROG_TYPE_SK_REUSEPORT:
 	case BPF_PROG_TYPE_FLOW_DISSECTOR:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CRIB:
 		break;
 	case BPF_PROG_TYPE_NETFILTER:
 		opts.expected_attach_type = BPF_NETFILTER;
-- 
2.39.2





[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux