This patch adds a new BPF program type CRIB (Checkpoint/Restore In eBPF) for checkpointing/restoring processes through eBPF. CRIB BPF programs are not attached to any hooks, run through BPF_PROG_RUN, and are called by userspace programs as eBPF APIs for dumping/restoring process information. CRIB BPF programs dump/restore process information through CRIB kfunc APIs. Signed-off-by: Juntong Deng <juntong.deng@xxxxxxxxxxx> --- include/linux/bpf_crib.h | 16 +++++ include/linux/bpf_types.h | 4 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/Kconfig | 2 + kernel/bpf/Makefile | 2 + kernel/bpf/btf.c | 4 ++ kernel/bpf/crib/Kconfig | 14 ++++ kernel/bpf/crib/Makefile | 3 + kernel/bpf/crib/bpf_checkpoint.c | 13 ++++ kernel/bpf/crib/bpf_crib.c | 109 +++++++++++++++++++++++++++++++ kernel/bpf/crib/bpf_restore.c | 13 ++++ kernel/bpf/helpers.c | 1 + kernel/bpf/syscall.c | 1 + tools/include/uapi/linux/bpf.h | 1 + tools/lib/bpf/libbpf.c | 2 + tools/lib/bpf/libbpf_probes.c | 1 + 16 files changed, 187 insertions(+) create mode 100644 include/linux/bpf_crib.h create mode 100644 kernel/bpf/crib/Kconfig create mode 100644 kernel/bpf/crib/Makefile create mode 100644 kernel/bpf/crib/bpf_checkpoint.c create mode 100644 kernel/bpf/crib/bpf_crib.c create mode 100644 kernel/bpf/crib/bpf_restore.c diff --git a/include/linux/bpf_crib.h b/include/linux/bpf_crib.h new file mode 100644 index 000000000000..f667b740fcc2 --- /dev/null +++ b/include/linux/bpf_crib.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Checkpoint/Restore In eBPF (CRIB) + * + * Author: + * Juntong Deng <juntong.deng@xxxxxxxxxxx> + */ +#ifndef _BPF_CRIB_H +#define _BPF_CRIB_H + +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> +#include <linux/filter.h> + +#endif /* _BPF_CRIB_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 9f2a6b83b49e..a6feddfd17e2 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -83,6 +83,10 @@ 
BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, struct bpf_nf_ctx, struct bpf_nf_ctx) #endif +#ifdef CONFIG_BPF_CRIB +BPF_PROG_TYPE(BPF_PROG_TYPE_CRIB, bpf_crib, + void *, void *) +#endif /* CONFIG_BPF_CRIB */ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 35bcf52dbc65..cb67a9cad8c6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1055,6 +1055,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + BPF_PROG_TYPE_CRIB, __MAX_BPF_PROG_TYPE }; diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index 17067dcb4386..a129677a03e3 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -101,4 +101,6 @@ config BPF_LSM If you are unsure how to answer this question, answer N. +source "kernel/bpf/crib/Kconfig" + endmenu # "BPF subsystem" diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 0291eef9ce92..8c350d159d3c 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -58,3 +58,5 @@ vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf $(obj)/%.o: %.c FORCE $(call if_changed_rule,cc_o_c) + +obj-$(CONFIG_BPF_CRIB) += crib/ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4ff11779699e..306349ee3d6a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -219,6 +219,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_LWT, BTF_KFUNC_HOOK_NETFILTER, BTF_KFUNC_HOOK_KPROBE, + BTF_KFUNC_HOOK_CRIB, BTF_KFUNC_HOOK_MAX, }; @@ -6037,6 +6038,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct case BPF_PROG_TYPE_TRACEPOINT: case BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_CRIB: return 0; /* anything goes */ default: break; @@ -8326,6 +8328,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) return 
BTF_KFUNC_HOOK_NETFILTER; case BPF_PROG_TYPE_KPROBE: return BTF_KFUNC_HOOK_KPROBE; + case BPF_PROG_TYPE_CRIB: + return BTF_KFUNC_HOOK_CRIB; default: return BTF_KFUNC_HOOK_MAX; } diff --git a/kernel/bpf/crib/Kconfig b/kernel/bpf/crib/Kconfig new file mode 100644 index 000000000000..346304f65db6 --- /dev/null +++ b/kernel/bpf/crib/Kconfig @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config BPF_CRIB + bool "Checkpoint/Restore In eBPF (CRIB)" + depends on BPF_SYSCALL + depends on BPF_JIT + depends on DEBUG_INFO_BTF + help + Enable CRIB (Checkpoint/Restore In eBPF), which allows + checkpointing/restoring of processes through BPF programs. + + Compared to procfs and system call interfaces, CRIB achieves + higher performance and supports dumping/restoring more + comprehensive process status information. diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile new file mode 100644 index 000000000000..abd43c76140b --- /dev/null +++ b/kernel/bpf/crib/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_BPF_CRIB) += bpf_crib.o bpf_checkpoint.o bpf_restore.o diff --git a/kernel/bpf/crib/bpf_checkpoint.c b/kernel/bpf/crib/bpf_checkpoint.c new file mode 100644 index 000000000000..efaca6bcdfe4 --- /dev/null +++ b/kernel/bpf/crib/bpf_checkpoint.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Checkpoint/Restore In eBPF (CRIB): Checkpoint + * + * Author: + * Juntong Deng <juntong.deng@xxxxxxxxxxx> + */ + +#include <linux/bpf_crib.h> + +__bpf_kfunc_start_defs(); + +__bpf_kfunc_end_defs(); diff --git a/kernel/bpf/crib/bpf_crib.c b/kernel/bpf/crib/bpf_crib.c new file mode 100644 index 000000000000..9ef2d61955bf --- /dev/null +++ b/kernel/bpf/crib/bpf_crib.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Checkpoint/Restore In eBPF (CRIB): Common + * + * Author: + * Juntong Deng <juntong.deng@xxxxxxxxxxx> + */ + +#include <linux/bpf_crib.h> +#include <linux/init.h> + +__bpf_kfunc_start_defs(); + 
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_crib_kfuncs)
+
+BTF_KFUNCS_END(bpf_crib_kfuncs)
+
+/* BPF_PROG_RUN entry point: run @prog once on a private copy of ctx_in. */
+static int bpf_prog_run_crib(struct bpf_prog *prog,
+			     const union bpf_attr *kattr,
+			     union bpf_attr __user *uattr)
+{
+	void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+	__u32 ctx_size_in = kattr->test.ctx_size_in;
+	void *ctx = NULL;
+	u32 retval;
+	int err = 0;
+
+	/* no data_in/out, ctx_out, duration, repeat, flags or batch_size support */
+	if (kattr->test.data_in || kattr->test.data_out ||
+	    kattr->test.ctx_out || kattr->test.duration ||
+	    kattr->test.repeat || kattr->test.flags ||
+	    kattr->test.batch_size)
+		return -EINVAL;
+
+	if (ctx_size_in < prog->aux->max_ctx_offset ||
+	    ctx_size_in > U16_MAX)
+		return -EINVAL;
+
+	if (ctx_size_in) {
+		ctx = memdup_user(ctx_in, ctx_size_in);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+	}
+
+	rcu_read_lock_trace();
+	retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+	rcu_read_unlock_trace();
+
+	if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+		err = -EFAULT;
+
+	/* ctx stays NULL when ctx_size_in is 0, and kfree(NULL) is a no-op */
+	kfree(ctx);
+
+	return err;
+}
+
+/* CRIB adds no helpers of its own; every ID is resolved by the base set. */
+static const struct bpf_func_proto *
+bpf_crib_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	default:
+		return bpf_base_func_proto(func_id, prog);
+	}
+}
+
+static bool bpf_crib_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
+				     struct bpf_insn_access_aux *info)
+{
+	/*
+	 * The context is read-only (dumped data is returned to userspace via
+	 * ringbuf), and reads must be size-aligned at an offset below U16_MAX.
+	 */
+	if (type != BPF_READ)
+		return false;
+	if (off < 0 || off >= U16_MAX)
+		return false;
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+const struct bpf_prog_ops bpf_crib_prog_ops = {
+	.test_run = bpf_prog_run_crib,
+};
+
+const struct bpf_verifier_ops bpf_crib_verifier_ops = {
+	.get_func_proto = bpf_crib_func_proto,
+	.is_valid_access = bpf_crib_is_valid_access,
+};
+
+static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_crib_kfuncs,
+};
+
+static int __init bpf_crib_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &bpf_crib_kfunc_set);
+}
+
+late_initcall(bpf_crib_init);
diff --git a/kernel/bpf/crib/bpf_restore.c b/kernel/bpf/crib/bpf_restore.c
new file mode 100644
index 000000000000..6bbb4b01e34b
--- /dev/null
+++ b/kernel/bpf/crib/bpf_restore.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Restore
+ *
+ * Author:
+ *	Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5241ba671c5a..bcd3ce9da00c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2986,6 +2986,7 @@ static int __init kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &generic_kfunc_set);
 	ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
 						  ARRAY_SIZE(generic_dtors),
 						  THIS_MODULE);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0719192a3482..faf99e53d706 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2633,6 +2633,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
 		return -EINVAL;
 	case
BPF_PROG_TYPE_SYSCALL: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_CRIB: if (expected_attach_type) return -EINVAL; fallthrough; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 35bcf52dbc65..cb67a9cad8c6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1055,6 +1055,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + BPF_PROG_TYPE_CRIB, __MAX_BPF_PROG_TYPE }; diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 30f121754d83..4e1451901b7d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -224,6 +224,7 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", [BPF_PROG_TYPE_SYSCALL] = "syscall", [BPF_PROG_TYPE_NETFILTER] = "netfilter", + [BPF_PROG_TYPE_CRIB] = "crib", }; static int __base_pr(enum libbpf_print_level level, const char *format, @@ -9449,6 +9450,7 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), + SEC_DEF("crib", CRIB, 0, SEC_NONE), }; int libbpf_register_prog_handler(const char *sec, diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index 9dfbe7750f56..2e087280c5f0 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -180,6 +180,7 @@ static int probe_prog_load(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CRIB: break; case BPF_PROG_TYPE_NETFILTER: opts.expected_attach_type = BPF_NETFILTER; -- 2.39.2