Given a loaded dumper bpf program, which already knows which target it should bind to, there are two ways to create a dumper: - a file based dumper under the hierarchy of /sys/kernel/bpfdump/ which users can "cat" to print out the output. - an anonymous dumper from which a user application can "read" the dumping output. For a file based dumper, the BPF_OBJ_PIN syscall interface is used. For an anonymous dumper, the BPF_PROG_ATTACH syscall interface is used. To make it easy for the target seq_ops->show() to get the bpf program, dumper creation increases the target-provided seq_file private data size so that the bpf program pointer is also stored in the seq_file private data. Further, a seq_num which represents how many times bpf_dump_get_prog() has been called is also available to the target seq_ops->show(). Such information can be used to e.g., print a banner before printing out actual data. Note that the seq_num does not represent the number of unique kernel objects the bpf program has seen, but it should be a good approximation. A target feature BPF_DUMP_SEQ_NET_PRIVATE is implemented, which is specifically useful for net based dumpers. It sets the net namespace to the current process's net namespace. This avoids changing existing net seq_ops in order to retrieve the net namespace from the seq_file pointer. For open dumper files, anonymous or not, the fdinfo will show the target and prog_id associated with that file descriptor. For the dumper file itself, a kernel interface will be provided to retrieve the prog_id in one of the later patches. 
Signed-off-by: Yonghong Song <yhs@xxxxxx> --- include/linux/bpf.h | 5 + include/uapi/linux/bpf.h | 6 +- kernel/bpf/dump.c | 338 ++++++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 11 +- tools/include/uapi/linux/bpf.h | 6 +- 5 files changed, 362 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 44268d36d901..8171e01ff4be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1110,10 +1110,15 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd); int bpf_obj_pin_user(u32 ufd, const char __user *pathname); int bpf_obj_get_user(const char __user *pathname, int flags); +#define BPF_DUMP_SEQ_NET_PRIVATE BIT(0) + int bpf_dump_reg_target(const char *target, const char *target_proto, const struct seq_operations *seq_ops, u32 seq_priv_size, u32 target_feature); int bpf_dump_set_target_info(u32 target_fd, struct bpf_prog *prog); +int bpf_dump_create(u32 prog_fd, const char __user *dumper_name); +struct bpf_prog *bpf_dump_get_prog(struct seq_file *seq, u32 priv_data_size, + u64 *seq_num); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f1cbed446c1..b51d56fc77f9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -354,6 +354,7 @@ enum { /* Flags for accessing BPF object from syscall side. 
*/ BPF_F_RDONLY = (1U << 3), BPF_F_WRONLY = (1U << 4), + BPF_F_DUMP = (1U << 5), /* Flag for stack_map, store build_id+offset instead of pointer */ BPF_F_STACK_BUILD_ID = (1U << 5), @@ -481,7 +482,10 @@ union bpf_attr { }; struct { /* anonymous struct used by BPF_OBJ_* commands */ - __aligned_u64 pathname; + union { + __aligned_u64 pathname; + __aligned_u64 dumper_name; + }; __u32 bpf_fd; __u32 file_flags; }; diff --git a/kernel/bpf/dump.c b/kernel/bpf/dump.c index 1091affe8b3f..ac6856abb711 100644 --- a/kernel/bpf/dump.c +++ b/kernel/bpf/dump.c @@ -30,22 +30,173 @@ struct bpfdump_targets { struct mutex dumper_mutex; }; +struct dumper_inode_info { + struct bpfdump_target_info *tinfo; + struct bpf_prog *prog; +}; + +struct dumper_info { + struct list_head list; + /* file to identify an anon dumper, + * dentry to identify a file dumper. + */ + union { + struct file *file; + struct dentry *dentry; + }; + struct bpfdump_target_info *tinfo; + struct bpf_prog *prog; +}; + +struct dumpers { + struct list_head dumpers; + struct mutex dumper_mutex; +}; + +struct extra_priv_data { + struct bpf_prog *prog; + u64 seq_num; +}; + /* registered dump targets */ static struct bpfdump_targets dump_targets; static struct dentry *bpfdump_dentry; +static struct dumpers anon_dumpers, file_dumpers; + +static const struct file_operations bpf_dumper_ops; +static const struct inode_operations bpf_dir_iops; + +static struct dentry *bpfdump_add_file(const char *name, struct dentry *parent, + const struct file_operations *f_ops, + void *data); static struct dentry *bpfdump_add_dir(const char *name, struct dentry *parent, const struct inode_operations *i_ops, void *data); static int __bpfdump_init(void); +static u32 get_total_priv_dsize(u32 old_size) +{ + return roundup(old_size, 8) + sizeof(struct extra_priv_data); +} + +static void *get_extra_priv_dptr(void *old_ptr, u32 old_size) +{ + return old_ptr + roundup(old_size, 8); +} + +#ifdef CONFIG_PROC_FS +static void dumper_show_fdinfo(struct 
seq_file *m, struct file *filp) +{ + struct dumper_inode_info *i_info = filp->f_inode->i_private; + + seq_printf(m, "target:\t%s\n" + "prog_id:\t%u\n", + i_info->tinfo->target, + i_info->prog->aux->id); +} + +static void anon_dumper_show_fdinfo(struct seq_file *m, struct file *filp) +{ + struct dumper_info *dinfo; + + mutex_lock(&anon_dumpers.dumper_mutex); + list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) { + if (dinfo->file == filp) { + seq_printf(m, "target:\t%s\n" + "prog_id:\t%u\n", + dinfo->tinfo->target, + dinfo->prog->aux->id); + break; + } + } + mutex_unlock(&anon_dumpers.dumper_mutex); +} + +#endif + +static void process_target_feature(u32 feature, void *priv_data) +{ + /* use the current net namespace */ + if (feature & BPF_DUMP_SEQ_NET_PRIVATE) + set_seq_net_private((struct seq_net_private *)priv_data, + current->nsproxy->net_ns); +} + +static int dumper_open(struct inode *inode, struct file *file) +{ + struct dumper_inode_info *i_info = inode->i_private; + struct extra_priv_data *extra_data; + u32 old_priv_size, total_priv_size; + void *priv_data; + + old_priv_size = i_info->tinfo->seq_priv_size; + total_priv_size = get_total_priv_dsize(old_priv_size); + priv_data = __seq_open_private(file, i_info->tinfo->seq_ops, + total_priv_size); + if (!priv_data) + return -ENOMEM; + + process_target_feature(i_info->tinfo->target_feature, priv_data); + + extra_data = get_extra_priv_dptr(priv_data, old_priv_size); + extra_data->prog = i_info->prog; + extra_data->seq_num = 0; + + return 0; +} + +static int anon_dumper_release(struct inode *inode, struct file *file) +{ + struct dumper_info *dinfo; + + /* release the bpf program */ + mutex_lock(&anon_dumpers.dumper_mutex); + list_for_each_entry(dinfo, &anon_dumpers.dumpers, list) { + if (dinfo->file == file) { + bpf_prog_put(dinfo->prog); + list_del(&dinfo->list); + break; + } + } + mutex_unlock(&anon_dumpers.dumper_mutex); + + return seq_release_private(inode, file); +} + +static int dumper_release(struct 
inode *inode, struct file *file) +{ + return seq_release_private(inode, file); +} + static int dumper_unlink(struct inode *dir, struct dentry *dentry) { - kfree(d_inode(dentry)->i_private); + struct dumper_inode_info *i_info = d_inode(dentry)->i_private; + + bpf_prog_put(i_info->prog); + kfree(i_info); + return simple_unlink(dir, dentry); } +static const struct file_operations bpf_dumper_ops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = dumper_show_fdinfo, +#endif + .open = dumper_open, + .read = seq_read, + .release = dumper_release, +}; + +static const struct file_operations anon_bpf_dumper_ops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = anon_dumper_show_fdinfo, +#endif + .read = seq_read, + .release = anon_dumper_release, +}; + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .unlink = dumper_unlink, @@ -88,6 +239,179 @@ int bpf_dump_set_target_info(u32 target_fd, struct bpf_prog *prog) return err; } +static int create_anon_dumper(struct bpfdump_target_info *tinfo, + struct bpf_prog *prog) +{ + struct extra_priv_data *extra_data; + u32 old_priv_size, total_priv_size; + struct dumper_info *dinfo; + struct file *file; + int err, anon_fd; + void *priv_data; + struct fd fd; + + anon_fd = anon_inode_getfd("bpf-dumper", &anon_bpf_dumper_ops, + NULL, O_CLOEXEC); + if (anon_fd < 0) + return anon_fd; + + /* setup seq_file for anon dumper */ + fd = fdget(anon_fd); + file = fd.file; + + dinfo = kmalloc(sizeof(*dinfo), GFP_KERNEL); + if (!dinfo) { + err = -ENOMEM; + goto free_fd; + } + + old_priv_size = tinfo->seq_priv_size; + total_priv_size = get_total_priv_dsize(old_priv_size); + + priv_data = __seq_open_private(file, tinfo->seq_ops, + total_priv_size); + if (!priv_data) { + err = -ENOMEM; + goto free_dinfo; + } + + dinfo->file = file; + dinfo->tinfo = tinfo; + dinfo->prog = prog; + + mutex_lock(&anon_dumpers.dumper_mutex); + list_add(&dinfo->list, &anon_dumpers.dumpers); + mutex_unlock(&anon_dumpers.dumper_mutex); + + 
process_target_feature(tinfo->target_feature, priv_data); + + extra_data = get_extra_priv_dptr(priv_data, old_priv_size); + extra_data->prog = prog; + extra_data->seq_num = 0; + + fdput(fd); + return anon_fd; + +free_dinfo: + kfree(dinfo); +free_fd: + fdput(fd); + return err; +} + +static int create_dumper(struct bpfdump_target_info *tinfo, + const char __user *dumper_name, + struct bpf_prog *prog) +{ + struct dumper_inode_info *i_info; + struct dumper_info *dinfo; + struct dentry *dentry; + const char *dname; + int err = 0; + + i_info = kmalloc(sizeof(*i_info), GFP_KERNEL); + if (!i_info) + return -ENOMEM; + + i_info->tinfo = tinfo; + i_info->prog = prog; + + dinfo = kmalloc(sizeof(*dinfo), GFP_KERNEL); + if (!dinfo) { + err = -ENOMEM; + goto free_i_info; + } + + dname = strndup_user(dumper_name, PATH_MAX); + if (!dname) { + err = -ENOMEM; + goto free_dinfo; + } + + dentry = bpfdump_add_file(dname, tinfo->dir_dentry, + &bpf_dumper_ops, i_info); + kfree(dname); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto free_dinfo; + } + + dinfo->dentry = dentry; + dinfo->tinfo = tinfo; + dinfo->prog = prog; + + mutex_lock(&file_dumpers.dumper_mutex); + list_add(&dinfo->list, &file_dumpers.dumpers); + mutex_unlock(&file_dumpers.dumper_mutex); + + return 0; + +free_dinfo: + kfree(dinfo); +free_i_info: + kfree(i_info); + return err; +} + +int bpf_dump_create(u32 prog_fd, const char __user *dumper_name) +{ + struct bpfdump_target_info *tinfo; + const char *target; + struct bpf_prog *prog; + bool existed = false; + int err = 0; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + target = prog->aux->dump_target; + if (!target) { + err = -EINVAL; + goto free_prog; + } + + mutex_lock(&dump_targets.dumper_mutex); + list_for_each_entry(tinfo, &dump_targets.dumpers, list) { + if (strcmp(tinfo->target, target) == 0) { + existed = true; + break; + } + } + mutex_unlock(&dump_targets.dumper_mutex); + + if (!existed) { + err = -EINVAL; + goto free_prog; 
+ } + + err = dumper_name ? create_dumper(tinfo, dumper_name, prog) + : create_anon_dumper(tinfo, prog); + if (err < 0) + goto free_prog; + + return err; + +free_prog: + bpf_prog_put(prog); + return err; +} + +struct bpf_prog *bpf_dump_get_prog(struct seq_file *seq, u32 priv_data_size, + u64 *seq_num) +{ + struct extra_priv_data *extra_data; + + if (seq->file->f_op != &bpf_dumper_ops && + seq->file->f_op != &anon_bpf_dumper_ops) + return NULL; + + extra_data = get_extra_priv_dptr(seq->private, priv_data_size); + *seq_num = extra_data->seq_num++; + + return extra_data->prog; +} + int bpf_dump_reg_target(const char *target, const char *target_proto, const struct seq_operations *seq_ops, @@ -211,6 +535,14 @@ bpfdump_create_dentry(const char *name, umode_t mode, struct dentry *parent, return dentry; } +static struct dentry * +bpfdump_add_file(const char *name, struct dentry *parent, + const struct file_operations *f_ops, void *data) +{ + return bpfdump_create_dentry(name, S_IFREG | 0444, parent, + data, NULL, f_ops); +} + static struct dentry * bpfdump_add_dir(const char *name, struct dentry *parent, const struct inode_operations *i_ops, void *data) @@ -290,6 +622,10 @@ static int __bpfdump_init(void) INIT_LIST_HEAD(&dump_targets.dumpers); mutex_init(&dump_targets.dumper_mutex); + INIT_LIST_HEAD(&anon_dumpers.dumpers); + mutex_init(&anon_dumpers.dumper_mutex); + INIT_LIST_HEAD(&file_dumpers.dumpers); + mutex_init(&file_dumpers.dumper_mutex); return 0; remove_mount: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 41005dee8957..b5e4f18cc633 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2173,9 +2173,13 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) static int bpf_obj_pin(const union bpf_attr *attr) { - if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) + if (CHECK_ATTR(BPF_OBJ) || attr->file_flags & ~BPF_F_DUMP) return -EINVAL; + if (attr->file_flags == BPF_F_DUMP) + return bpf_dump_create(attr->bpf_fd, + 
u64_to_user_ptr(attr->dumper_name)); + return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); } @@ -2605,6 +2609,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_DUMP: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -2663,6 +2669,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_SOCK_OPS: ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = bpf_dump_create(attr->attach_bpf_fd, (void __user *)NULL); + break; default: ret = -EINVAL; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0f1cbed446c1..b51d56fc77f9 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -354,6 +354,7 @@ enum { /* Flags for accessing BPF object from syscall side. */ BPF_F_RDONLY = (1U << 3), BPF_F_WRONLY = (1U << 4), + BPF_F_DUMP = (1U << 5), /* Flag for stack_map, store build_id+offset instead of pointer */ BPF_F_STACK_BUILD_ID = (1U << 5), @@ -481,7 +482,10 @@ union bpf_attr { }; struct { /* anonymous struct used by BPF_OBJ_* commands */ - __aligned_u64 pathname; + union { + __aligned_u64 pathname; + __aligned_u64 dumper_name; + }; __u32 bpf_fd; __u32 file_flags; }; -- 2.24.1