On Tue, Oct 10, 2023 at 7:35 PM Hou Tao <houtao@xxxxxxxxxxxxxxx> wrote: > > Hi, > > On 9/28/2023 6:57 AM, Andrii Nakryiko wrote: > > Add new kind of BPF kernel object, BPF token. BPF token is meant to > > allow delegating privileged BPF functionality, like loading a BPF > > program or creating a BPF map, from privileged process to a *trusted* > > unprivileged process, all while have a good amount of control over which > > privileged operations could be performed using provided BPF token. > > > > This is achieved through mounting BPF FS instance with extra delegation > > mount options, which determine what operations are delegatable, and also > > constraining it to the owning user namespace (as mentioned in the > > previous patch). > SNIP > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index 70bfa997e896..78692911f4a0 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -847,6 +847,37 @@ union bpf_iter_link_info { > > * Returns zero on success. On error, -1 is returned and *errno* > > * is set appropriately. > > * > > + * BPF_TOKEN_CREATE > > + * Description > > + * Create BPF token with embedded information about what > > + * BPF-related functionality it allows: > > + * - a set of allowed bpf() syscall commands; > > + * - a set of allowed BPF map types to be created with > > + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; > > + * - a set of allowed BPF program types and BPF program attach > > + * types to be loaded with BPF_PROG_LOAD command, if > > + * BPF_PROG_LOAD itself is allowed. > > + * > > + * BPF token is created (derived) from an instance of BPF FS, > > + * assuming it has necessary delegation mount options specified. > > + * BPF FS mount is specified with openat()-style path FD + string. > > + * This BPF token can be passed as an extra parameter to various > > + * bpf() syscall commands to grant BPF subsystem functionality to > > + * unprivileged processes. > > + * > > + * When created, BPF token is "associated" with the owning > > + * user namespace of BPF FS instance (super block) that it was > > + * derived from, and subsequent BPF operations performed with > > + * BPF token would be performing capabilities checks (i.e., > > + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within > > + * that user namespace. Without BPF token, such capabilities > > + * have to be granted in init user namespace, making bpf() > > + * syscall incompatible with user namespace, for the most part. > > + * > > + * Return > > + * A new file descriptor (a nonnegative integer), or -1 if an > > + * error occurred (in which case, *errno* is set appropriately). > > + * > > * NOTES > > * eBPF objects (maps and programs) can be shared between processes. > > * > > @@ -901,6 +932,8 @@ enum bpf_cmd { > > BPF_ITER_CREATE, > > BPF_LINK_DETACH, > > BPF_PROG_BIND_MAP, > > + BPF_TOKEN_CREATE, > > + __MAX_BPF_CMD, > > }; > > > > enum bpf_map_type { > > @@ -1694,6 +1727,12 @@ union bpf_attr { > > __u32 flags; /* extra flags */ > > } prog_bind_map; > > > > + struct { /* struct used by BPF_TOKEN_CREATE command */ > > + __u32 flags; > > + __u32 bpffs_path_fd; > > + __u64 bpffs_pathname; > > Because bppfs_pathname is a string pointer, so __aligned_u64 is preferred. ok, I'll use __aligned_u64, even though it can never be unaligned in this case > > + } token_create; > > + > > } __attribute__((aligned(8))); > > > > /* The description below is an attempt at providing documentation to eBPF > > diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile > > index f526b7573e97..4ce95acfcaa7 100644 > > --- a/kernel/bpf/Makefile > > +++ b/kernel/bpf/Makefile > > @@ -6,7 +6,7 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse > > endif > > CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) > > > > -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o > > +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o token.o > > obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o > > obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o > > obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o > > diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c > > index 24b3faf901f4..de1fdf396521 100644 > > --- a/kernel/bpf/inode.c > > +++ b/kernel/bpf/inode.c > > @@ -99,9 +99,9 @@ static const struct inode_operations bpf_prog_iops = { }; > > static const struct inode_operations bpf_map_iops = { }; > > static const struct inode_operations bpf_link_iops = { }; > > > > -static struct inode *bpf_get_inode(struct super_block *sb, > > - const struct inode *dir, > > - umode_t mode) > > +struct inode *bpf_get_inode(struct super_block *sb, > > + const struct inode *dir, > > + umode_t mode) > > { > > struct inode *inode; > > > > @@ -603,11 +603,13 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) > > { > > struct bpf_mount_opts *opts = root->d_sb->s_fs_info; > > umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; > > + u64 mask; > > > > if (mode != S_IRWXUGO) > > seq_printf(m, ",mode=%o", mode); > > > > - if (opts->delegate_cmds == ~0ULL) > > + mask = (1ULL << __MAX_BPF_CMD) - 1; > > + if ((opts->delegate_cmds & mask) == mask) > > seq_printf(m, ",delegate_cmds=any"); > > Should we add a BUILD_BUG_ON assertion to guarantee __MAX_BPF_CMD is > less than sizeof(u64) * 8 ? yep, good idea, will add for CMD and all others > > else if (opts->delegate_cmds) > > seq_printf(m, ",delegate_cmds=0x%llx", opts->delegate_cmds); > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > index 7445dad01fb3..b47791a80930 100644 > > --- a/kernel/bpf/syscall.c > > +++ b/kernel/bpf/syscall.c > > @@ -5304,6 +5304,20 @@ static int bpf_prog_bind_map(union bpf_attr *attr) > > return ret; > > } > > > > +#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_pathname > > + > > +static int token_create(union bpf_attr *attr) > > +{ > > + if (CHECK_ATTR(BPF_TOKEN_CREATE)) > > + return -EINVAL; > > + > > + /* no flags are supported yet */ > > + if (attr->token_create.flags) > > + return -EINVAL; > > + > > + return bpf_token_create(attr); > > +} > > + > > static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) > > { > > union bpf_attr attr; > > @@ -5437,6 +5451,9 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size) > > case BPF_PROG_BIND_MAP: > > err = bpf_prog_bind_map(&attr); > > break; > > + case BPF_TOKEN_CREATE: > > + err = token_create(&attr); > > + break; > > default: > > err = -EINVAL; > > break; > > diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c > > new file mode 100644 > > index 000000000000..779aad5007a3 > > --- /dev/null > > +++ b/kernel/bpf/token.c > SNIP > > +#define BPF_TOKEN_INODE_NAME "bpf-token" > > + > > +static const struct inode_operations bpf_token_iops = { }; > > + > > +static const struct file_operations bpf_token_fops = { > > + .release = bpf_token_release, > > + .show_fdinfo = bpf_token_show_fdinfo, > > +}; > > + > > +int bpf_token_create(union bpf_attr *attr) > > +{ > > + struct bpf_mount_opts *mnt_opts; > > + struct bpf_token *token = NULL; > > + struct inode *inode; > > + struct file *file; > > + struct path path; > > + umode_t mode; > > + int err, fd; > > + > > + err = user_path_at(attr->token_create.bpffs_path_fd, > > + u64_to_user_ptr(attr->token_create.bpffs_pathname), > > + LOOKUP_FOLLOW | LOOKUP_EMPTY, &path); > > + if (err) > > + return err; > > Need to check the mount is a bpffs mount instead of other filesystem mount. yep, missed that. Fixed, will check `path.mnt->mnt_sb->s_op != &bpf_super_ops`. > > + > > + if (path.mnt->mnt_root != path.dentry) { > > + err = -EINVAL; > > + goto out_path; > > + } > > + err = path_permission(&path, MAY_ACCESS); > > + if (err) > > + goto out_path; > > + > > + mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); > > + inode = bpf_get_inode(path.mnt->mnt_sb, NULL, mode); > > + if (IS_ERR(inode)) { > > + err = PTR_ERR(inode); > > + goto out_path; > > + } > > + > > + inode->i_op = &bpf_token_iops; > > + inode->i_fop = &bpf_token_fops; > > + clear_nlink(inode); /* make sure it is unlinked */ > > + > > + file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); > > + if (IS_ERR(file)) { > > + iput(inode); > > + err = PTR_ERR(file); > > + goto out_file; > > goto out_path ? eagle eye, fixed, thanks! > > + } > > + > > + token = bpf_token_alloc(); > > + if (!token) { > > + err = -ENOMEM; > > + goto out_file; > > + } > > + > > + /* remember bpffs owning userns for future ns_capable() checks */ > > + token->userns = get_user_ns(path.dentry->d_sb->s_user_ns); > > + > > + mnt_opts = path.dentry->d_sb->s_fs_info; > > + token->allowed_cmds = mnt_opts->delegate_cmds; > > + > > + fd = get_unused_fd_flags(O_CLOEXEC); > > + if (fd < 0) { > > + err = fd; > > + goto out_token; > > + } > > + > > + file->private_data = token; > > + fd_install(fd, file); > > + > > + path_put(&path); > > + return fd; > > + > > +out_token: > > + bpf_token_free(token); > > +out_file: > > + fput(file); > > +out_path: > > + path_put(&path); > > + return err; > > +} > > + > . >