It is similar to the pid namespace. When we create a new bpf object in a
child BPF namespace, an ID is allocated for it in the current BPF namespace
and in each of its ancestor BPF namespaces. The hierarchy is as follows,

          init_bpf_ns              : level = 0
          /         \
      child_a      child_b         : level = 1
                   /     \
            child_b_a   child_b_b  : level = 2

When we create a bpf object in child_b_b, IDs are allocated for this object
in child_b_b, child_b and init_bpf_ns.

IDs are allocated for bpf_map, bpf_prog and bpf_link in the bpf namespace.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
 fs/proc/namespaces.c           |   4 +
 include/linux/bpf_namespace.h  |  51 ++++++++++
 include/linux/nsproxy.h        |   4 +
 include/linux/proc_ns.h        |   1 +
 include/linux/user_namespace.h |   1 +
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/bpf_namespace.c     | 227 +++++++++++++++++++++++++++++++++++++++++
 kernel/nsproxy.c               |  18 +++-
 kernel/ucount.c                |   1 +
 9 files changed, 306 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/bpf_namespace.h
 create mode 100644 kernel/bpf/bpf_namespace.c

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8e159fc..1a36757 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -9,6 +9,7 @@
 #include <linux/ipc_namespace.h>
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
+#include <linux/bpf_namespace.h>
 #include "internal.h"
 
 
@@ -37,6 +38,9 @@
 	&timens_operations,
 	&timens_for_children_operations,
 #endif
+#ifdef CONFIG_BPF
+	&bpfns_operations,
+#endif
 };
 
 static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/bpf_namespace.h b/include/linux/bpf_namespace.h
new file mode 100644
index 0000000..06aa51f
--- /dev/null
+++ b/include/linux/bpf_namespace.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPF_ID_NS_H
+#define _LINUX_BPF_ID_NS_H
+#include <linux/types.h>
+#include <linux/idr.h>
+#include <linux/ns_common.h>
+#include <linux/user_namespace.h>
+
+/* An object ID as seen from one BPF namespace. */
+struct ubpf_obj_id {
+	int nr;
+	struct bpf_namespace *ns;
+};
+
+/*
+ * IDs of one BPF object in every namespace it is visible in. The slab
+ * cache of a level-N namespace reserves N extra numbers[] entries.
+ */
+struct bpf_obj_id {
+	refcount_t count;
+	unsigned int level;
+	struct rcu_head rcu;
+	struct ubpf_obj_id numbers[1];
+};
+
+enum {
+	MAP_OBJ_ID = 0,
+	PROG_OBJ_ID,
+	LINK_OBJ_ID,
+	OBJ_ID_NUM,
+};
+
+struct bpf_namespace {
+	struct idr idr[OBJ_ID_NUM];
+	struct rcu_head rcu;
+	int level;
+	struct ns_common ns;
+	struct user_namespace *user_ns;
+	struct kmem_cache *obj_id_cachep;
+	struct bpf_namespace *parent;
+	struct ucounts *ucounts;
+};
+
+extern struct bpf_namespace init_bpf_ns;
+extern struct proc_ns_operations bpfns_operations;
+
+struct bpf_namespace *copy_bpfns(unsigned long flags,
+				struct user_namespace *user_ns,
+				struct bpf_namespace *old_ns);
+void put_bpfns(struct bpf_namespace *ns);
+#endif /* _LINUX_BPF_ID_NS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index fee881c..d24ab6b 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -10,6 +10,9 @@
 struct ipc_namespace;
 struct pid_namespace;
 struct cgroup_namespace;
+#ifdef CONFIG_BPF
+struct bpf_namespace;
+#endif
 struct fs_struct;
 
 /*
@@ -38,6 +41,7 @@ struct nsproxy {
 	struct time_namespace *time_ns;
 	struct time_namespace *time_ns_for_children;
 	struct cgroup_namespace *cgroup_ns;
+	struct bpf_namespace *bpf_ns;
 };
 extern struct nsproxy init_nsproxy;
 
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 555c257..c10ce2c 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -46,6 +46,7 @@ enum {
 	PROC_PID_INIT_INO	= 0xEFFFFFFCU,
 	PROC_CGROUP_INIT_INO	= 0xEFFFFFFBU,
 	PROC_TIME_INIT_INO	= 0xEFFFFFFAU,
+	PROC_BPF_INIT_INO	= 0xEFFFFFF9U,
 };
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09be..93eb618 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -54,6 +54,7 @@ enum ucount_type {
 	UCOUNT_FANOTIFY_GROUPS,
 	UCOUNT_FANOTIFY_MARKS,
 #endif
+	UCOUNT_BPF_NAMESPACES,
 	UCOUNT_COUNTS,
 };
 
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0224261..828aef0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -44,3 +44,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
 obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
 $(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
 	$(call if_changed_rule,cc_o_c)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_namespace.o
diff --git a/kernel/bpf/bpf_namespace.c b/kernel/bpf/bpf_namespace.c
new file mode 100644
index 0000000..88a86cd
--- /dev/null
+++ b/kernel/bpf/bpf_namespace.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/ns_common.h>
+#include <linux/syscalls.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/idr.h>
+#include <linux/user_namespace.h>
+#include <linux/bpf_namespace.h>
+
+#define MAX_BPF_NS_LEVEL 32
+static struct kmem_cache *bpfns_cachep;
+static struct kmem_cache *obj_id_cache[MAX_BPF_NS_LEVEL];
+static struct ns_common *bpfns_get(struct task_struct *task);
+static void bpfns_put(struct ns_common *ns);
+static struct kmem_cache *create_bpf_cachep(unsigned int level);
+static DEFINE_MUTEX(obj_id_caches_mutex);
+
+static int bpfns_install(struct nsset *nsset, struct ns_common *ns)
+{
+	pr_info("setns not supported for bpf namespace\n");
+	return -EOPNOTSUPP;
+}
+
+struct proc_ns_operations bpfns_operations = {
+	.name = "bpf",
+	.type = CLONE_NEWBPF,
+	.get = bpfns_get,
+	.put = bpfns_put,
+	.install = bpfns_install,
+};
+
+struct bpf_namespace init_bpf_ns = {
+	.level = 0,
+	.user_ns = &init_user_ns,
+	.ns.ops = &bpfns_operations,
+	.ns.inum = PROC_BPF_INIT_INO,
+};
+
+static struct bpf_namespace *get_bpfns(struct bpf_namespace *ns)
+{
+	if (ns != &init_bpf_ns)
+		refcount_inc(&ns->ns.count);
+	return ns;
+}
+
+/*
+ * Grab a reference on the BPF namespace of @task. task->nsproxy is
+ * protected by task_lock(), not RCU, and may already be NULL.
+ */
+static struct ns_common *bpfns_get(struct task_struct *task)
+{
+	struct ns_common *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy) {
+		ns = &nsproxy->bpf_ns->ns;
+		get_bpfns(container_of(ns, struct bpf_namespace, ns));
+	}
+	task_unlock(task);
+	return ns;
+}
+
+static struct ucounts *inc_bpf_namespaces(struct user_namespace *ns)
+{
+	return inc_ucount(ns, current_euid(), UCOUNT_BPF_NAMESPACES);
+}
+
+static void dec_bpf_namespaces(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_BPF_NAMESPACES);
+}
+
+static void delayed_free_bpfns(struct rcu_head *p)
+{
+	struct bpf_namespace *ns = container_of(p, struct bpf_namespace, rcu);
+
+	dec_bpf_namespaces(ns->ucounts);
+	put_user_ns(ns->user_ns);
+	kmem_cache_free(bpfns_cachep, ns);
+}
+
+static void destroy_bpf_namespace(struct bpf_namespace *ns)
+{
+	int i;
+
+	ns_free_inum(&ns->ns);
+	for (i = 0; i < OBJ_ID_NUM; i++)
+		idr_destroy(&ns->idr[i]);
+	call_rcu(&ns->rcu, delayed_free_bpfns);
+}
+
+/*
+ * Drop a reference on @ns. Each namespace pins its parent, so freeing
+ * one walks up the hierarchy. init_bpf_ns is never refcounted.
+ */
+void put_bpfns(struct bpf_namespace *ns)
+{
+	struct bpf_namespace *parent;
+
+	while (ns != &init_bpf_ns) {
+		parent = ns->parent;
+		if (!refcount_dec_and_test(&ns->ns.count))
+			break;
+		destroy_bpf_namespace(ns);
+		ns = parent;
+	}
+}
+
+static void bpfns_put(struct ns_common *ns)
+{
+	struct bpf_namespace *bpf_ns;
+
+	bpf_ns = container_of(ns, struct bpf_namespace, ns);
+	put_bpfns(bpf_ns);
+}
+
+static struct bpf_namespace *
+create_bpf_namespace(struct user_namespace *user_ns,
+			struct bpf_namespace *parent_bpfns)
+{
+	struct bpf_namespace *ns;
+	unsigned int level = parent_bpfns->level + 1;
+	struct ucounts *ucounts;
+	int err;
+	int i;
+
+	err = -EINVAL;
+	if (!in_userns(parent_bpfns->user_ns, user_ns))
+		goto out;
+
+	err = -ENOSPC;
+	if (level > MAX_BPF_NS_LEVEL)
+		goto out;
+	ucounts = inc_bpf_namespaces(user_ns);
+	if (!ucounts)
+		goto out;
+
+	err = -ENOMEM;
+	ns = kmem_cache_zalloc(bpfns_cachep, GFP_KERNEL);
+	if (!ns)
+		goto out_dec;
+
+	for (i = 0; i < OBJ_ID_NUM; i++)
+		idr_init(&ns->idr[i]);
+
+	ns->obj_id_cachep = create_bpf_cachep(level);
+	if (!ns->obj_id_cachep)
+		goto out_free_idr;
+
+	err = ns_alloc_inum(&ns->ns);
+	if (err)
+		goto out_free_idr;
+	ns->ns.ops = &bpfns_operations;
+
+	refcount_set(&ns->ns.count, 1);
+	ns->level = level;
+	ns->parent = get_bpfns(parent_bpfns);
+	ns->user_ns = get_user_ns(user_ns);
+	ns->ucounts = ucounts;
+	return ns;
+
+out_free_idr:
+	for (i = 0; i < OBJ_ID_NUM; i++)
+		idr_destroy(&ns->idr[i]);
+	kmem_cache_free(bpfns_cachep, ns);
+out_dec:
+	dec_bpf_namespaces(ucounts);
+out:
+	return ERR_PTR(err);
+}
+
+struct bpf_namespace *copy_bpfns(unsigned long flags,
+				struct user_namespace *user_ns,
+				struct bpf_namespace *old_ns)
+{
+	if (!(flags & CLONE_NEWBPF))
+		return get_bpfns(old_ns);
+	return create_bpf_namespace(user_ns, old_ns);
+}
+
+static struct kmem_cache *create_bpf_cachep(unsigned int level)
+{
+	/* Level 0 is init_bpf_ns.obj_id_cachep */
+	struct kmem_cache **pkc = &obj_id_cache[level - 1];
+	struct kmem_cache *kc;
+	char name[4 + 10 + 1];
+	unsigned int len;
+
+	kc = READ_ONCE(*pkc);
+	if (kc)
+		return kc;
+
+	snprintf(name, sizeof(name), "bpf_%u", level + 1);
+	len = sizeof(struct bpf_obj_id) + level * sizeof(struct ubpf_obj_id);
+	mutex_lock(&obj_id_caches_mutex);
+	/* Name collision forces to do allocation under mutex. */
+	if (!*pkc)
+		*pkc = kmem_cache_create(name, len, 0,
+				SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
+	mutex_unlock(&obj_id_caches_mutex);
+	/* current can fail, but someone else can succeed. */
+	return READ_ONCE(*pkc);
+}
+
+static void __init bpfns_idr_init(void)
+{
+	int i;
+
+	init_bpf_ns.obj_id_cachep =
+		KMEM_CACHE(bpf_obj_id, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+	for (i = 0; i < OBJ_ID_NUM; i++)
+		idr_init(&init_bpf_ns.idr[i]);
+}
+
+static __init int bpf_namespaces_init(void)
+{
+	bpfns_cachep = KMEM_CACHE(bpf_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+	bpfns_idr_init();
+	return 0;
+}
+
+late_initcall(bpf_namespaces_init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a487ff2..6a6fa70 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,7 @@
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
 #include <linux/time_namespace.h>
+#include <linux/bpf_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/proc_fs.h>
 #include <linux/proc_ns.h>
@@ -47,6 +48,9 @@ struct nsproxy init_nsproxy = {
 	.time_ns		= &init_time_ns,
 	.time_ns_for_children	= &init_time_ns,
 #endif
+#ifdef CONFIG_BPF
+	.bpf_ns			= &init_bpf_ns,
+#endif
 };
 
 static inline struct nsproxy *create_nsproxy(void)
@@ -121,8 +125,16 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 	}
 	new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);
+	new_nsp->bpf_ns = copy_bpfns(flags, user_ns, tsk->nsproxy->bpf_ns);
+	if (IS_ERR(new_nsp->bpf_ns)) {
+		err = PTR_ERR(new_nsp->bpf_ns);
+		goto out_bpf;
+	}
 
 	return new_nsp;
 
+out_bpf:
+	put_time_ns(new_nsp->time_ns);
+	put_time_ns(new_nsp->time_ns_for_children);
 out_time:
 	put_net(new_nsp->net_ns);
 out_net:
@@ -156,7 +168,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
-			      CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+			      CLONE_NEWCGROUP | CLONE_NEWTIME | CLONE_NEWBPF)))) {
 		if ((flags & CLONE_VM) ||
 		    likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
 			get_nsproxy(old_ns);
@@ -203,6 +215,8 @@ void free_nsproxy(struct nsproxy *ns)
 	put_time_ns(ns->time_ns_for_children);
 	put_cgroup_ns(ns->cgroup_ns);
 	put_net(ns->net_ns);
+	if (ns->bpf_ns)
+		put_bpfns(ns->bpf_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
 
@@ -218,7 +232,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 
 	if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			       CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
-			       CLONE_NEWTIME)))
+			       CLONE_NEWTIME | CLONE_NEWBPF)))
 		return 0;
 
 	user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/ucount.c b/kernel/ucount.c
index ee8e57f..97e0ae3 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -87,6 +87,7 @@ static int set_permissions(struct ctl_table_header *head,
 	UCOUNT_ENTRY("max_fanotify_groups"),
 	UCOUNT_ENTRY("max_fanotify_marks"),
 #endif
+	UCOUNT_ENTRY("max_bpf_namespaces"),
 	{ }
 };
 #endif /* CONFIG_SYSCTL */
-- 
1.8.3.1