On Wed, Nov 04, 2020 at 10:03:05AM +0000, Sargun Dhillon wrote: > On Mon, Nov 02, 2020 at 05:50:33PM +0100, Alexey Gladkov wrote: > > Add a new prctl to change the user namespace in which the process > > counter is located. A pointer to the user namespace is in cred struct > > to be inherited by all child processes. > > > > Signed-off-by: Alexey Gladkov <gladkov.alexey@xxxxxxxxx> > > --- > > fs/exec.c | 2 +- > > fs/io-wq.c | 13 ++++++++----- > > fs/io-wq.h | 1 + > > fs/io_uring.c | 1 + > > include/linux/cred.h | 8 ++++++++ > > include/uapi/linux/prctl.h | 5 +++++ > > kernel/cred.c | 35 +++++++++++++++++++++++++++++------ > > kernel/exit.c | 2 +- > > kernel/fork.c | 4 ++-- > > kernel/sys.c | 22 +++++++++++++++++++++- > > kernel/user_namespace.c | 3 +++ > > 11 files changed, 80 insertions(+), 16 deletions(-) > > > > diff --git a/fs/exec.c b/fs/exec.c > > index c45dfc716394..574b1381276c 100644 > > --- a/fs/exec.c > > +++ b/fs/exec.c > > @@ -1837,7 +1837,7 @@ static int __do_execve_file(int fd, struct filename *filename, > > goto out_ret; > > } > > > > - processes = get_rlimit_counter(&init_user_ns, current_euid(), UCOUNT_RLIMIT_NPROC); > > + processes = get_rlimit_counter(current_rlimit_ns(), current_euid(), UCOUNT_RLIMIT_NPROC); > > > > /* > > * We move the actual failure in case of RLIMIT_NPROC excess from > > diff --git a/fs/io-wq.c b/fs/io-wq.c > > index c3b0843abc9b..19e43ec115cb 100644 > > --- a/fs/io-wq.c > > +++ b/fs/io-wq.c > > @@ -116,6 +116,7 @@ struct io_wq { > > > > struct task_struct *manager; > > struct user_struct *user; > It seems like user would be unused here, and you could use creds->user? Yep. I think so. > > + const struct cred *creds; > > refcount_t refs; > > struct completion done; > > > > @@ -217,7 +218,7 @@ static void io_worker_exit(struct io_worker *worker) > > if (worker->flags & IO_WORKER_F_RUNNING) > > atomic_dec(&acct->nr_running); > > if (!(worker->flags & IO_WORKER_F_BOUND)) > > - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC); > > + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC); > > worker->flags = 0; > > preempt_enable(); > > > > @@ -350,9 +351,9 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, > > worker->flags |= IO_WORKER_F_BOUND; > > wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; > > wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; > > - dec_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC); > > + dec_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC); > > } else { > > - if (!inc_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC)) > > + if (!inc_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC)) > > return; > > worker->flags &= ~IO_WORKER_F_BOUND; > > wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; > > @@ -662,7 +663,7 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) > > } > > > > if (index == IO_WQ_ACCT_UNBOUND && > > - !inc_rlimit_counter(&init_user_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) { > > + !inc_rlimit_counter(wq->creds->rlimit_ns, wq->user->uid, UCOUNT_RLIMIT_NPROC)) { > > kfree(worker); > > return false; > > } > > @@ -772,7 +773,8 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, > > if (free_worker) > > return true; > > > > - processes = get_rlimit_counter(&init_user_ns, wqe->wq->user->uid, UCOUNT_RLIMIT_NPROC); > > + processes = get_rlimit_counter(wqe->wq->creds->rlimit_ns, wqe->wq->user->uid, > > + UCOUNT_RLIMIT_NPROC); > > > > if (processes >= acct->max_workers && > > !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) > > @@ -1049,6 +1051,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) > > > > /* caller must already hold a reference to this */ > > wq->user = data->user; > > + wq->creds = data->creds; > > > > for_each_node(node) { > > struct io_wqe *wqe; > > diff --git a/fs/io-wq.h b/fs/io-wq.h > > index 071f1a997800..6acc3a04c38f 100644 > > --- a/fs/io-wq.h > > +++ b/fs/io-wq.h > > @@ -105,6 +105,7 @@ typedef void (io_wq_work_fn)(struct io_wq_work **); > > > > struct io_wq_data { > > struct user_struct *user; > > + const struct cred *creds; > > > > io_wq_work_fn *do_work; > > free_work_fn *free_work; > > diff --git a/fs/io_uring.c b/fs/io_uring.c > > index 493e5047e67c..e419923968b3 100644 > > --- a/fs/io_uring.c > > +++ b/fs/io_uring.c > > @@ -6933,6 +6933,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, > > int ret = 0; > > > > data.user = ctx->user; > > + data.creds = ctx->creds; > > data.free_work = io_free_work; > > data.do_work = io_wq_submit_work; > > > > diff --git a/include/linux/cred.h b/include/linux/cred.h > > index 18639c069263..43aee68d117f 100644 > > --- a/include/linux/cred.h > > +++ b/include/linux/cred.h > > @@ -144,6 +144,7 @@ struct cred { > > #endif > > struct user_struct *user; /* real user ID subscription */ > > struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ > > + struct user_namespace *rlimit_ns; /* user_ns in which rlimits is tracked */ > > struct group_info *group_info; /* supplementary groups for euid/fsgid */ > > /* RCU deletion */ > > union { > > @@ -170,6 +171,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *); > > extern int set_create_files_as(struct cred *, struct inode *); > > extern int cred_fscmp(const struct cred *, const struct cred *); > > extern void __init cred_init(void); > > +extern int set_rlimit_ns(struct user_namespace *ns); > > > > /* > > * check for validity of credentials > > @@ -370,6 +372,7 @@ static inline void put_cred(const struct cred *_cred) > > > > #define task_uid(task) (task_cred_xxx((task), uid)) > > #define task_euid(task) (task_cred_xxx((task), euid)) > > +#define task_rlimit_ns(task) (task_cred_xxx((task), rlimit_ns)) > > > > #define current_cred_xxx(xxx) \ > > ({ \ > > @@ -390,11 +393,16 @@ static inline void put_cred(const struct cred *_cred) > > extern struct user_namespace init_user_ns; > > #ifdef CONFIG_USER_NS > > #define current_user_ns() (current_cred_xxx(user_ns)) > > +#define current_rlimit_ns() (current_cred_xxx(rlimit_ns)) > > #else > > static inline struct user_namespace *current_user_ns(void) > > { > > return &init_user_ns; > > } > > +static inline struct user_namespace *current_rlimit_ns(void) > > +{ > > + return &init_user_ns; > > +} > > #endif > > > > > > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > > index 07b4f8131e36..4f853f903415 100644 > > --- a/include/uapi/linux/prctl.h > > +++ b/include/uapi/linux/prctl.h > > @@ -238,4 +238,9 @@ struct prctl_mm_map { > > #define PR_SET_IO_FLUSHER 57 > > #define PR_GET_IO_FLUSHER 58 > > > > +#define PR_SET_RLIMIT_USER_NAMESPACE 59 > > +#define PR_GET_RLIMIT_USER_NAMESPACE 60 > > +# define PR_RLIMIT_BIND_GLOBAL_USERNS (1UL << 0) > > +# define PR_RLIMIT_BIND_CURRENT_USERNS (1UL << 1) > > + > > #endif /* _LINUX_PRCTL_H */ > > diff --git a/kernel/cred.c b/kernel/cred.c > > index 748704db1f6b..7b90e1ef9c9a 100644 > > --- a/kernel/cred.c > > +++ b/kernel/cred.c > > @@ -59,6 +59,7 @@ struct cred init_cred = { > > .cap_bset = CAP_FULL_SET, > > .user = INIT_USER, > > .user_ns = &init_user_ns, > > + .rlimit_ns = &init_user_ns, > > .group_info = &init_groups, > > }; > > > > @@ -120,6 +121,7 @@ static void put_cred_rcu(struct rcu_head *rcu) > > put_group_info(cred->group_info); > > free_uid(cred->user); > > put_user_ns(cred->user_ns); > > + put_user_ns(cred->rlimit_ns); > > kmem_cache_free(cred_jar, cred); > > } > > > > @@ -270,6 +272,7 @@ struct cred *prepare_creds(void) > > get_group_info(new->group_info); > > get_uid(new->user); > > get_user_ns(new->user_ns); > > + get_user_ns(new->rlimit_ns); > > > > #ifdef CONFIG_KEYS > > key_get(new->session_keyring); > > @@ -345,7 +348,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) > > #endif > > clone_flags & CLONE_THREAD > > ) { > > - if (!inc_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC)) > > + if (!inc_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC)) > > return -EACCES; > > p->real_cred = get_cred(p->cred); > > get_cred(p->cred); > > @@ -385,7 +388,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) > > } > > #endif > > > > - if (!inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC)) > > + if (!inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC)) > > return -EACCES; > > p->cred = p->real_cred = get_cred(new); > > alter_cred_subscribers(new, 2); > > @@ -487,13 +490,13 @@ int commit_creds(struct cred *new) > > * perhaps this limit is exceeded in the parent user namespace. > > */ > > alter_cred_subscribers(new, 2); > > - if (new->user != old->user && > > - !inc_rlimit_counter(&init_user_ns, new->euid, UCOUNT_RLIMIT_NPROC)) > > + if ((new->user != old->user || new->rlimit_ns != old->rlimit_ns) && > > + !inc_rlimit_counter(new->rlimit_ns, new->euid, UCOUNT_RLIMIT_NPROC)) > > task->flags |= PF_NPROC_UNS_EXCEEDED; > > rcu_assign_pointer(task->real_cred, new); > > rcu_assign_pointer(task->cred, new); > > - if (new->user != old->user) > > - dec_rlimit_counter(&init_user_ns, old->euid, UCOUNT_RLIMIT_NPROC); > > + if (new->user != old->user || new->rlimit_ns != old->rlimit_ns) > > + dec_rlimit_counter(old->rlimit_ns, old->euid, UCOUNT_RLIMIT_NPROC); > > alter_cred_subscribers(old, -2); > > > > /* send notifications */ > > @@ -789,6 +792,26 @@ int set_create_files_as(struct cred *new, struct inode *inode) > > } > > EXPORT_SYMBOL(set_create_files_as); > > > > +/* > > + * Change the rlimit user namespace of the current task, replacing the existing > > + * one. If the given namespace is NULL, then initial user namespace will be > > + * used. > > + * > > + * Return: 0 on success; -errno on failure. > > + */ > > +int set_rlimit_ns(struct user_namespace *ns) > > +{ > > + struct cred *new; > > + > > + new = prepare_creds(); > > + if (!new) > > + return -ENOMEM; > > + > > + new->rlimit_ns = ns ? ns : &init_user_ns; > > + > > + return commit_creds(new); > > +} > > + > > #ifdef CONFIG_DEBUG_CREDENTIALS > > > > bool creds_are_invalid(const struct cred *cred) > > diff --git a/kernel/exit.c b/kernel/exit.c > > index 5a0d7dd1ad64..998436d32373 100644 > > --- a/kernel/exit.c > > +++ b/kernel/exit.c > > @@ -187,7 +187,7 @@ void release_task(struct task_struct *p) > > /* don't need to get the RCU readlock here - the process is dead and > > * can't be modifying its own credentials. But shut RCU-lockdep up */ > > rcu_read_lock(); > > - dec_rlimit_counter(&init_user_ns, task_euid(p), UCOUNT_RLIMIT_NPROC); > > + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC); > > rcu_read_unlock(); > > > > cgroup_release(p); > > diff --git a/kernel/fork.c b/kernel/fork.c > > index d2b28634dc8f..43f3c54fe4c6 100644 > > --- a/kernel/fork.c > > +++ b/kernel/fork.c > > @@ -1963,7 +1963,7 @@ static __latent_entropy struct task_struct *copy_process( > > current->flags &= ~PF_NPROC_UNS_EXCEEDED; > > goto bad_fork_free; > > } > > - processes = get_rlimit_counter(&init_user_ns, p->real_cred->euid, > > + processes = get_rlimit_counter(task_rlimit_ns(p), task_euid(p), > > UCOUNT_RLIMIT_NPROC); > > if (processes >= task_rlimit(p, RLIMIT_NPROC)) { > > if (p->real_cred->user != INIT_USER && > > @@ -2366,7 +2366,7 @@ static __latent_entropy struct task_struct *copy_process( > > #endif > > delayacct_tsk_free(p); > > bad_fork_cleanup_count: > > - dec_rlimit_counter(&init_user_ns, p->cred->euid, UCOUNT_RLIMIT_NPROC); > > + dec_rlimit_counter(task_rlimit_ns(p), task_euid(p), UCOUNT_RLIMIT_NPROC); > > exit_creds(p); > > bad_fork_free: > > p->state = TASK_DEAD; > > diff --git a/kernel/sys.c b/kernel/sys.c > > index db780ec32d86..917cbd7fc674 100644 > > --- a/kernel/sys.c > > +++ b/kernel/sys.c > > @@ -467,7 +467,7 @@ static int set_user(struct cred *new) > > if (!new_user) > > return -EAGAIN; > > > > - processes = get_rlimit_counter(&init_user_ns, new_user->uid, UCOUNT_RLIMIT_NPROC); > > + processes = get_rlimit_counter(new->rlimit_ns, new_user->uid, UCOUNT_RLIMIT_NPROC); > > > > /* > > * We don't fail in case of NPROC limit excess here because too many > > @@ -2529,6 +2529,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > > > > error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; > > break; > > + case PR_SET_RLIMIT_USER_NAMESPACE: > > + if (!capable(CAP_SYS_RESOURCE)) > > + return -EPERM; > Can you have CAP_SYS_RESOURCE in the init user ns while you're in a non-init > user ns? Shouldn't that only matter if you want to set yourself to > PR_RLIMIT_BIND_GLOBAL_USERNS anyways and you just want to make sure they have > CAP_SYS_RESOURCE in the current namespace for PR_RLIMIT_BIND_CURRENT_USERNS? I think I was wrong to add the CAP_SYS_RESOURCE check because the user still won't be able to exceed his limit. So since setrlimit does not require CAP_SYS_RESOURCE, then it is not needed here either. > > + > > + switch (arg2) { > > + case PR_RLIMIT_BIND_GLOBAL_USERNS: > > + error = set_rlimit_ns(&init_user_ns); > > + break; > > + case PR_RLIMIT_BIND_CURRENT_USERNS: > > + error = set_rlimit_ns(current_user_ns()); > > + break; > To some degree, this isn't so much "per user namespace rlimits", as much as it > is hierarchical rlimits. I'm not going to nitpick about names, but it might be > worth exploring. Especially because the way the patch is written, it would be > easy to introduce a third user namespace for rlimits with different mappings. I agree that the naming is a bit confusing but I couldn't think of a better one. This is probably because the discussion started with the topic "per userns rlimits". This change is more about the rlimit counter but not the rlimit itself. > The downside I see with a sysctl is that changing it midway through the user > namespaces lifetime could be confusing, and difficult to get right. sysctl requires procfs. I did prctl to avoid it. > > + default: > > + error = -EINVAL; > > + } > > + break; > > + case PR_GET_RLIMIT_USER_NAMESPACE: > > + error = current_rlimit_ns() == &init_user_ns > > + ? PR_RLIMIT_BIND_GLOBAL_USERNS > > + : PR_RLIMIT_BIND_CURRENT_USERNS; > > + break; > It would be nice to have this be a sysctl, so everyone who does > setns() would get the behaviour, and if a process wants to What's so tricky about using prctl? > > default: > > error = -EINVAL; > > break; > > diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c > > index 87804e0371fe..346df35ceba9 100644 > > --- a/kernel/user_namespace.c > > +++ b/kernel/user_namespace.c > > @@ -56,6 +56,8 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) > > #endif > > /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ > > cred->user_ns = user_ns; > > + > > + cred->rlimit_ns = &init_user_ns; > > } > > > > /* > > @@ -121,6 +123,7 @@ int create_user_ns(struct cred *new) > > for (i = 0; i < UCOUNT_COUNTS; i++) { > > ns->ucount_max[i] = INT_MAX; > > } > > + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); > > ns->ucounts = ucounts; > > > > /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ > > -- > > 2.25.4 > > > -- Rgrds, legion