"Serge E. Hallyn" <serge@xxxxxxxxxx> writes: > This allows setuid/setgid in containers. It also fixes some > corner cases where kernel logic foregoes capability checks when > uids are equivalent. The latter will need to be done throughout > the whole kernel. Except for the extra printk in sethostname this looks good. Acked-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > > Changelog: > Jan 11: Use nsown_capable() as suggested by Bastian Blank. > Jan 11: Fix logic errors in uid checks pointed out by Bastian. > Feb 15: allow prlimit to current (was regression in previous version) > > Signed-off-by: Serge E. Hallyn <serge.hallyn@xxxxxxxxxxxxx> > --- > kernel/sys.c | 74 ++++++++++++++++++++++++++++++++++++--------------------- > 1 files changed, 47 insertions(+), 27 deletions(-) > > diff --git a/kernel/sys.c b/kernel/sys.c > index 7a1bbad..075370d 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -118,17 +118,29 @@ EXPORT_SYMBOL(cad_pid); > > void (*pm_power_off_prepare)(void); > > +/* called with rcu_read_lock, creds are safe */ > +static inline int set_one_prio_perm(struct task_struct *p) > +{ > + const struct cred *cred = current_cred(), *pcred = __task_cred(p); > + > + if (pcred->user->user_ns == cred->user->user_ns && > + (pcred->uid == cred->euid || > + pcred->euid == cred->euid)) > + return 1; > + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) > + return 1; > + return 0; > +} > + > /* > * set the priority of a task > * - the caller must hold the RCU read lock > */ > static int set_one_prio(struct task_struct *p, int niceval, int error) > { > - const struct cred *cred = current_cred(), *pcred = __task_cred(p); > int no_nice; > > - if (pcred->uid != cred->euid && > - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { > + if (!set_one_prio_perm(p)) { > error = -EPERM; > goto out; > } > @@ -502,7 +514,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) > if (rgid != (gid_t) -1) { > if (old->gid == rgid || > old->egid == rgid || > - capable(CAP_SETGID)) > + nsown_capable(CAP_SETGID)) > new->gid = rgid; > else > goto error; > @@ -511,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) > if (old->gid == egid || > old->egid == egid || > old->sgid == egid || > - capable(CAP_SETGID)) > + nsown_capable(CAP_SETGID)) > new->egid = egid; > else > goto error; > @@ -546,7 +558,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) > old = current_cred(); > > retval = -EPERM; > - if (capable(CAP_SETGID)) > + if (nsown_capable(CAP_SETGID)) > new->gid = new->egid = new->sgid = new->fsgid = gid; > else if (gid == old->gid || gid == old->sgid) > new->egid = new->fsgid = gid; > @@ -613,7 +625,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) > new->uid = ruid; > if (old->uid != ruid && > old->euid != ruid && > - !capable(CAP_SETUID)) > + !nsown_capable(CAP_SETUID)) > goto error; > } > > @@ -622,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) > if (old->uid != euid && > old->euid != euid && > old->suid != euid && > - !capable(CAP_SETUID)) > + !nsown_capable(CAP_SETUID)) > goto error; > } > > @@ -670,7 +682,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) > old = current_cred(); > > retval = -EPERM; > - if (capable(CAP_SETUID)) { > + if (nsown_capable(CAP_SETUID)) { > new->suid = new->uid = uid; > if (uid != old->uid) { > retval = set_user(new); > @@ -712,7 +724,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) > old = current_cred(); > > retval = -EPERM; > - if (!capable(CAP_SETUID)) { > + if (!nsown_capable(CAP_SETUID)) { > if (ruid != (uid_t) -1 && ruid != old->uid && > ruid != old->euid && ruid != old->suid) > goto error; > @@ -776,7 +788,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) > old = current_cred(); > > retval = -EPERM; > - if (!capable(CAP_SETGID)) { > + if (!nsown_capable(CAP_SETGID)) { > if (rgid != (gid_t) -1 && rgid != old->gid && > rgid != old->egid && rgid != old->sgid) > goto error; > @@ -836,7 +848,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) > > if (uid == old->uid || uid == old->euid || > uid == old->suid || uid == old->fsuid || > - capable(CAP_SETUID)) { > + nsown_capable(CAP_SETUID)) { > if (uid != old_fsuid) { > new->fsuid = uid; > if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) > @@ -869,7 +881,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) > > if (gid == old->gid || gid == old->egid || > gid == old->sgid || gid == old->fsgid || > - capable(CAP_SETGID)) { > + nsown_capable(CAP_SETGID)) { > if (gid != old_fsgid) { > new->fsgid = gid; > goto change_okay; > @@ -1177,8 +1189,11 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) > int errno; > char tmp[__NEW_UTS_LEN]; > > - if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) > + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) { > + printk(KERN_NOTICE "%s: did not have CAP_SYS_ADMIN\n", __func__); > return -EPERM; > + } > + printk(KERN_NOTICE "%s: did have CAP_SYS_ADMIN\n", __func__); Ouch! This new print statement could be really annoying if an unprivileged user calls sethostname. Could you remove it? > if (len < 0 || len > __NEW_UTS_LEN) > return -EINVAL; > down_write(&uts_sem); > @@ -1226,7 +1241,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) > int errno; > char tmp[__NEW_UTS_LEN]; > > - if (!capable(CAP_SYS_ADMIN)) > + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) > return -EPERM; > if (len < 0 || len > __NEW_UTS_LEN) > return -EINVAL; > @@ -1341,6 +1356,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, > rlim = tsk->signal->rlim + resource; > task_lock(tsk->group_leader); > if (new_rlim) { > + /* Keep the capable check against init_user_ns until > + cgroups can contain all limits */ > if (new_rlim->rlim_max > rlim->rlim_max && > !capable(CAP_SYS_RESOURCE)) > retval = -EPERM; > @@ -1384,19 +1401,22 @@ static int check_prlimit_permission(struct task_struct *task) > { > const struct cred *cred = current_cred(), *tcred; > > - tcred = __task_cred(task); > - if (current != task && > - (cred->uid != tcred->euid || > - cred->uid != tcred->suid || > - cred->uid != tcred->uid || > - cred->gid != tcred->egid || > - cred->gid != tcred->sgid || > - cred->gid != tcred->gid) && > - !capable(CAP_SYS_RESOURCE)) { > - return -EPERM; > - } > + if (current == task) > + return 0; > > - return 0; > + tcred = __task_cred(task); > + if (cred->user->user_ns == tcred->user->user_ns && > + (cred->uid == tcred->euid && > + cred->uid == tcred->suid && > + cred->uid == tcred->uid && > + cred->gid == tcred->egid && > + cred->gid == tcred->sgid && > + cred->gid == tcred->gid)) > + return 0; > + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) > + return 0; > + > + return -EPERM; > } > > SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers