Root in a user ns cannot be trusted to write a traditional security.capability xattr. If it were allowed to do so, then any unprivileged user on the host could map their own uid to root in a namespace, write the xattr, and execute the file with privilege on the host. This patch introduces v3 of the security.capability xattr. It builds a vfs_ns_cap_data struct by appending a uid_t rootid to struct vfs_cap_data. This is the absolute uid_t (i.e. the uid_t in init_user_ns) of the root id (uid 0 in a namespace) in whose namespaces the file capabilities may take effect. When a task in a user ns (which is privileged with CAP_SETFCAP toward that user_ns) asks to write a v2 security.capability xattr, the kernel will transparently rewrite the xattr as a v3 with the appropriate rootid. Subsequently, any task executing the file which has the noted kuid as its root uid, or which is in a descendant user_ns of such a user_ns, will run the file with capabilities. If a task writes a v3 security.capability xattr, then it can provide a uid (valid within its own user namespace, over which it has CAP_SETFCAP) for the xattr. The kernel will translate that to the absolute uid, and write that to disk. After this, a task in the writer's namespace will not be able to use those capabilities, but a task in a namespace where the given uid is root will. Only a single security.capability xattr may be written. A task may overwrite the existing one so long as it was written by a user mapped into the task's own user_ns, over which the task has CAP_SETFCAP. This allows a simple setxattr to work, allows tar/untar to work, and allows us to tar in one namespace and untar in another while preserving the capability, without risking leaking privilege into a parent namespace. 
Signed-off-by: Serge Hallyn <serge.hallyn@xxxxxxxxxx> --- fs/xattr.c | 18 ++- include/linux/capability.h | 5 +- include/linux/security.h | 2 + include/uapi/linux/capability.h | 22 +++- security/commoncap.c | 269 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 300 insertions(+), 16 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 4861322..d68139b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -94,11 +94,25 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, { struct inode *inode = dentry->d_inode; int error = -EOPNOTSUPP; + void *wvalue = NULL; + size_t wsize = 0; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); - if (issec) + if (issec) { inode->i_flags &= ~S_NOSEC; + /* if root in a non-init user_ns tries to set + * security.capability, write the virtualized + * xattr in its place */ + if (!strcmp(name, "security.capability") && + current_user_ns() != &init_user_ns) { + cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize); + if (!wvalue) + return -EPERM; + value = wvalue; + size = wsize; + } + } if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -114,10 +128,10 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, fsnotify_xattr(dentry); } + kfree(wvalue); return error; } - int vfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/capability.h b/include/linux/capability.h index 00690ff..0448670 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,7 +13,7 @@ #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> - +#include <linux/uidgid.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 @@ -240,4 +240,7 @@ extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, /* audit system wants to get cap info from files as well */ 
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, + size_t size, void **wvalue, size_t *wsize); + #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 157f0cb..4b35126 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -86,6 +86,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name, extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); +extern int cap_inode_getsecurity(struct inode *inode, const char *name, + void **buffer, bool alloc); extern int cap_mmap_addr(unsigned long addr); extern int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags); diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 12c37a1..a1b550c 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -62,9 +62,13 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 -#define VFS_CAP_U32 VFS_CAP_U32_2 -#define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#define VFS_CAP_REVISION_3 0x03000000 +#define VFS_CAP_U32_3 2 +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3)) + +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3 +#define VFS_CAP_U32 VFS_CAP_U32_3 +#define VFS_CAP_REVISION VFS_CAP_REVISION_3 struct vfs_cap_data { __le32 magic_etc; /* Little endian */ @@ -74,6 +78,18 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +/* + * same as vfs_cap_data but with a rootid at the end + */ +struct vfs_ns_cap_data { + __le32 magic_etc; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } 
data[VFS_CAP_U32]; + __le32 rootid; +}; + #ifndef __KERNEL__ /* diff --git a/security/commoncap.c b/security/commoncap.c index 48071ed..62c46aa 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -337,6 +337,235 @@ int cap_inode_killpriv(struct dentry *dentry) return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); } +static bool rootid_owns_currentns(uid_t root) +{ + kuid_t kroot; + struct user_namespace *ns; + + kroot = make_kuid(&init_user_ns, root); + for (ns = current_user_ns(); ; ns = ns->parent) { + if (from_kuid(ns, kroot) == 0) { + return true; + } + if (ns == &init_user_ns) + break; + } + + return false; +} + +/* + * getsecurity: We are called for security.* before any attempt to read the + * xattr from the inode itself. + * + * This gives us a chance to read the on-disk value and convert it. If we + * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. + * + * Note we are not called by vfs_getxattr_alloc(), but that is only called + * by the integrity subsystem, which really wants the unconverted values - + * so that's good. + */ +int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, + bool alloc) +{ + int size, ret; + kuid_t kroot; + uid_t root, mappedroot; + char *tmpbuf = NULL; + struct vfs_ns_cap_data *nscap; + struct dentry *dentry; + + if (!inode->i_op->getxattr) + return -EOPNOTSUPP; + + if (strcmp(name, "capability") != 0) + return -EOPNOTSUPP; + + dentry = d_find_alias(inode); + if (!dentry) + return -EINVAL; + + size = sizeof(struct vfs_ns_cap_data); + ret = vfs_getxattr_alloc(dentry, "security.capability", + &tmpbuf, size, GFP_NOFS); + + if (ret < 0) + return ret; + if (ret == sizeof(struct vfs_cap_data)) { + /* If this is sizeof(vfs_cap_data) then we're ok with the + * on-disk value, so return that. 
*/ + if (alloc) + *buffer = tmpbuf; + else + kfree(tmpbuf); + return ret; + } else if (ret != size) { + kfree(tmpbuf); + return -EINVAL; + } + + nscap = (struct vfs_ns_cap_data *) tmpbuf; + root = le32_to_cpu(nscap->rootid); + kroot = make_kuid(&init_user_ns, root); + + /* If the root kuid maps to a valid uid in current ns, then return + * this as a nscap. */ + mappedroot = from_kuid(current_user_ns(), kroot); + if (mappedroot != (uid_t)-1) { + if (alloc) { + *buffer = tmpbuf; + nscap->rootid = cpu_to_le32(mappedroot); + } else + kfree(tmpbuf); + return size; + } + + if (!rootid_owns_currentns(root)) { + kfree(tmpbuf); + return -EOPNOTSUPP; + } + + /* This comes from a parent namespace. Return as a v2 capability */ + size = sizeof(struct vfs_cap_data); + if (alloc) { + *buffer = kmalloc(size, GFP_ATOMIC); + if (*buffer) { + struct vfs_cap_data *cap = *buffer; + __le32 nsmagic, magic; + magic = VFS_CAP_REVISION_2; + nsmagic = le32_to_cpu(nscap->magic_etc); + if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) + magic |= VFS_CAP_FLAGS_EFFECTIVE; + memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + cap->magic_etc = cpu_to_le32(magic); + } + } + kfree(tmpbuf); + return size; +} + +/* + * Root can only overwite an existing security.capability xattr + * if it is privileged over the root listed in the xattr + * Note we've already checked for ns_capable(CAP_SETFCAP) in the + * !capable_wrt_inode_uidgid() call by the caller, so we do not + * check for that here. + */ +static bool refuse_fcap_overwrite(struct inode *inode) +{ + void *tmpbuf; + int ret; + uid_t root; + kuid_t kroot; + struct vfs_ns_cap_data *nscap; + __u32 magic_etc; + + ret = cap_inode_getsecurity(inode, "capability", &tmpbuf, true); + if (ret < 0) + return false; + if (ret == sizeof(struct vfs_cap_data)) { + /* + * host-root-installed capability, user-namespace-root may + * not overwrite this. + */ + kfree(tmpbuf); + return true; + } + if (ret < sizeof(struct vfs_ns_cap_data)) { + /* Corrupt fscap. 
Caller is privileged wrt inode, permit fixup */ + kfree(tmpbuf); + return false; + } + + nscap = (struct vfs_ns_cap_data *)tmpbuf; + + magic_etc = le32_to_cpu(nscap->magic_etc); + if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_3) { + /* + * This version is newer than we know about - i.e. from a newer + * kernel. Don't overwrite. + */ + kfree(tmpbuf); + return true; + } + if (ret != sizeof(struct vfs_ns_cap_data)) { + /* Corrupt v4 fscap. Permit fixup */ + kfree(tmpbuf); + return false; + } + root = le32_to_cpu(nscap->rootid); + kroot = make_kuid(&init_user_ns, root); + if (!uid_valid(kroot)) { + /* fscap owned by ancestor user_ns. refuse */ + kfree(tmpbuf); + return true; + } + + kfree(tmpbuf); + return false; +} + +static kuid_t rootid_from_xattr(const void *value, size_t size, + struct user_namespace *ns) +{ + const struct vfs_ns_cap_data *nscap = value; + uid_t rootid; + + if (size != XATTR_CAPS_SZ_3) + return make_kuid(ns, 0); + + rootid = le32_to_cpu(nscap->rootid); + return make_kuid(ns, rootid); +} + +/* + * Use requested a write of security.capability but is in a non-init + * userns. So we construct and write a v4. + * + * If all is ok, wvalue has an allocated new value. Otherwise, wvalue + * is NULL. 
+ */ +void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size, + void **wvalue, size_t *wsize) +{ + struct vfs_ns_cap_data *nscap; + const struct vfs_cap_data *cap = value; + __u32 magic, nsmagic; + struct user_namespace *ns = current_user_ns(); + struct inode *inode = d_backing_inode(dentry); + kuid_t rootid; + + if (!value) + return; + if (size != XATTR_CAPS_SZ_2 && size != XATTR_CAPS_SZ_3) + return; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) + return; + + /* refuse if security.capability exists */ + if (refuse_fcap_overwrite(inode)) + return; + + rootid = rootid_from_xattr(value, size, ns); + if (!uid_valid(rootid)) + return; + + *wsize = sizeof(struct vfs_ns_cap_data); + nscap = kmalloc(*wsize, GFP_ATOMIC); + if (!nscap) + return; + nscap->rootid = cpu_to_le32(from_kuid(&init_user_ns, rootid)); + nsmagic = VFS_CAP_REVISION_3; + magic = le32_to_cpu(cap->magic_etc); + if (magic & VFS_CAP_FLAGS_EFFECTIVE) + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; + nscap->magic_etc = cpu_to_le32(nsmagic); + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + + *wvalue = nscap; + return; +} + /* * Calculate the new process capability sets from the capability sets attached * to a file. 
@@ -390,25 +619,28 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data __u32 magic_etc; unsigned tocopy, i; int size; - struct vfs_cap_data caps; + struct vfs_ns_cap_data data, *nscaps = &data; + struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode || !inode->i_op->getxattr) return -ENODATA; - size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, &caps, - XATTR_CAPS_SZ); + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_CAPS, + &data, XATTR_CAPS_SZ); + if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; + if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); + cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: @@ -421,6 +653,15 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return -EINVAL; tocopy = VFS_CAP_U32_2; break; + case VFS_CAP_REVISION_3: + if (size != XATTR_CAPS_SZ_3) + return -EINVAL; + tocopy = VFS_CAP_U32_3; + + if (!rootid_owns_currentns(le32_to_cpu(nscaps->rootid))) + return -ENODATA; + break; + default: return -EINVAL; } @@ -428,8 +669,8 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; - cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); - cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); + cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); + cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; @@ -459,8 +700,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { 
if (rc == -EINVAL) - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", + bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; @@ -657,8 +898,11 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) + /* Note - we want to use Seth's newer code here instead */ + if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP)) return -EPERM; + /* for non-init userns we'll check permission later in + * cap_setxattr_make_nscap() */ return 0; } @@ -683,7 +927,11 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, int cap_inode_removexattr(struct dentry *dentry, const char *name) { if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) + /* Note - we want to use Seth's newer code here instead */ + struct inode *inode = d_backing_inode(dentry); + if (!inode) + return -EINVAL; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } @@ -1078,6 +1326,7 @@ struct security_hook_list capability_hooks[] = { LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), + LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), -- 2.7.4 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers