A common way for daemons to run with minimal privilege is to start as root, perhaps setuid-root, choose a desired capability set, set PR_SET_KEEPCAPS, then change uid to non-root. A simpler way to achieve this is to set file capabilities on a not-setuid-root binary. However, when installing a package inside a (user-namespaced) container, packages cannot be installed with file capabilities. For this reason, containers must install ping setuid-root. To achieve this, we would need for containers to be able to request file capabilities be added to a file without causing these to be honored in the initial user namespace. To this end, the patch below introduces a new capability xattr format. The main enhancement over the existing security.capability xattr is that we tag capability sets with a uid - the uid of the root user in the namespace where the capabilities are set. The capabilities will be ignored in any other namespace. The special case of uid == -1 (which must only ever be able to be set by kuid 0) means use the capabilities in all namespaces. An alternative format would use a pair of uids to indicate a range of rootids. This would allow root in a user namespace with uids 100000-165536 mapped to set the xattr once on a file, then launch nested containers wherein the file could be used with privilege. That's not what this patch does, but would be a trivial change if people think it would be worthwhile. This patch does not actually address the real problem, which is setting the xattrs from inside containers. For that, I think the best solution is to add a pair of new system calls, setfcap and getfcap. Userspace would for instance call fsetfcap(fd, cap_user_header_t, cap_user_data_t), to which the kernel would, if not in init_user_ns, react by writing an appropriate security.nscapability xattr. The libcap2 library's cap_set_file/cap_get_file could be switched over transparently to use this to hide its use from all callers. Comments appreciated. Note - In this patch, file capabilities only work for containers which have a root uid defined. We may want to allow -1 uids to work in all namespaces. There certainly would be uses for this, but I'm a bit unsettled about the implications of allowing a program privilege in a container where there is no uid with privilege. This needs more thought. Signed-off-by: Serge Hallyn <serge.hallyn@xxxxxxxxxx> --- include/linux/capability.h | 15 +++++ include/uapi/linux/capability.h | 47 ++++++++++++++ include/uapi/linux/xattr.h | 3 + security/commoncap.c | 135 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 194 insertions(+), 6 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index af9f0b9..24ac18e 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,6 +13,7 @@ #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> +#include <linux/uidgid.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -31,6 +32,20 @@ struct cpu_vfs_cap_data { kernel_cap_t inheritable; }; +struct cpu_vfs_ns_cap_data { + __u32 flags; + kuid_t rootid; + kernel_cap_t permitted; + kernel_cap_t inheritable; +}; + +struct cpu_vfs_ns_cap_header { + __u32 hdr_info; + struct cpu_vfs_ns_cap_data caps[0]; +}; +#define NS_CAPS_VERSION(x) (x & 0xFF) +#define NS_CAPS_NCAPS(x) ( (x >> 8) & 0xFF ) + #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 12c37a1..2211a33 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -62,10 +62,14 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) +/* version number for security.nscapability xattrs hdr->hdr_info */ +#define VFS_NS_CAP_REVISION 1 + #define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 #define VFS_CAP_U32 VFS_CAP_U32_2 #define VFS_CAP_REVISION VFS_CAP_REVISION_2 + struct vfs_cap_data { __le32 magic_etc; /* Little endian */ struct { @@ -74,6 +78,49 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +/* + * Q: do we want version in the header, or in the data? + * If it is in the header, then a container will need to + * make sure it is writing the same data. + * + * Actually, perhaps we simply do not support writing the + * xattr, we just use a new system call to get/set the fscap. + * The kernel can be in charge of watching the version numbers. + * After all, we can't allow the container to override the + * fscaps of the init ns. + * + * @flags currently only containers the effective bit. The + * other bits are reserved, and must be 0 at the moment. + * @rootid contains the kuid value of the root in the namespace + * for which this capability should be used. If -1, then this + * works for all namespaces. Only root in the initial ns can + * use this. + * + * Q: do we want to use a range instead? Then root in a container + * could allow one binary with one capability to be used by any + * nested containers. + */ +#define VFS_NS_CAP_EFFECTIVE 0x1 +struct vfs_ns_cap_data { + __le32 flags; + __le32 rootid; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; +}; + +/* + * 32-bit hdr_info contains + * 16 leftmost: reserved + * next 8: ncaps + * last 8: version + */ +struct vfs_ns_cap_header { + __le32 hdr_info; + /* ncaps * vfs_ns_cap_data */ +}; + #ifndef __KERNEL__ /* diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 1590c49..67c80ab 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -68,6 +68,9 @@ #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX +#define XATTR_NS_CAPS_SUFFIX "nscapability" +#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX + #define XATTR_POSIX_ACL_ACCESS "posix_acl_access" #define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default" diff --git a/security/commoncap.c b/security/commoncap.c index 1832cf7..c44edf3 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry) if (!inode->i_op->getxattr) return 0; + error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0); + if (error > 0) + return 1; + error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0); if (error <= 0) return 0; @@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry) int cap_inode_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); + int ret1, ret2;; if (!inode->i_op->removexattr) return 0; - return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); + ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); + ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS); + + if (ret1 != 0) + return ret1; + return ret2; } /* @@ -433,6 +443,117 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return 0; } +int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) +{ + struct inode *inode = d_backing_inode(dentry); + unsigned tocopy, i; + int ret = 0, size, expected; + unsigned len = 0; + struct vfs_ns_cap_header *hdr; + struct vfs_ns_cap_data *cap, *nscap = NULL; + __u16 ncaps, version; + __u32 hdr_info; + kuid_t current_root, caprootuid; + + memset(cpu_caps, 0, sizeof(*cpu_caps)); + + if (!inode || !inode->i_op->getxattr) + return -ENODATA; + + /* get the size */ + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, + NULL, 0); + if (size == -ENODATA || size == -EOPNOTSUPP) + /* no data, that's ok */ + return -ENODATA; + if (size < 0) + return size; + if (size < sizeof(struct cpu_vfs_ns_cap_header)) + return -EINVAL; + if (size > sizeof(struct cpu_vfs_ns_cap_header) + 255 * sizeof(struct vfs_ns_cap_data)) + return -EINVAL; + len = size; + + hdr = kmalloc(len + 1, GFP_NOFS); + if (!hdr) + return -ENOMEM; + + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr, + len); + if (size < 0) { + ret = size; + goto out; + } + + if (size != len) { + ret = -EINVAL; + goto out; + } + + hdr_info = le32_to_cpu(hdr->hdr_info); + version = NS_CAPS_VERSION(hdr_info); + ncaps = NS_CAPS_NCAPS(hdr_info); + + if (version != VFS_NS_CAP_REVISION) { + ret = -EINVAL; + goto out; + } + + expected = sizeof(*hdr) + ncaps * sizeof(*cap); + if (size != expected) { + ret = -EINVAL; + goto out; + } + tocopy = VFS_CAP_U32; + + /* find an applicable entry */ + /* a global entry (uid == -1) takes precedence */ + current_root = make_kuid(current_user_ns(), 0); + if (!uid_valid(current_root)) { + /* no root user in this namespace; no capabilities */ + ret = -EINVAL; + goto out; + } + + for (i = 0, cap = (void *) hdr + sizeof(*hdr); i < ncaps; cap += sizeof(*cap), i++) { + uid_t uid = le32_to_cpu(cap->rootid); + if (uid == -1) { + nscap = cap; + break; + } + + caprootuid = make_kuid(&init_user_ns, uid); + if (uid_eq(caprootuid, current_root)) + nscap = cap; + } + + if (!nscap) { + /* nothing found for this namespace */ + ret = -ENODATA; + goto out; + } + + /* copy the entry */ + CAP_FOR_EACH_U32(i) { + if (i >= tocopy) + break; + cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted); + cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable); + } + + cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + + cpu_caps->magic_etc = VFS_CAP_REVISION_2; + if (nscap->flags & VFS_NS_CAP_EFFECTIVE) + cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE; + +out: + kfree(hdr); + + return ret; +} + /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being @@ -451,11 +572,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) return 0; - rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + if (rc == -ENODATA) + rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); + printk(KERN_NOTICE "Got EINVAL reading file caps for %s\n", + bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; @@ -651,7 +774,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (!strcmp(name, XATTR_NAME_CAPS)) { + if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) { if (!capable(CAP_SETFCAP)) return -EPERM; return 0; @@ -677,7 +800,7 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { - if (!strcmp(name, XATTR_NAME_CAPS)) { + if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) { if (!capable(CAP_SETFCAP)) return -EPERM; return 0; -- 2.5.0 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html