On Mon, Feb 29, 2016 at 03:38:20PM -0600, Serge E. Hallyn wrote: > On Fri, Jan 29, 2016 at 01:31:51AM -0600, Serge E. Hallyn wrote: > > On Wed, Jan 27, 2016 at 04:36:02PM -0800, Andy Lutomirski wrote: > > > On Wed, Jan 27, 2016 at 9:22 AM, Jann Horn <jann@xxxxxxxxx> wrote: > > > > I think it sounds good from a security perspective. > > > > > > I'm a bit late to the game, but I have a question: why should this be > > > keyed to the *root* uid of the namespace in particular? Certainly if > > > user foo trusts the cap bits on some file, then user foo might trust > > > those caps to be exerted over any namespace that user foo owns, since > > > user foo owns the namespace. > > > > ... Tying it to a kuid which represents the userns->owner of any > > namespace in which the capability will be honored might be fine > > with me. Is that what you mean? So if uid 1000 creates a userns > > mapping uids 100000-200000, and 100000 in that container puts X=pe > > on /bin/foo, uid 101000 in that container runs /bin/foo with privilege > > X. Uid 101000 in someone else's container does not. > > > > Although, if I create two containers and provide them different > > uidmaps, it may well be because I want them segragated and want > > to minimize the changes of one container breaking out into the > > other. This risks breaking that. > > Thinking differently now... I really want it to "just work" to tar > and untar these. So I'm thinking of simply using the file owner > as the uid. So to write a security.ns_capability xattr, you must > be uid 0 in the inode's namespace, the file must be owned by uid 0, > and the capabilities in the xattr will be honored for any namespace > where in that uid_t 0 is root. > > Does that sound overly restrictive? I expect file capabilities to > be used on files owned by root but not setuid-root, so I think it > is ok. > > -serge Here is a working first draft: >From 019ff81124b7dd3161414720f5666f6793a8ccd9 Mon Sep 17 00:00:00 2001 From: Serge Hallyn <serge.hallyn@xxxxxxxxxx> Date: Tue, 1 Mar 2016 00:09:35 +0000 Subject: [PATCH 1/1] simplified security.nscapability xattr This can only be set by root in his own namespace, and will only be respected by namespaces with that same root kuid mapped as root. The file must be owned by the root user in the container. This allows us to avoid having to store a 'root user' value in the capability. This allows a simple setxattr to work, allows tar/untar to work, and allows us to tar in one namespace and untar in another while preserving the capability, without risking leaking privilege into a parent namespace. Signed-off-by: Serge Hallyn <serge.hallyn@xxxxxxxxxx> --- include/linux/capability.h | 5 ++- include/uapi/linux/capability.h | 18 +++++++++ include/uapi/linux/xattr.h | 3 ++ security/commoncap.c | 81 +++++++++++++++++++++++++++++++++++++++-- 4 files changed, 102 insertions(+), 5 deletions(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index af9f0b9..19a37a9 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,7 +13,7 @@ #define _LINUX_CAPABILITY_H #include <uapi/linux/capability.h> - +#include <linux/uidgid.h> #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 #define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3 @@ -31,6 +31,9 @@ struct cpu_vfs_cap_data { kernel_cap_t inheritable; }; +#define NS_CAPS_VERSION(x) (x & 0xFF) +#define NS_CAPS_FLAGS(x) ((x >> 8) & 0xFF) + #define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct)) #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 12c37a1..f0b4a66 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -62,6 +62,9 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) +/* version number for security.nscapability xattrs hdr->hdr_info */ +#define VFS_NS_CAP_REVISION 1 + #define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 #define VFS_CAP_U32 VFS_CAP_U32_2 #define VFS_CAP_REVISION VFS_CAP_REVISION_2 @@ -74,6 +77,21 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +#define VFS_NS_CAP_EFFECTIVE 0x1 +/* + * 32-bit hdr_info contains + * 16 leftmost: reserved + * next 8: flags (only VFS_NS_CAP_EFFECTIVE so far) + * last 8: version + */ +struct vfs_ns_cap_data { + __le32 magic_etc; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; +}; + #ifndef __KERNEL__ /* diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h index 1590c49..67c80ab 100644 --- a/include/uapi/linux/xattr.h +++ b/include/uapi/linux/xattr.h @@ -68,6 +68,9 @@ #define XATTR_CAPS_SUFFIX "capability" #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX +#define XATTR_NS_CAPS_SUFFIX "nscapability" +#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX + #define XATTR_POSIX_ACL_ACCESS "posix_acl_access" #define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS #define XATTR_POSIX_ACL_DEFAULT "posix_acl_default" diff --git a/security/commoncap.c b/security/commoncap.c index 6f093f3..735d4c7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -308,6 +308,10 @@ int cap_inode_need_killpriv(struct dentry *dentry) if (!inode->i_op->getxattr) return 0; + error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0); + if (error > 0) + return 1; + error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0); if (error <= 0) return 0; @@ -325,11 +329,17 @@ int cap_inode_need_killpriv(struct dentry *dentry) int cap_inode_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); + int ret1, ret2; if (!inode->i_op->removexattr) return 0; - return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); + ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS); + ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS); + + if (ret1 != 0) + return ret1; + return ret2; } /* @@ -433,6 +443,55 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return 0; } +int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) +{ + struct inode *inode = d_backing_inode(dentry); + unsigned tocopy, i; + u32 magic_etc; + ssize_t size; + struct vfs_ns_cap_data nscap; + + memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); + + if (!inode || !inode->i_op->getxattr) + return -ENODATA; + + /* verify that userns root owns this file */ + if (from_kuid(current_user_ns(), dentry->d_inode->i_uid) != 0) + return -ENODATA; + + size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, + &nscap, sizeof(nscap)); + if (size == -ENODATA || size == -EOPNOTSUPP) + /* no data, that's ok */ + return -ENODATA; + if (size < 0) + return size; + if (size != sizeof(nscap)) + return -EINVAL; + + magic_etc = le32_to_cpu(nscap.magic_etc); + + if (NS_CAPS_VERSION(magic_etc) != VFS_NS_CAP_REVISION) + return -EINVAL; + + cpu_caps->magic_etc = VFS_CAP_REVISION_2; + if (NS_CAPS_FLAGS(magic_etc) & VFS_NS_CAP_EFFECTIVE) + cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE; + /* copy the entry */ + CAP_FOR_EACH_U32(i) { + if (i >= VFS_CAP_U32_2) + break; + cpu_caps->permitted.cap[i] = le32_to_cpu(nscap.data[i].permitted); + cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap.data[i].inheritable); + } + + cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; + + return 0; +} + /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being @@ -453,11 +512,13 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c if (!current_in_userns(bprm->file->f_path.mnt->mnt_sb->s_user_ns)) return 0; - rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps); + if (rc == -ENODATA) + rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", + bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; @@ -661,6 +722,12 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, return 0; } + if (!strcmp(name, XATTR_NAME_NS_CAPS)) { + if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP)) + return -EPERM; + return 0; + } + if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && !ns_capable(user_ns, CAP_SYS_ADMIN)) @@ -689,6 +756,12 @@ int cap_inode_removexattr(struct dentry *dentry, const char *name) return 0; } + if (!strcmp(name, XATTR_NAME_NS_CAPS)) { + if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP)) + return -EPERM; + return 0; + } + if (!strncmp(name, XATTR_SECURITY_PREFIX, sizeof(XATTR_SECURITY_PREFIX) - 1) && !ns_capable(user_ns, CAP_SYS_ADMIN)) -- 2.7.0 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html