On Wed, 2024-02-21 at 15:24 -0600, Seth Forshee (DigitalOcean) wrote: > To pass around vfs_caps instead of raw xattr data we will need to > convert between the two representations near userspace and disk > boundaries. We already convert xattrs from disks to vfs_caps, so move > that code into a helper, and change get_vfs_caps_from_disk() to use the > helper. > > When converting vfs_caps to xattrs we have different considerations > depending on the destination of the xattr data. For xattrs which will be > written to disk we need to reject the xattr if the rootid does not map > into the filesystem's user namespace, whereas xattrs read by userspace > may need to undergo a conversion from v3 to v2 format when the rootid > does not map. So this helper is split into an internal and an external > interface. The internal interface does not return an error if the rootid > has no mapping in the target user namespace and will be used for > conversions targeting userspace. The external interface returns > EOVERFLOW if the rootid has no mapping and will be used for all other > conversions. > > Signed-off-by: Seth Forshee (DigitalOcean) <sforshee@xxxxxxxxxx> > --- > include/linux/capability.h | 10 ++ > security/commoncap.c | 228 +++++++++++++++++++++++++++++++++++---------- > 2 files changed, 187 insertions(+), 51 deletions(-) > > diff --git a/include/linux/capability.h b/include/linux/capability.h > index eb46d346bbbc..a0893ac4664b 100644 > --- a/include/linux/capability.h > +++ b/include/linux/capability.h > @@ -209,6 +209,16 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) > ns_capable(ns, CAP_SYS_ADMIN); > } > > +/* helpers to convert between xattr and in-kernel representations */ > +int vfs_caps_from_xattr(struct mnt_idmap *idmap, > + struct user_namespace *src_userns, > + struct vfs_caps *vfs_caps, > + const void *data, size_t size); > +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap, > + struct user_namespace *dest_userns, > + const struct vfs_caps *vfs_caps, > + void *data, size_t size); > + > /* audit system wants to get cap info from files as well */ > int get_vfs_caps_from_disk(struct mnt_idmap *idmap, > const struct dentry *dentry, > diff --git a/security/commoncap.c b/security/commoncap.c > index a0b5c9740759..7531c9634997 100644 > --- a/security/commoncap.c > +++ b/security/commoncap.c > @@ -619,54 +619,41 @@ static inline int bprm_caps_from_vfs_caps(struct vfs_caps *caps, > } > > /** > - * get_vfs_caps_from_disk - retrieve vfs caps from disk > + * vfs_caps_from_xattr - convert raw caps xattr data to vfs_caps > * > - * @idmap: idmap of the mount the inode was found from > - * @dentry: dentry from which @inode is retrieved > - * @cpu_caps: vfs capabilities > + * @idmap: idmap of the mount the inode was found from > + * @src_userns: user namespace for ids in xattr data > + * @vfs_caps: destination buffer for vfs_caps data > + * @data: rax xattr caps data > + * @size: size of xattr data > * > - * Extract the on-exec-apply capability sets for an executable file. > + * Converts a raw security.capability xattr into the kernel-internal > + * capabilities format. > * > - * If the inode has been found through an idmapped mount the idmap of > - * the vfsmount must be passed through @idmap. This function will then > - * take care to map the inode according to @idmap before checking > - * permissions. On non-idmapped mounts or if permission checking is to be > - * performed on the raw inode simply pass @nop_mnt_idmap. > + * If the xattr is being read or written through an idmapped mount the > + * idmap of the vfsmount must be passed through @idmap. This function > + * will then take care to map the rootid according to @idmap. > + * > + * Return: On success, return 0; on error, return < 0. > */ > -int get_vfs_caps_from_disk(struct mnt_idmap *idmap, > - const struct dentry *dentry, > - struct vfs_caps *cpu_caps) > +int vfs_caps_from_xattr(struct mnt_idmap *idmap, > + struct user_namespace *src_userns, > + struct vfs_caps *vfs_caps, > + const void *data, size_t size) > { > - struct inode *inode = d_backing_inode(dentry); > __u32 magic_etc; > - int size; > - struct vfs_ns_cap_data data, *nscaps = &data; > - struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; > + const struct vfs_ns_cap_data *ns_caps = data; > + struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps; > kuid_t rootkuid; > - vfsuid_t rootvfsuid; > - struct user_namespace *fs_ns; > - > - memset(cpu_caps, 0, sizeof(struct vfs_caps)); > - > - if (!inode) > - return -ENODATA; > > - fs_ns = inode->i_sb->s_user_ns; > - size = __vfs_getxattr((struct dentry *)dentry, inode, > - XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); > - if (size == -ENODATA || size == -EOPNOTSUPP) > - /* no data, that's ok */ > - return -ENODATA; > - > - if (size < 0) > - return size; > + memset(vfs_caps, 0, sizeof(*vfs_caps)); > > if (size < sizeof(magic_etc)) > return -EINVAL; > > - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); > + vfs_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); > > - rootkuid = make_kuid(fs_ns, 0); > + rootkuid = make_kuid(src_userns, 0); > switch (magic_etc & VFS_CAP_REVISION_MASK) { > case VFS_CAP_REVISION_1: > if (size != XATTR_CAPS_SZ_1) > @@ -679,39 +666,178 @@ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, > case VFS_CAP_REVISION_3: > if (size != XATTR_CAPS_SZ_3) > return -EINVAL; > - rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); > + rootkuid = make_kuid(src_userns, le32_to_cpu(ns_caps->rootid)); > break; > > default: > return -EINVAL; > } > > - rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid); > - if (!vfsuid_valid(rootvfsuid)) > - return -ENODATA; > + vfs_caps->rootid = make_vfsuid(idmap, src_userns, rootkuid); > + if (!vfsuid_valid(vfs_caps->rootid)) > + return -EOVERFLOW; > > - /* Limit the caps to the mounter of the filesystem > - * or the more limited uid specified in the xattr. > + vfs_caps->permitted.val = le32_to_cpu(caps->data[0].permitted); > + vfs_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable); > + > + /* > + * Rev1 had just a single 32-bit word, later expanded > + * to a second one for the high bits > */ > - if (!rootid_owns_currentns(rootvfsuid)) > - return -ENODATA; > + if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { > + vfs_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32; > + vfs_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32; > + } > + > + vfs_caps->permitted.val &= CAP_VALID_MASK; > + vfs_caps->inheritable.val &= CAP_VALID_MASK; > + > + return 0; > +} > + > +/* > + * Inner implementation of vfs_caps_to_xattr() which does not return an > + * error if the rootid does not map into @dest_userns. > + */ > +static ssize_t __vfs_caps_to_xattr(struct mnt_idmap *idmap, > + struct user_namespace *dest_userns, > + const struct vfs_caps *vfs_caps, > + void *data, size_t size) > +{ > + struct vfs_ns_cap_data *ns_caps = data; > + struct vfs_cap_data *caps = (struct vfs_cap_data *)ns_caps; > + kuid_t rootkuid; > + uid_t rootid; > + > + memset(ns_caps, 0, size); size -> sizeof(*ns_caps) (or an equivalent change) I was zeroing more (the size of the buffer passed to vfs_getxattr()). Roberto > + > + rootid = 0; > + switch (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) { > + case VFS_CAP_REVISION_1: > + if (size < XATTR_CAPS_SZ_1) > + return -EINVAL; > + size = XATTR_CAPS_SZ_1; > + break; > + case VFS_CAP_REVISION_2: > + if (size < XATTR_CAPS_SZ_2) > + return -EINVAL; > + size = XATTR_CAPS_SZ_2; > + break; > + case VFS_CAP_REVISION_3: > + if (size < XATTR_CAPS_SZ_3) > + return -EINVAL; > + size = XATTR_CAPS_SZ_3; > + rootkuid = from_vfsuid(idmap, dest_userns, vfs_caps->rootid); > + rootid = from_kuid(dest_userns, rootkuid); > + ns_caps->rootid = cpu_to_le32(rootid); > + break; > > - cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted); > - cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable); > + default: > + return -EINVAL; > + } > + > + caps->magic_etc = cpu_to_le32(vfs_caps->magic_etc); > + > + caps->data[0].permitted = cpu_to_le32(lower_32_bits(vfs_caps->permitted.val)); > + caps->data[0].inheritable = cpu_to_le32(lower_32_bits(vfs_caps->inheritable.val)); > > /* > * Rev1 had just a single 32-bit word, later expanded > * to a second one for the high bits > */ > - if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { > - cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32; > - cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32; > + if ((vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) { > + caps->data[1].permitted = > + cpu_to_le32(upper_32_bits(vfs_caps->permitted.val)); > + caps->data[1].inheritable = > + cpu_to_le32(upper_32_bits(vfs_caps->inheritable.val)); > } > > - cpu_caps->permitted.val &= CAP_VALID_MASK; > - cpu_caps->inheritable.val &= CAP_VALID_MASK; > + return size; > +} > + > + > +/** > + * vfs_caps_to_xattr - convert vfs_caps to raw caps xattr data > + * > + * @idmap: idmap of the mount the inode was found from > + * @dest_userns: user namespace for ids in xattr data > + * @vfs_caps: source vfs_caps data > + * @data: destination buffer for rax xattr caps data > + * @size: size of the @data buffer > + * > + * Converts a kernel-internal capability into the raw security.capability > + * xattr format. > + * > + * If the xattr is being read or written through an idmapped mount the > + * idmap of the vfsmount must be passed through @idmap. This function > + * will then take care to map the rootid according to @idmap. > + * > + * Return: On success, return the size of the xattr data. On error, > + * return < 0. > + */ > +ssize_t vfs_caps_to_xattr(struct mnt_idmap *idmap, > + struct user_namespace *dest_userns, > + const struct vfs_caps *vfs_caps, > + void *data, size_t size) > +{ > + struct vfs_ns_cap_data *caps = data; > + int ret; > + > + ret = __vfs_caps_to_xattr(idmap, dest_userns, vfs_caps, data, size); > + if (ret > 0 && > + (vfs_caps->magic_etc & VFS_CAP_REVISION_MASK) == VFS_CAP_REVISION_3 && > + le32_to_cpu(caps->rootid) == (uid_t)-1) > + return -EOVERFLOW; > + return ret; > +} > + > +/** > + * get_vfs_caps_from_disk - retrieve vfs caps from disk > + * > + * @idmap: idmap of the mount the inode was found from > + * @dentry: dentry from which @inode is retrieved > + * @cpu_caps: vfs capabilities > + * > + * Extract the on-exec-apply capability sets for an executable file. > + * > + * If the inode has been found through an idmapped mount the idmap of > + * the vfsmount must be passed through @idmap. This function will then > + * take care to map the inode according to @idmap before checking > + * permissions. On non-idmapped mounts or if permission checking is to be > + * performed on the raw inode simply pass @nop_mnt_idmap. > + */ > +int get_vfs_caps_from_disk(struct mnt_idmap *idmap, > + const struct dentry *dentry, > + struct vfs_caps *cpu_caps) > +{ > + struct inode *inode = d_backing_inode(dentry); > + int size, ret; > + struct vfs_ns_cap_data data, *nscaps = &data; > + > + if (!inode) > + return -ENODATA; > > - cpu_caps->rootid = rootvfsuid; > + size = __vfs_getxattr((struct dentry *)dentry, inode, > + XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); > + if (size == -ENODATA || size == -EOPNOTSUPP) > + /* no data, that's ok */ > + return -ENODATA; > + > + if (size < 0) > + return size; > + > + ret = vfs_caps_from_xattr(idmap, inode->i_sb->s_user_ns, > + cpu_caps, nscaps, size); > + if (ret == -EOVERFLOW) > + return -ENODATA; > + if (ret) > + return ret; > + > + /* Limit the caps to the mounter of the filesystem > + * or the more limited uid specified in the xattr. > + */ > + if (!rootid_owns_currentns(cpu_caps->rootid)) > + return -ENODATA; > > return 0; > } >