On 1/23/25 13:19, Christian Brauner wrote: > In container workloads idmapped mounts are often used as layers for > overlayfs. Recently I added the ability to specify layers in overlayfs > as file descriptors instead of path names. It should be possible to > simply use the detached mounts directly when specifying layers instead > of having to attach them beforehand. They are discarded after overlayfs > is mounted anyway so it's pointless system calls for userspace and > pointless locking for the kernel. > > This just recently come up again in [1]. So enable clone_private_mount() > to use detached mounts directly. Following conditions must be met: > > - Provided path must be the root of a detached mount tree. > - Provided path may not create mount namespace loops. > - Provided path must be mounted. > > It would be possible to be stricter and require that the caller must > have CAP_SYS_ADMIN in the owning user namespace of the anonymous mount > namespace but since this restriction isn't enforced for move_mount() > there's no point in enforcing it for clone_private_mount(). > > Link: https://lore.kernel.org/r/fd8f6574-f737-4743-b220-79c815ee1554@xxxxxxxxxxxx [1] > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > fs/namespace.c | 78 ++++++++++++++++++++++++++++---------------------- > 1 file changed, 43 insertions(+), 35 deletions(-) > > diff --git a/fs/namespace.c b/fs/namespace.c > index 4013fbac354a..3985a695d373 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -2287,6 +2287,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) > return false; > } > > +/* > + * Check that there aren't references to earlier/same mount namespaces in the > + * specified subtree. Such references can act as pins for mount namespaces > + * that aren't checked by the mount-cycle checking code, thereby allowing > + * cycles to be made. > + */ > +static bool check_for_nsfs_mounts(struct mount *subtree) > +{ > + struct mount *p; > + bool ret = false; > + > + lock_mount_hash(); > + for (p = subtree; p; p = next_mnt(p, subtree)) > + if (mnt_ns_loop(p->mnt.mnt_root)) > + goto out; > + > + ret = true; > +out: > + unlock_mount_hash(); > + return ret; > +} > + > /** > * clone_private_mount - create a private clone of a path > * @path: path to clone > @@ -2295,6 +2317,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) > * will not be attached anywhere in the namespace and will be private (i.e. > * changes to the originating mount won't be propagated into this). > * > + * This assumes caller has called or done the equivalent of may_mount(). > + * > * Release with mntput(). > */ > struct vfsmount *clone_private_mount(const struct path *path) > @@ -2302,30 +2326,36 @@ struct vfsmount *clone_private_mount(const struct path *path) > struct mount *old_mnt = real_mount(path->mnt); > struct mount *new_mnt; > > - down_read(&namespace_sem); > + scoped_guard(rwsem_read, &namespace_sem) > if (IS_MNT_UNBINDABLE(old_mnt)) > - goto invalid; > + return ERR_PTR(-EINVAL); > + > + if (mnt_has_parent(old_mnt)) { > + if (!check_mnt(old_mnt)) > + return ERR_PTR(-EINVAL); > + } else { > + /* Make sure this isn't something purely kernel internal. */ > + if (!is_anon_ns(old_mnt->mnt_ns)) > + return ERR_PTR(-EINVAL); > > - if (!check_mnt(old_mnt)) > - goto invalid; > + /* Make sure we don't create mount namespace loops. */ > + if (!check_for_nsfs_mounts(old_mnt)) > + return ERR_PTR(-EINVAL); > + > + if (!path_mounted(path)) > + return ERR_PTR(-EINVAL); > + } > > if (has_locked_children(old_mnt, path->dentry)) > - goto invalid; > + return ERR_PTR(-EINVAL); > > new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); > - up_read(&namespace_sem); > - > if (IS_ERR(new_mnt)) > - return ERR_CAST(new_mnt); > + return ERR_PTR(-EINVAL); > > /* Longterm mount to be removed by kern_unmount*() */ > new_mnt->mnt_ns = MNT_NS_INTERNAL; > - > return &new_mnt->mnt; > - > -invalid: > - up_read(&namespace_sem); > - return ERR_PTR(-EINVAL); > } > EXPORT_SYMBOL_GPL(clone_private_mount); > > @@ -3123,28 +3153,6 @@ static inline int tree_contains_unbindable(struct mount *mnt) > return 0; > } > > -/* > - * Check that there aren't references to earlier/same mount namespaces in the > - * specified subtree. Such references can act as pins for mount namespaces > - * that aren't checked by the mount-cycle checking code, thereby allowing > - * cycles to be made. > - */ > -static bool check_for_nsfs_mounts(struct mount *subtree) > -{ > - struct mount *p; > - bool ret = false; > - > - lock_mount_hash(); > - for (p = subtree; p; p = next_mnt(p, subtree)) > - if (mnt_ns_loop(p->mnt.mnt_root)) > - goto out; > - > - ret = true; > -out: > - unlock_mount_hash(); > - return ret; > -} > - > static int do_set_group(struct path *from_path, struct path *to_path) > { > struct mount *from, *to; Confirmed this works for the use case I'm interested in of directly passing idmapped mounts to lower layers. Tested-by: Mike Baynton <mike@xxxxxxxxxxxx>