Re: [PATCH RFC 0/5] nsfs: iterate through mount namespaces

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, 2024-07-19 at 13:41 +0200, Christian Brauner wrote:
> Hey,
> 
> Recently, we added the ability to list mounts in other mount namespaces
> and the ability to retrieve namespace file descriptors without having to
> go through procfs by deriving them from pidfds.
> 
> This extends nsfs in two ways:
> 
> (1) Add the ability to retrieve information about a mount namespace via
>     NS_MNT_GET_INFO. This will return the mount namespace id and the
>     number of mounts currently in the mount namespace. The number of
>     mounts can be used to size the buffer that needs to be used for
>     listmount() and is in general useful without having to actually
>     iterate through all the mounts.
> 
>     The structure is extensible.
> 
> (2) Add the ability to iterate through all mount namespaces over which
>     the caller holds privilege, returning the file descriptor for the
>     next or previous mount namespace.
> 
>     To retrieve a mount namespace the caller must be privileged with
>     respect to its owning user namespace. This means that PID 1 on the
>     host can list all mounts in all mount namespaces or that a container
>     can list all mounts of its nested containers.
> 
>     Optionally pass a structure for NS_MNT_GET_INFO with
>     NS_MNT_GET_{PREV,NEXT} to retrieve information about the mount
>     namespace in one go.
> 
> (1) and (2) can be implemented for other namespace types easily.
> 
> Together with recent API additions this means one can iterate through
> all mounts in all mount namespaces without ever touching procfs. Here's
> a sample program list_all_mounts_everywhere.c:
> 
>   // SPDX-License-Identifier: GPL-2.0-or-later
> 
>   #define _GNU_SOURCE
>   #include <asm/unistd.h>
>   #include <assert.h>
>   #include <errno.h>
>   #include <fcntl.h>
>   #include <getopt.h>
>   #include <linux/stat.h>
>   #include <sched.h>
>   #include <stddef.h>
>   #include <stdint.h>
>   #include <stdio.h>
>   #include <stdlib.h>
>   #include <string.h>
>   #include <sys/ioctl.h>
>   #include <sys/param.h>
>   #include <sys/pidfd.h>
>   #include <sys/stat.h>
>   #include <sys/statfs.h>
> 
>   #define die_errno(format,
> ...)                                             \
>   	do
> {                                                               \
>   		fprintf(stderr, "%m | %s: %d: %s: " format "\n",
> __FILE__, \
>   			__LINE__, __func__,
> ##__VA_ARGS__);                \
>   		exit(EXIT_FAILURE);                                 
>        \
>   	} while (0)
> 
>   /* Get the id for a mount namespace */
>   #define NS_GET_MNTNS_ID		_IO(0xb7, 0x5)
>   /* Get next mount namespace. */
> 
>   struct mnt_ns_info {
>   	__u32 size;
>   	__u32 nr_mounts;
>   	__u64 mnt_ns_id;
>   };
> 
>   #define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct
> */
> 
>   /* Get information about namespace. */
>   #define NS_MNT_GET_INFO		_IOR(0xb7, 10, struct
> mnt_ns_info)
>   /* Get next namespace. */
>   #define NS_MNT_GET_NEXT		_IOR(0xb7, 11, struct
> mnt_ns_info)
>   /* Get previous namespace. */
>   #define NS_MNT_GET_PREV		_IOR(0xb7, 12, struct
> mnt_ns_info)
> 
>   #define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
> 
>   #define STATX_MNT_ID_UNIQUE	0x00004000U	/* Want/got extended
> stx_mount_id */
> 
>   #define __NR_listmount 458
>   #define __NR_statmount 457
> 
>   /*
>    * @mask bits for statmount(2)
>    */
>   #define STATMOUNT_SB_BASIC		0x00000001U     /* Want/got
> sb_... */
>   #define STATMOUNT_MNT_BASIC		0x00000002U	/* Want/got
> mnt_... */
>   #define STATMOUNT_PROPAGATE_FROM	0x00000004U	/* Want/got
> propagate_from */
>   #define STATMOUNT_MNT_ROOT		0x00000008U	/* Want/got
> mnt_root  */
>   #define STATMOUNT_MNT_POINT		0x00000010U	/* Want/got
> mnt_point */
>   #define STATMOUNT_FS_TYPE		0x00000020U	/* Want/got
> fs_type */
>   #define STATMOUNT_MNT_NS_ID             0x00000040U     /* Want/got
> mnt_ns_id */
>   #define STATMOUNT_MNT_OPTS              0x00000080U     /* Want/got
> mnt_opts */
> 
>   struct statmount {
>   	__u32 size;		/* Total size, including strings */
>   	__u32 mnt_opts;
>   	__u64 mask;		/* What results were written */
>   	__u32 sb_dev_major;	/* Device ID */
>   	__u32 sb_dev_minor;
>   	__u64 sb_magic;		/* ..._SUPER_MAGIC */
>   	__u32 sb_flags;		/*
> SB_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
>   	__u32 fs_type;		/* [str] Filesystem type */
>   	__u64 mnt_id;		/* Unique ID of mount */
>   	__u64 mnt_parent_id;	/* Unique ID of parent (for root ==
> mnt_id) */
>   	__u32 mnt_id_old;	/* Reused IDs used in
> proc/.../mountinfo */
>   	__u32 mnt_parent_id_old;
>   	__u64 mnt_attr;		/* MOUNT_ATTR_... */
>   	__u64 mnt_propagation;	/*
> MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
>   	__u64 mnt_peer_group;	/* ID of shared peer group */
>   	__u64 mnt_master;	/* Mount receives propagation from
> this ID */
>   	__u64 propagate_from;	/* Propagation from in current
> namespace */
>   	__u32 mnt_root;		/* [str] Root of mount
> relative to root of fs */
>   	__u32 mnt_point;	/* [str] Mountpoint relative to
> current root */
>   	__u64 mnt_ns_id;
>   	__u64 __spare2[49];
>   	char str[];		/* Variable size part containing
> strings */
>   };
> 
>   struct mnt_id_req {
>   	__u32 size;
>   	__u32 spare;
>   	__u64 mnt_id;
>   	__u64 param;
>   	__u64 mnt_ns_id;
>   };
> 
>   #define MNT_ID_REQ_SIZE_VER1	32 /* sizeof second published struct
> */
> 
>   #define LSMT_ROOT		0xffffffffffffffff	/* root
> mount */
> 
>   static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
>   		       struct statmount *stmnt, size_t bufsize,
> unsigned int flags)
>   {
>   	struct mnt_id_req req = {
>   		.size = MNT_ID_REQ_SIZE_VER1,
>   		.mnt_id = mnt_id,
>   		.param = mask,
>   		.mnt_ns_id = mnt_ns_id,
>   	};
> 
>   	return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
>   }
> 
>   static struct statmount *sys_statmount(__u64 mnt_id, __u64
> mnt_ns_id,
>   				       __u64 mask, unsigned int
> flags)
>   {
>   	size_t bufsize = 1 << 15;
>   	struct statmount *stmnt = NULL, *tmp = NULL;
>   	int ret;
> 
>   	for (;;) {
>   		tmp = realloc(stmnt, bufsize);
>   		if (!tmp)
>   			goto out;
> 
>   		stmnt = tmp;
>   		ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt,
> bufsize, flags);
>   		if (!ret)
>   			return stmnt;
> 
>   		if (errno != EOVERFLOW)
>   			goto out;
> 
>   		bufsize <<= 1;
>   		if (bufsize >= UINT_MAX / 2)
>   			goto out;
> 
>   	}
> 
>   out:
>   	free(stmnt);
>   	printf("statmount failed");
>   	return NULL;
>   }
> 
>   static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64
> mnt_ns_id,
>   			     __u64 list[], size_t num, unsigned int
> flags)
>   {
>   	struct mnt_id_req req = {
>   		.size = MNT_ID_REQ_SIZE_VER1,
>   		.mnt_id = mnt_id,
>   		.param = last_mnt_id,
>   		.mnt_ns_id = mnt_ns_id,
>   	};
> 
>   	return syscall(__NR_listmount, &req, list, num, flags);
>   }
> 
>   int main(int argc, char *argv[])
>   {
>   #define LISTMNT_BUFFER 10
>   	__u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
>   	int ret, pidfd, fd_mntns;
>   	struct mnt_ns_info info = {};
> 
>   	pidfd = pidfd_open(getpid(), 0);
>   	if (pidfd < 0)
>   		die_errno("pidfd_open failed");
> 
>   	fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
>   	if (fd_mntns < 0)
>   		die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");
> 
>   	ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
>   	if (ret < 0)
>   		die_errno("ioctl(NS_GET_MNTNS_ID) failed");
> 
>   	printf("Listing %u mounts for mount namespace %d:%llu\n",
> info.nr_mounts, fd_mntns, info.mnt_ns_id);
>   	for (;;) {
>   		ssize_t nr_mounts;
>   	next:
>   		nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
> info.mnt_ns_id, list, LISTMNT_BUFFER, 0);
>   		if (nr_mounts <= 0) {
>   			printf("Finished listing mounts for mount
> namespace %d:%llu\n\n", fd_mntns, info.mnt_ns_id);
>   			ret = ioctl(fd_mntns, NS_MNT_GET_NEXT, 0);
>   			if (ret < 0)
>   				die_errno("ioctl(NS_MNT_GET_NEXT)
> failed");
>   			close(ret);
>   			ret = ioctl(fd_mntns, NS_MNT_GET_NEXT,
> &info);
>   			if (ret < 0) {
>   				if (errno == ENOENT) {
>   					printf("Finished listing all
> mount namespaces\n");
>   					exit(0);
>   				}
>   				die_errno("ioctl(NS_MNT_GET_NEXT)
> failed");
>   			}
>   			close(fd_mntns);
>   			fd_mntns = ret;
>   			last_mnt_id = 0;
>   			printf("Listing %u mounts for mount
> namespace %d:%llu\n", info.nr_mounts, fd_mntns, info.mnt_ns_id);
>   			goto next;
>   		}
> 
>   		for (size_t cur = 0; cur < nr_mounts; cur++) {
>   			struct statmount *stmnt;
> 
>   			last_mnt_id = list[cur];
> 
>   			stmnt = sys_statmount(last_mnt_id,
> info.mnt_ns_id,
>   					      STATMOUNT_SB_BASIC |
>   					      STATMOUNT_MNT_BASIC |
>   					      STATMOUNT_MNT_ROOT |
>   					      STATMOUNT_MNT_POINT |
>   					      STATMOUNT_MNT_NS_ID |
>   					      STATMOUNT_MNT_OPTS |
>   					      STATMOUNT_FS_TYPE,
>   					  0);
>   			if (!stmnt) {
>   				printf("Failed to statmount(%llu) in
> mount namespace(%llu)\n", last_mnt_id, info.mnt_ns_id);
>   				continue;
>   			}
> 
>   			printf("mnt_id(%u/%llu) |
> mnt_parent_id(%u/%llu): %s @ %s ==> %s with options: %s\n",
>   			       stmnt->mnt_id_old, stmnt->mnt_id,
>   			       stmnt->mnt_parent_id_old, stmnt-
> >mnt_parent_id,
>   			       stmnt->str + stmnt->fs_type,
>   			       stmnt->str + stmnt->mnt_root,
>   			       stmnt->str + stmnt->mnt_point,
>   			       stmnt->str + stmnt->mnt_opts);
>   			free(stmnt);
>   		}
>   	}
> 
>   	exit(0);
>   }
> 
> Thanks!
> Christian
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
> ---
> base-commit: 720261cfc7329406a50c2a8536e0039b9dd9a4e5
> change-id: 20240705-work-mount-namespace-126b73a11f5c
> 

This all looks pretty straightforward to me. I do wish that we had
proper libc bindings for this...or maybe even a new userland library?

I just get the feeling that all of this syscall() and ioctl() usage is
eventually going to bite us in the ass. I don't have any concrete
proposal for that however, and we do have some immediate need for this
functionality, so, you can add

Reviewed-by: Jeff Layton <jlayton@xxxxxxxxxx>





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux