From: Zhang Yunkai <zhang.yunkai@xxxxxxxxxx> Container platforms such as Docker have two ways to change the root directory to a specified path: pivot_root or chroot. Docker uses pivot_root by default, because it can be handled very cleanly. However, pivot_root only works for a root on a disk or block device, not for rootfs, because in the rootfs case the specified directory does not have a parent mount. So if we want to use Docker on rootfs, we need to specify DOCKER_RAMDISK=yes, and Docker will then use chroot instead of pivot_root to change the root directory. There are at least two reasons why we still want to use pivot_root on rootfs. First, chroot merely changes the root directory, which leads to resource leakage. An example is a USB device that is connected on the host prior to the creation of a container and gets disconnected after the container is created. If the USB device was mounted in containers but has already been removed and unmounted on the host, the mount point will not go away until all containers unmount the USB device. Containers will hold the mount point even if they have never performed a mount action themselves. Another reason for Docker to use pivot_root is that, upon initialization, the net-namespace is mounted under /var/run/docker/netns/ on the host by dockerd. Without pivot_root, Docker must either wait to create the network namespace prior to the creation of containers or simply deal with leaking this to each container. This patch creates a parent mount point for rootfs to support pivot_root. The main steps are: mkdir /root; cd /root; mount tmpfs to /root; decompress initramfs and initrd to tmpfs; mount . / (MS_MOVE); ksys_chroot . In addition, because there is an additional layer of mounting, it is necessary to slightly modify the way init_eaccess searches for files during kernel initialization. While mounting tmpfs to /root, 'rootflags' is passed, which means that options for the rootfs mount can now be set on the boot command line. For example, the size of the tmpfs can be set with 'rootflags=size=1024M'. 
Tested-by: Zeal Robot <zealci@xxxxxxxxxx> Signed-off-by: Zhang Yunkai <zhang.yunkai@xxxxxxxxxx> --- fs/init.c | 10 ++++++++-- include/linux/init.h | 1 + init/do_mounts.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ init/do_mounts.h | 14 ++++++++++++++ init/initramfs.c | 16 ++++++++++++++-- init/main.c | 6 +++++- usr/Kconfig | 10 ++++++++++ 7 files changed, 97 insertions(+), 5 deletions(-) diff --git a/fs/init.c b/fs/init.c index 5c36adaa9b44..4974f19bf645 100644 --- a/fs/init.c +++ b/fs/init.c @@ -112,14 +112,20 @@ int __init init_chmod(const char *filename, umode_t mode) int __init init_eaccess(const char *filename) { - struct path path; + struct path path, root; int error; - error = kern_path(filename, LOOKUP_FOLLOW, &path); + error = kern_path("/", LOOKUP_DOWN, &root); if (error) return error; + error = vfs_path_lookup(root.dentry, root.mnt, filename, + LOOKUP_FOLLOW, &path); + if (error) + goto on_err; error = path_permission(&path, MAY_ACCESS); path_put(&path); +on_err: + path_put(&root); return error; } diff --git a/include/linux/init.h b/include/linux/init.h index baf0b29a7010..6eddd3730ce8 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -149,6 +149,7 @@ extern unsigned int reset_devices; void setup_arch(char **); void prepare_namespace(void); void __init init_rootfs(void); +bool ramdisk_exec_exist(void); extern struct file_system_type rootfs_fs_type; #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) diff --git a/init/do_mounts.c b/init/do_mounts.c index 7058e14ad5f7..c28a5792ddc3 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -649,6 +649,50 @@ void __init prepare_namespace(void) } static bool is_tmpfs; +#ifdef CONFIG_ROOTFS_MOUNT + +/* + * Give systems running from the rootfs and making use of pivot_root a + * proper mount so it can be umounted during pivot_root. 
+ */ +int __init prepare_mount_rootfs(void) +{ + char *rootfs = "ramfs"; + + if (is_tmpfs) + rootfs = "tmpfs"; + + init_mkdir("/root", 0700); + return do_mount_root(rootfs, rootfs, + root_mountflags & ~MS_RDONLY, + root_mount_data); +} + +/* + * Revert to previous mount by chdir to '/' and unmounting the second + * mount. + */ +void __init revert_mount_rootfs(void) +{ + init_chdir("/"); + init_umount(".", MNT_DETACH); +} + +/* + * Change root to the new rootfs that mounted in prepare_mount_rootfs() + * if cpio is unpacked successfully and 'ramdisk_execute_command' exist. + */ +void __init finish_mount_rootfs(void) +{ + init_mount(".", "/", NULL, MS_MOVE, NULL); + if (likely(ramdisk_exec_exist())) + init_chroot("."); + else + revert_mount_rootfs(); +} + +#define rootfs_init_fs_context ramfs_init_fs_context +#else static int rootfs_init_fs_context(struct fs_context *fc) { if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) @@ -656,6 +700,7 @@ static int rootfs_init_fs_context(struct fs_context *fc) return ramfs_init_fs_context(fc); } +#endif struct file_system_type rootfs_fs_type = { .name = "rootfs", diff --git a/init/do_mounts.h b/init/do_mounts.h index 7a29ac3e427b..6bc954b84015 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -14,6 +14,20 @@ void mount_block_root(char *name, int flags); void mount_root(void); extern int root_mountflags; +#ifdef CONFIG_ROOTFS_MOUNT + +int prepare_mount_rootfs(void); +void finish_mount_rootfs(void); +void revert_mount_rootfs(void); + +#else + +static inline int prepare_mount_rootfs(void) { return 0; } +static inline void finish_mount_rootfs(void) { } +static inline void revert_mount_rootfs(void) { } + +#endif + static inline __init int create_dev(char *name, dev_t dev) { init_unlink(name); diff --git a/init/initramfs.c b/init/initramfs.c index 2f3d96dc3db6..7b68c5aeff7d 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -17,6 +17,8 @@ #include <linux/init_syscalls.h> #include <linux/umh.h> +#include "do_mounts.h" + static ssize_t 
__init xwrite(struct file *file, const char *p, size_t count, loff_t *pos) { @@ -671,12 +673,19 @@ static void __init populate_initrd_image(char *err) static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ - char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); + char *err; + + if (prepare_mount_rootfs()) + panic("Failed to mount rootfs\n"); + + err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ - if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { + finish_mount_rootfs(); goto done; + } if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); @@ -685,11 +694,14 @@ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); if (err) { + revert_mount_rootfs(); #ifdef CONFIG_BLK_DEV_RAM populate_initrd_image(err); #else printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif + } else { + finish_mount_rootfs(); } done: diff --git a/init/main.c b/init/main.c index 98182c3c2c4b..2e4875834f97 100644 --- a/init/main.c +++ b/init/main.c @@ -1580,6 +1580,10 @@ void __init console_on_rootfs(void) fput(file); } +bool __init ramdisk_exec_exist(void) +{ + return init_eaccess(ramdisk_execute_command) == 0; +} static noinline void __init kernel_init_freeable(void) { /* Now the scheduler is fully set up and can do blocking allocations */ @@ -1621,7 +1625,7 @@ static noinline void __init kernel_init_freeable(void) * check if there is an early userspace init. 
If yes, let it do all * the work */ - if (init_eaccess(ramdisk_execute_command) != 0) { + if (!ramdisk_exec_exist()) { ramdisk_execute_command = NULL; prepare_namespace(); } diff --git a/usr/Kconfig b/usr/Kconfig index 8bbcf699fe3b..03dbb22e95f9 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -52,6 +52,16 @@ config INITRAMFS_ROOT_GID If you are not sure, leave it set to "0". +config ROOTFS_MOUNT + bool "Create mount point for rootfs to make pivot_root() supported" + default n + help + Before unpacking cpio, create a mount point and make it become + the root filesystem. Therefore, rootfs will be supported by + pivot_root(). + + If container platforms is used with rootfs, say Y. + config RD_GZIP bool "Support initial ramdisk/ramfs compressed using gzip" default y -- 2.25.1