For systems which run directly from initramfs, it is not possible to use pivot_root without first changing root. This is because of the intentional design choice that rootfs, which is where initramfs is unpacked to, cannot be unmounted. pivot_root is an important feature for creating containers and the alternative (mounting the new root over the top of the old with MS_MOVE and then calling chroot) is not favoured by most container runtimes [1][2] as it does not completely remove the host system mounts from the mount namespace. The general workaround, when running directly from initramfs, is to have init mount a new tmpfs, copy everything out of rootfs, and then switch_root [3][4]. This is only required when running directly from the initramfs, as all other methods of acquiring a root device (having the kernel mount a root device directly via the root= parameter, or using initramfs to mount and then switch_root to a new root) leave an empty rootfs at the top of the mount stack. This commit adds a new build option, EMPTY_ROOTFS, available when initrd/initramfs is enabled. When selected, rather than unpacking the built-in / bootloader-provided initramfs directly into rootfs, the kernel will mount a new tmpfs/ramfs over the top of the rootfs and unpack to that instead, leaving an empty rootfs at the top of the stack. This removes the need to have init copy everything as a workaround.
[1]: https://github.com/opencontainers/runc/blob/95a93c132cf179a017312e22a954f137e8237c4e/man/runc-create.8.md?plain=1#L27 [2]: https://github.com/containers/crun/blob/8e8d7972f738f28294cd5c16091d136ca278759e/crun.1.md?plain=1#L103 [3]: https://github.com/tinycorelinux/Core-scripts/blob/dbb24bf42a0a9935b18e66a0b936266b2244251b/init#L13 [4]: https://github.com/kubernetes/minikube/blob/master/deploy/iso/minikube-iso/board/minikube/x86_64/rootfs-overlay/init#L6 Signed-off-by: Emily Shepherd <emily@xxxxxxxxxxx> --- v2: - Fix formatting error in patch - Update overmount_rootfs() return type to void - cc relevant kernel devs based on blame of init files - cc OCI container runtime devs who have supported no-pivot options - cc small / embedded linux devs who have mitigated this by copying root - tweak to changelog: clarify why no-pivot is not recommended - tweak to changelog: include missing reference to minikube's rootfs mitigation --- init/Kconfig | 13 +++++++++++++ init/do_mounts.c | 23 +++++++++++++++++++++++ init/do_mounts.h | 6 ++++++ init/initramfs.c | 4 ++++ 4 files changed, 46 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index 6d35728b94b2b..bf15bd08abdc2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1299,6 +1299,19 @@ config BLK_DEV_INITRD if BLK_DEV_INITRD +config EMPTY_ROOTFS + bool "Mount initramfs over empty rootfs" + help + Normally initramfs is unpacked directly into the rootfs. When this + option is enabled, initramfs is instead unpacked into a tmpfs + mounted on top of a permanently empty rootfs. + + This is mostly useful for embedded operating systems, running + directly from initramfs, which need to make use of pivot_root (for + example systems running containers). + + If unsure, say N. 
+ source "usr/Kconfig" endif diff --git a/init/do_mounts.c b/init/do_mounts.c index 5dfd30b13f485..7cf106cf976db 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -514,3 +514,26 @@ void __init init_rootfs(void) (!root_fs_names || strstr(root_fs_names, "tmpfs"))) is_tmpfs = true; } + +#ifdef CONFIG_EMPTY_ROOTFS +void __init overmount_rootfs(void) { + int err; + + err = init_mkdir("/root", 0700); + if (err != 0) + goto out; + + err = init_mount("rootfs", "/root", is_tmpfs ? "tmpfs" : "ramfs", 0, NULL); + if (err != 0) + goto out; + + init_chdir("/root"); + init_mount(".", "/", NULL, MS_MOVE, NULL); + init_chroot("."); + + return; + +out: + printk(KERN_WARNING "Failed to mount over rootfs\n"); +} +#endif /* CONFIG_EMPTY_ROOTFS */ diff --git a/init/do_mounts.h b/init/do_mounts.h index 15e372b00ce70..3a261f1ae0d64 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -41,3 +41,9 @@ static inline bool initrd_load(char *root_device_name) } #endif + +#ifdef CONFIG_EMPTY_ROOTFS +void __init overmount_rootfs(void); +#else +static inline void __init overmount_rootfs(void) { return; } +#endif diff --git a/init/initramfs.c b/init/initramfs.c index 8d0fd946cdd2b..76525108a39d2 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,8 @@ #include <linux/task_work.h> #include <linux/umh.h> +#include "do_mounts.h" + static __initdata bool csum_present; static __initdata u32 io_csum; @@ -688,6 +690,8 @@ static void __init populate_initrd_image(char *err) static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { + overmount_rootfs(); + /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) -- 2.42.0