diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.rst b/Documentation/filesystems/ramfs-rootfs-initramfs.rst index a9d271e171c3..a8899f849e90 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst @@ -76,10 +76,15 @@ What is rootfs? --------------- Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is -always present in 2.6 systems. You can't unmount rootfs for approximately the -same reason you can't kill the init process; rather than having special code -to check for and handle an empty list, it's smaller and simpler for the kernel -to just make sure certain lists can't become empty. +always present in 2.6 systems. Traditionally, you can't unmount rootfs for +approximately the same reason you can't kill the init process; rather than +having special code to check for and handle an empty list, it's smaller and +simpler for the kernel to just make sure certain lists can't become empty. + +However, if the kernel is booted with "nullfs_rootfs", an immutable empty +filesystem called nullfs is used as the true root, with the mutable rootfs +(tmpfs/ramfs) mounted on top of it. This allows pivot_root() and unmounting +of the initramfs to work normally. Most systems just mount another filesystem over rootfs and ignore it. The amount of space an empty instance of ramfs takes up is tiny. @@ -121,17 +126,26 @@ All this differs from the old initrd in several ways: program. See the switch_root utility, below.) - When switching another root device, initrd would pivot_root and then - umount the ramdisk. But initramfs is rootfs: you can neither pivot_root - rootfs, nor unmount it. Instead delete everything out of rootfs to - free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs - with the new root (cd /newmount; mount --move . /; chroot .), attach - stdin/stdout/stderr to the new /dev/console, and exec the new init. + umount the ramdisk. 
Traditionally, initramfs is rootfs: you can neither + pivot_root rootfs, nor unmount it. Instead delete everything out of + rootfs to free up the space (find -xdev / -exec rm '{}' ';'), overmount + rootfs with the new root (cd /newmount; mount --move . /; chroot .), + attach stdin/stdout/stderr to the new /dev/console, and exec the new init. Since this is a remarkably persnickety process (and involves deleting commands before you can run them), the klibc package introduced a helper program (utils/run_init.c) to do all this for you. Most other packages (such as busybox) have named this command "switch_root". + However, if the kernel is booted with "nullfs_rootfs", pivot_root() works + normally from the initramfs. Userspace can simply do:: + + chdir(new_root); + pivot_root(".", "."); + umount2(".", MNT_DETACH); + + This is the preferred method when nullfs_rootfs is enabled. + Populating initramfs: --------------------- diff --git a/fs/Makefile b/fs/Makefile index a04274a3c854..becf133e4791 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o + file_attr.o nullfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/init.c b/fs/init.c index e0f5429c0a49..e33b2690d851 100644 --- a/fs/init.c +++ b/fs/init.c @@ -13,6 +13,23 @@ #include #include "internal.h" +int __init init_pivot_root(const char *new_root, const char *put_old) +{ + struct path new_path __free(path_put) = {}; + struct path old_path __free(path_put) = {}; + int ret; + + ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path); + if (ret) + return ret; + + ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path); + if (ret) + return ret; + + return path_pivot_root(&new_path, &old_path); +} + int 
__init init_mount(const char *dev_name, const char *dir_name, const char *type_page, unsigned long flags, void *data_page) { diff --git a/fs/internal.h b/fs/internal.h index ab638d41ab81..4b27a4b0fdef 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -90,6 +90,7 @@ extern bool may_mount(void); int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(const struct path *path, int flags); +int path_pivot_root(struct path *new, struct path *old); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/mount.h b/fs/mount.h index 2d28ef2a3aed..e0816c11a198 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,7 @@ #include #include +extern struct file_system_type nullfs_fs_type; extern struct list_head notify_list; struct mnt_namespace { diff --git a/fs/namespace.c b/fs/namespace.c index c58674a20cad..a44ebb2f1161 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,17 @@ static int __init initramfs_options_setup(char *str) __setup("initramfs_options=", initramfs_options_setup); +bool nullfs_rootfs = false; + +static int __init nullfs_rootfs_setup(char *str) +{ + if (*str) + return 0; + nullfs_rootfs = true; + return 1; +} +__setup("nullfs_rootfs", nullfs_rootfs_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -221,7 +232,7 @@ static int mnt_alloc_id(struct mount *mnt) int res; xa_lock(&mnt_id_xa); - res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL); if (!res) mnt->mnt_id_unique = ++mnt_id_ctr; xa_unlock(&mnt_id_xa); @@ -4498,36 +4509,8 @@ bool path_is_under(const struct path *path1, const struct path *path2) } EXPORT_SYMBOL(path_is_under); -/* - * pivot_root Semantics: - * Moves the root file system of the current process to the directory put_old, - * makes new_root as the new root file 
system of the current process, and sets - * root/cwd of all processes which had them on the current root to new_root. - * - * Restrictions: - * The new_root and put_old must be directories, and must not be on the - * same file system as the current process root. The put_old must be - * underneath new_root, i.e. adding a non-zero number of /.. to the string - * pointed to by put_old must yield the same directory as new_root. No other - * file system may be mounted on put_old. After all, new_root is a mountpoint. - * - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives - * in this situation. - * - * Notes: - * - we don't move root/cwd if they are not at the root (reason: if something - * cared enough to change them, it's probably wrong to force them elsewhere) - * - it's okay to pick a root that isn't the root of a file system, e.g. - * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, - * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root - * first. 
- */ -SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, - const char __user *, put_old) +int path_pivot_root(struct path *new, struct path *old) { - struct path new __free(path_put) = {}; - struct path old __free(path_put) = {}; struct path root __free(path_put) = {}; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; int error; @@ -4535,28 +4518,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!may_mount()) return -EPERM; - error = user_path_at(AT_FDCWD, new_root, - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); - if (error) - return error; - - error = user_path_at(AT_FDCWD, put_old, - LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); - if (error) - return error; - - error = security_sb_pivotroot(&old, &new); + error = security_sb_pivotroot(old, new); if (error) return error; get_fs_root(current->fs, &root); - LOCK_MOUNT(old_mp, &old); + LOCK_MOUNT(old_mp, old); old_mnt = old_mp.parent; if (IS_ERR(old_mnt)) return PTR_ERR(old_mnt); - new_mnt = real_mount(new.mnt); + new_mnt = real_mount(new->mnt); root_mnt = real_mount(root.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; @@ -4568,7 +4541,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) return -EINVAL; - if (d_unlinked(new.dentry)) + if (d_unlinked(new->dentry)) return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) return -EBUSY; /* loop, on the same file system */ @@ -4576,15 +4549,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) return -EINVAL; /* absolute root */ - if (!path_mounted(&new)) + if (!path_mounted(new)) return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) 
return -EINVAL; /* make certain new is below the root */ - if (!is_path_reachable(new_mnt, new.dentry, &root)) + if (!is_path_reachable(new_mnt, new->dentry, &root)) return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); @@ -4603,10 +4576,56 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, unlock_mount_hash(); mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); - chroot_fs_refs(&root, &new); + chroot_fs_refs(&root, new); return 0; } +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem + * unless the kernel was booted with "nullfs_rootfs". See + * Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives + * in this situation. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. 
+ */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct path new __free(path_put) = {}; + struct path old __free(path_put) = {}; + int error; + + error = user_path_at(AT_FDCWD, new_root, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); + if (error) + return error; + + error = user_path_at(AT_FDCWD, put_old, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); + if (error) + return error; + + return path_pivot_root(&new, &old); +} + static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) { unsigned int flags = mnt->mnt.mnt_flags; @@ -5969,24 +5988,72 @@ struct mnt_namespace init_mnt_ns = { static void __init init_mount_tree(void) { - struct vfsmount *mnt; - struct mount *m; + struct vfsmount *mnt, *nullfs_mnt; + struct mount *mnt_root; struct path root; + /* + * When nullfs is used, we create two mounts: + * + * (1) nullfs with mount id 1 + * (2) mutable rootfs with mount id 2 + * + * with (2) mounted on top of (1). + */ + if (nullfs_rootfs) { + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); + if (IS_ERR(nullfs_mnt)) + panic("VFS: Failed to create nullfs"); + } + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - m = real_mount(mnt); - init_mnt_ns.root = m; - init_mnt_ns.nr_mounts = 1; - mnt_add_to_ns(&init_mnt_ns, m); + if (nullfs_rootfs) { + VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); + + /* The namespace root is the nullfs mnt. */ + mnt_root = real_mount(nullfs_mnt); + init_mnt_ns.root = mnt_root; + + /* Mount mutable rootfs on top of nullfs. 
*/ + root.mnt = nullfs_mnt; + root.dentry = nullfs_mnt->mnt_root; + + LOCK_MOUNT_EXACT(mp, &root); + if (unlikely(IS_ERR(mp.parent))) + panic("VFS: Failed to mount rootfs on nullfs"); + scoped_guard(mount_writer) + attach_mnt(real_mount(mnt), mp.parent, mp.mp); + + pr_info("VFS: Finished mounting rootfs on nullfs\n"); + } else { + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1); + + /* The namespace root is the mutable rootfs. */ + mnt_root = real_mount(mnt); + init_mnt_ns.root = mnt_root; + } + + /* + * We've dropped all locks here but that's fine. Not only are we + * the only task that's running, there's no other mount + * namespace in existence and the initial mount namespace is + * completely empty until we add the mounts we just created. + */ + for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { + mnt_add_to_ns(&init_mnt_ns, p); + init_mnt_ns.nr_mounts++; + } + init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); - root.mnt = mnt; - root.dentry = mnt->mnt_root; - + /* The root and pwd always point to the mutable rootfs. 
*/ + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); diff --git a/fs/nullfs.c b/fs/nullfs.c new file mode 100644 index 000000000000..fdbd3e5d3d71 --- /dev/null +++ b/fs/nullfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Christian Brauner */ +#include +#include +#include + +static const struct super_operations nullfs_super_operations = { + .statfs = simple_statfs, +}; + +static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct inode *inode; + + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = NULL_FS_MAGIC; + s->s_op = &nullfs_super_operations; + s->s_export_op = NULL; + s->s_xattr = NULL; + s->s_time_gran = 1; + s->s_d_flags = 0; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + + /* nullfs is permanently empty... */ + make_empty_dir_inode(inode); + simple_inode_init_ts(inode); + inode->i_ino = 1; + /* ... and immutable. */ + inode->i_flags |= S_IMMUTABLE; + + s->s_root = d_make_root(inode); + if (!s->s_root) + return -ENOMEM; + + return 0; +} + +/* + * For now this is a single global instance. If needed we can make it + * mountable by userspace at which point we will need to make it + * multi-instance. 
+ */ +static int nullfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, nullfs_fs_fill_super); +} + +static const struct fs_context_operations nullfs_fs_context_ops = { + .get_tree = nullfs_fs_get_tree, +}; + +static int nullfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &nullfs_fs_context_ops; + fc->global = true; + fc->sb_flags = SB_NOUSER; + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; + return 0; +} + +struct file_system_type nullfs_fs_type = { + .name = "nullfs", + .init_fs_context = nullfs_init_fs_context, + .kill_sb = kill_anon_super, +}; diff --git a/include/linux/init_syscalls.h b/include/linux/init_syscalls.h index 92045d18cbfc..28776ee28d8e 100644 --- a/include/linux/init_syscalls.h +++ b/include/linux/init_syscalls.h @@ -17,3 +17,4 @@ int __init init_mkdir(const char *pathname, umode_t mode); int __init init_rmdir(const char *pathname); int __init init_utimes(char *filename, struct timespec64 *ts); int __init init_dup(struct file *file); +int __init init_pivot_root(const char *new_root, const char *put_old); diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 638ca21b7a90..4f2da935a76c 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -104,5 +104,6 @@ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ +#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/do_mounts.c b/init/do_mounts.c index defbbf1d55f7..675397c8a7a4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -492,6 +492,20 @@ void __init prepare_namespace(void) mount_root(saved_root_name); out: devtmpfs_mount(); + + if (nullfs_rootfs) { + if (init_pivot_root(".", ".")) { + pr_err("VFS: Failed to pivot into new rootfs\n"); + return; + } + if (init_umount(".", MNT_DETACH)) { + pr_err("VFS: Failed to unmount old rootfs\n"); + return; + } + pr_info("VFS: Pivoted into new rootfs\n"); + 
return; + } + init_mount(".", "/", NULL, MS_MOVE, NULL); init_chroot("."); } diff --git a/init/do_mounts.h b/init/do_mounts.h index 6069ea3eb80d..fbfee810aa89 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -15,6 +15,7 @@ void mount_root_generic(char *name, char *pretty_name, int flags); void mount_root(char *root_device_name); extern int root_mountflags; +extern bool nullfs_rootfs; static inline __init int create_dev(char *name, dev_t dev) {