vfs-7.0-rc1.nullfs

Please consider pulling these changes from the signed vfs-7.0-rc1.nullfs tag. Thanks! Christian -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaYX49gAKCRCRxhvAZXjc olG7AQD9TywOR0HC9PMT8jrhC1TKODnZ4H1aLNlYVltzfJ09xwEAwFSGO4rQmGAF aZdD0RQw4bkf7IC1PIZHEGUqmVXJCQ8= =NvyI -----END PGP SIGNATURE----- Merge tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs Pull vfs nullfs update from Christian Brauner: "Add a completely catatonic minimal pseudo filesystem called "nullfs" and make pivot_root() work in the initramfs. Currently pivot_root() does not work on the real rootfs because it cannot be unmounted. Userspace has to recursively delete initramfs contents manually before continuing boot, using the fragile switch_root sequence (overmount + chroot). Add nullfs, a minimal immutable filesystem that serves as the true root of the mount hierarchy. The mutable rootfs (tmpfs/ramfs) is mounted on top of it. This allows userspace to simply: chdir(new_root); pivot_root(".", "."); umount2(".", MNT_DETACH); without the traditional switch_root workarounds. systemd already handles this correctly. It tries pivot_root() first and falls back to MS_MOVE only when that fails. This also means rootfs mounts in unprivileged namespaces no longer need MNT_LOCKED, since the immutable nullfs guarantees nothing can be revealed by unmounting the covering mount. nullfs is a single-instance filesystem (get_tree_single()) marked SB_NOUSER | SB_I_NOEXEC | SB_I_NODEV with an immutable empty root directory. This means sooner or later it can be used to overmount other directories to hide their contents without any additional protection needed. We enable it unconditionally. If we see any real regression we'll hide it behind a boot option. nullfs will gain extensions beyond this in the future. 
It will serve as a concept to support the creation of completely empty mount namespaces - which is work coming up in the next cycle" * tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: fs: use nullfs unconditionally as the real rootfs docs: mention nullfs fs: add immutable rootfs fs: add init_pivot_root() fs: ensure that internal tmpfs mount gets mount id zero
2026-03-08 01:24:47 +01:00 · 2026-02-09 13:41:34 -08:00 · 2026-02-09 13:41:34 -08:00 · c84bb79f70
commit c84bb79f70
parent 7e01a69f5c 313c47f4fe
10 changed files with 215 additions and 73 deletions
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.rst
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.rst
@ -76,10 +76,10 @@ What is rootfs?
 ---------------

 Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
-always present in 2.6 systems.  You can't unmount rootfs for approximately the
-same reason you can't kill the init process; rather than having special code
-to check for and handle an empty list, it's smaller and simpler for the kernel
-to just make sure certain lists can't become empty.
+always present in Linux systems.  The kernel uses an immutable empty filesystem
+called nullfs as the true root of the VFS hierarchy, with the mutable rootfs
+(tmpfs/ramfs) mounted on top of it.  This allows pivot_root() and unmounting
+of the initramfs to work normally.

 Most systems just mount another filesystem over rootfs and ignore it.  The
 amount of space an empty instance of ramfs takes up is tiny.
@ -121,16 +121,14 @@ All this differs from the old initrd in several ways:
    program.  See the switch_root utility, below.)

  - When switching another root device, initrd would pivot_root and then
-    umount the ramdisk.  But initramfs is rootfs: you can neither pivot_root
-    rootfs, nor unmount it.  Instead delete everything out of rootfs to
-    free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs
-    with the new root (cd /newmount; mount --move . /; chroot .), attach
-    stdin/stdout/stderr to the new /dev/console, and exec the new init.
+    umount the ramdisk.  With nullfs as the true root, pivot_root() works
+    normally from the initramfs.  Userspace can simply do::

-    Since this is a remarkably persnickety process (and involves deleting
-    commands before you can run them), the klibc package introduced a helper
-    program (utils/run_init.c) to do all this for you.  Most other packages
-    (such as busybox) have named this command "switch_root".
+      chdir(new_root);
+      pivot_root(".", ".");
+      umount2(".", MNT_DETACH);
+
+    This is the preferred method for switching root filesystems.

 Populating initramfs:
 ---------------------
--- a/fs/Makefile
+++ b/fs/Makefile
@ -16,7 +16,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
 		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
-		file_attr.o fserror.o
+		file_attr.o fserror.o nullfs.o

 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
--- a/fs/init.c
+++ b/fs/init.c
@ -13,6 +13,23 @@
 #include <linux/security.h>
 #include "internal.h"

+int __init init_pivot_root(const char *new_root, const char *put_old)
+{
+	struct path new_path __free(path_put) = {};
+	struct path old_path __free(path_put) = {};
+	int ret;
+
+	ret = kern_path(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new_path);
+	if (ret)
+		return ret;
+
+	ret = kern_path(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_path);
+	if (ret)
+		return ret;
+
+	return path_pivot_root(&new_path, &old_path);
+}
+
 int __init init_mount(const char *dev_name, const char *dir_name,
 		const char *type_page, unsigned long flags, void *data_page)
 {
--- a/fs/internal.h
+++ b/fs/internal.h
@ -90,6 +90,7 @@ extern bool may_mount(void);
 int path_mount(const char *dev_name, const struct path *path,
 		const char *type_page, unsigned long flags, void *data_page);
 int path_umount(const struct path *path, int flags);
+int path_pivot_root(struct path *new, struct path *old);

 int show_path(struct seq_file *m, struct dentry *root);

--- a/fs/mount.h
+++ b/fs/mount.h
@ -5,6 +5,7 @@
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>

+extern struct file_system_type nullfs_fs_type;
 extern struct list_head notify_list;

 struct mnt_namespace {
--- a/fs/namespace.c
+++ b/fs/namespace.c
@ -221,7 +221,7 @@ static int mnt_alloc_id(struct mount *mnt)
 	int res;

 	xa_lock(&mnt_id_xa);
-	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
 	if (!res)
 		mnt->mnt_id_unique = ++mnt_id_ctr;
 	xa_unlock(&mnt_id_xa);
@ -4498,36 +4498,8 @@ bool path_is_under(const struct path *path1, const struct path *path2)
 }
 EXPORT_SYMBOL(path_is_under);

-/*
- * pivot_root Semantics:
- * Moves the root file system of the current process to the directory put_old,
- * makes new_root as the new root file system of the current process, and sets
- * root/cwd of all processes which had them on the current root to new_root.
- *
- * Restrictions:
- * The new_root and put_old must be directories, and  must not be on the
- * same file  system as the current process root. The put_old  must  be
- * underneath new_root,  i.e. adding a non-zero number of /.. to the string
- * pointed to by put_old must yield the same directory as new_root. No other
- * file system may be mounted on put_old. After all, new_root is a mountpoint.
- *
- * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
- * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
- * in this situation.
- *
- * Notes:
- *  - we don't move root/cwd if they are not at the root (reason: if something
- *    cared enough to change them, it's probably wrong to force them elsewhere)
- *  - it's okay to pick a root that isn't the root of a file system, e.g.
- *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
- *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
- *    first.
- */
-SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
-		const char __user *, put_old)
+int path_pivot_root(struct path *new, struct path *old)
 {
-	struct path new __free(path_put) = {};
-	struct path old __free(path_put) = {};
 	struct path root __free(path_put) = {};
 	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
 	int error;
@ -4535,28 +4507,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	if (!may_mount())
 		return -EPERM;

-	error = user_path_at(AT_FDCWD, new_root,
-			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
-	if (error)
-		return error;
-
-	error = user_path_at(AT_FDCWD, put_old,
-			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
-	if (error)
-		return error;
-
-	error = security_sb_pivotroot(&old, &new);
+	error = security_sb_pivotroot(old, new);
 	if (error)
 		return error;

 	get_fs_root(current->fs, &root);

-	LOCK_MOUNT(old_mp, &old);
+	LOCK_MOUNT(old_mp, old);
 	old_mnt = old_mp.parent;
 	if (IS_ERR(old_mnt))
 		return PTR_ERR(old_mnt);

-	new_mnt = real_mount(new.mnt);
+	new_mnt = real_mount(new->mnt);
 	root_mnt = real_mount(root.mnt);
 	ex_parent = new_mnt->mnt_parent;
 	root_parent = root_mnt->mnt_parent;
@ -4568,7 +4530,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		return -EINVAL;
 	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
 		return -EINVAL;
-	if (d_unlinked(new.dentry))
+	if (d_unlinked(new->dentry))
 		return -ENOENT;
 	if (new_mnt == root_mnt || old_mnt == root_mnt)
 		return -EBUSY; /* loop, on the same file system  */
@ -4576,15 +4538,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(root_mnt))
 		return -EINVAL; /* absolute root */
-	if (!path_mounted(&new))
+	if (!path_mounted(new))
 		return -EINVAL; /* not a mountpoint */
 	if (!mnt_has_parent(new_mnt))
 		return -EINVAL; /* absolute root */
 	/* make sure we can reach put_old from new_root */
-	if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
+	if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new))
 		return -EINVAL;
 	/* make certain new is below the root */
-	if (!is_path_reachable(new_mnt, new.dentry, &root))
+	if (!is_path_reachable(new_mnt, new->dentry, &root))
 		return -EINVAL;
 	lock_mount_hash();
 	umount_mnt(new_mnt);
@ -4603,10 +4565,55 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	unlock_mount_hash();
 	mnt_notify_add(root_mnt);
 	mnt_notify_add(new_mnt);
-	chroot_fs_refs(&root, &new);
+	chroot_fs_refs(&root, new);
 	return 0;
 }

+/*
+ * pivot_root Semantics:
+ * Moves the root file system of the current process to the directory put_old,
+ * makes new_root as the new root file system of the current process, and sets
+ * root/cwd of all processes which had them on the current root to new_root.
+ *
+ * Restrictions:
+ * The new_root and put_old must be directories, and  must not be on the
+ * same file  system as the current process root. The put_old  must  be
+ * underneath new_root,  i.e. adding a non-zero number of /.. to the string
+ * pointed to by put_old must yield the same directory as new_root. No other
+ * file system may be mounted on put_old. After all, new_root is a mountpoint.
+ *
+ * The immutable nullfs filesystem is mounted as the true root of the VFS
+ * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this,
+ * allowing pivot_root() to work normally from initramfs.
+ *
+ * Notes:
+ *  - we don't move root/cwd if they are not at the root (reason: if something
+ *    cared enough to change them, it's probably wrong to force them elsewhere)
+ *  - it's okay to pick a root that isn't the root of a file system, e.g.
+ *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
+ *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
+ *    first.
+ */
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+		const char __user *, put_old)
+{
+	struct path new __free(path_put) = {};
+	struct path old __free(path_put) = {};
+	int error;
+
+	error = user_path_at(AT_FDCWD, new_root,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
+	if (error)
+		return error;
+
+	error = user_path_at(AT_FDCWD, put_old,
+			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
+	if (error)
+		return error;
+
+	return path_pivot_root(&new, &old);
+}
+
 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
 {
 	unsigned int flags = mnt->mnt.mnt_flags;
@ -5969,24 +5976,62 @@ struct mnt_namespace init_mnt_ns = {

 static void __init init_mount_tree(void)
 {
-	struct vfsmount *mnt;
-	struct mount *m;
+	struct vfsmount *mnt, *nullfs_mnt;
+	struct mount *mnt_root;
 	struct path root;

+	/*
+	 * We create two mounts:
+	 *
+	 * (1) nullfs with mount id 1
+	 * (2) mutable rootfs with mount id 2
+	 *
+	 * with (2) mounted on top of (1).
+	 */
+	nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
+	if (IS_ERR(nullfs_mnt))
+		panic("VFS: Failed to create nullfs");
+
 	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
 	if (IS_ERR(mnt))
 		panic("Can't create rootfs");

-	m = real_mount(mnt);
-	init_mnt_ns.root = m;
-	init_mnt_ns.nr_mounts = 1;
-	mnt_add_to_ns(&init_mnt_ns, m);
+	VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
+	VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
+
+	/* The namespace root is the nullfs mnt. */
+	mnt_root		= real_mount(nullfs_mnt);
+	init_mnt_ns.root	= mnt_root;
+
+	/* Mount mutable rootfs on top of nullfs. */
+	root.mnt		= nullfs_mnt;
+	root.dentry		= nullfs_mnt->mnt_root;
+
+	LOCK_MOUNT_EXACT(mp, &root);
+	if (unlikely(IS_ERR(mp.parent)))
+		panic("VFS: Failed to mount rootfs on nullfs");
+	scoped_guard(mount_writer)
+		attach_mnt(real_mount(mnt), mp.parent, mp.mp);
+
+	pr_info("VFS: Finished mounting rootfs on nullfs\n");
+
+	/*
+	 * We've dropped all locks here but that's fine. Not only are we
+	 * the only task that's running, there's also no other mount
+	 * namespace in existence and the initial mount namespace is
+	 * completely empty until we add the mounts we just created.
+	 */
+	for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
+		mnt_add_to_ns(&init_mnt_ns, p);
+		init_mnt_ns.nr_mounts++;
+	}
+
 	init_task.nsproxy->mnt_ns = &init_mnt_ns;
 	get_mnt_ns(&init_mnt_ns);

+	/* The root and pwd always point to the mutable rootfs. */
 	root.mnt	= mnt;
 	root.dentry	= mnt->mnt_root;
-
 	set_fs_pwd(current->fs, &root);
 	set_fs_root(current->fs, &root);

--- a/fs/nullfs.c
+++ b/fs/nullfs.c
@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
+#include <linux/fs/super_types.h>
+#include <linux/fs_context.h>
+#include <linux/magic.h>
+
+static const struct super_operations nullfs_super_operations = {
+	.statfs	= simple_statfs,
+};
+
+static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
+{
+	struct inode *inode;
+
+	s->s_maxbytes		= MAX_LFS_FILESIZE;
+	s->s_blocksize		= PAGE_SIZE;
+	s->s_blocksize_bits	= PAGE_SHIFT;
+	s->s_magic		= NULL_FS_MAGIC;
+	s->s_op			= &nullfs_super_operations;
+	s->s_export_op		= NULL;
+	s->s_xattr		= NULL;
+	s->s_time_gran		= 1;
+	s->s_d_flags		= 0;
+
+	inode = new_inode(s);
+	if (!inode)
+		return -ENOMEM;
+
+	/* nullfs is permanently empty... */
+	make_empty_dir_inode(inode);
+	simple_inode_init_ts(inode);
+	inode->i_ino	= 1;
+	/* ... and immutable. */
+	inode->i_flags |= S_IMMUTABLE;
+
+	s->s_root = d_make_root(inode);
+	if (!s->s_root)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * For now this is a single global instance. If needed we can make it
+ * mountable by userspace at which point we will need to make it
+ * multi-instance.
+ */
+static int nullfs_fs_get_tree(struct fs_context *fc)
+{
+	return get_tree_single(fc, nullfs_fs_fill_super);
+}
+
+static const struct fs_context_operations nullfs_fs_context_ops = {
+	.get_tree	= nullfs_fs_get_tree,
+};
+
+static int nullfs_init_fs_context(struct fs_context *fc)
+{
+	fc->ops		= &nullfs_fs_context_ops;
+	fc->global	= true;
+	fc->sb_flags	= SB_NOUSER;
+	fc->s_iflags	= SB_I_NOEXEC | SB_I_NODEV;
+	return 0;
+}
+
+struct file_system_type nullfs_fs_type = {
+	.name			= "nullfs",
+	.init_fs_context	= nullfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
--- a/include/linux/init_syscalls.h
+++ b/include/linux/init_syscalls.h
@ -17,3 +17,4 @@ int __init init_mkdir(const char *pathname, umode_t mode);
 int __init init_rmdir(const char *pathname);
 int __init init_utimes(char *filename, struct timespec64 *ts);
 int __init init_dup(struct file *file);
+int __init init_pivot_root(const char *new_root, const char *put_old);
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@ -104,5 +104,6 @@
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
 #define PID_FS_MAGIC		0x50494446	/* "PIDF" */
 #define GUEST_MEMFD_MAGIC	0x474d454d	/* "GMEM" */
+#define NULL_FS_MAGIC		0x4E554C4C	/* "NULL" */

 #endif /* __LINUX_MAGIC_H__ */
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@ -483,8 +483,16 @@ void __init prepare_namespace(void)
 		wait_for_root(saved_root_name);
 	mount_root(saved_root_name);
 	devtmpfs_mount();
-	init_mount(".", "/", NULL, MS_MOVE, NULL);
-	init_chroot(".");
+
+	if (init_pivot_root(".", ".")) {
+		pr_err("VFS: Failed to pivot into new rootfs\n");
+		return;
+	}
+	if (init_umount(".", MNT_DETACH)) {
+		pr_err("VFS: Failed to unmount old rootfs\n");
+		return;
+	}
+	pr_info("VFS: Pivoted into new rootfs\n");
 }

 static bool is_tmpfs;