vfs-7.0-rc1.nullfs

Please consider pulling these changes from the signed vfs-7.0-rc1.nullfs tag.
 
 Thanks!
 Christian
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaYX49gAKCRCRxhvAZXjc
 olG7AQD9TywOR0HC9PMT8jrhC1TKODnZ4H1aLNlYVltzfJ09xwEAwFSGO4rQmGAF
 aZdD0RQw4bkf7IC1PIZHEGUqmVXJCQ8=
 =NvyI
 -----END PGP SIGNATURE-----

Merge tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs nullfs update from Christian Brauner:
 "Add a completely catatonic minimal pseudo filesystem called "nullfs"
  and make pivot_root() work in the initramfs.

  Currently pivot_root() does not work on the real rootfs because it
  cannot be unmounted. Userspace has to recursively delete initramfs
  contents manually before continuing boot, using the fragile
  switch_root sequence (overmount + chroot).

  Add nullfs, a minimal immutable filesystem that serves as the true
  root of the mount hierarchy. The mutable rootfs (tmpfs/ramfs) is
  mounted on top of it. This allows userspace to simply:

      chdir(new_root);
      pivot_root(".", ".");
      umount2(".", MNT_DETACH);

  without the traditional switch_root workarounds. systemd already
  handles this correctly. It tries pivot_root() first and falls back
  to MS_MOVE only when that fails.

  This also means rootfs mounts in unprivileged namespaces no longer
  need MNT_LOCKED, since the immutable nullfs guarantees nothing can be
  revealed by unmounting the covering mount.

  nullfs is a single-instance filesystem (get_tree_single()) marked
  SB_NOUSER | SB_I_NOEXEC | SB_I_NODEV with an immutable empty root
  directory. This means sooner or later it can be used to overmount
  other directories to hide their contents without any additional
  protection needed.

  We enable it unconditionally. If we see any real regression we'll
  hide it behind a boot option.

  nullfs will be extended beyond this in the future. It will serve as
  the basis for creating completely empty mount namespaces, which is
  work coming up in the next cycle"

* tag 'vfs-7.0-rc1.nullfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs: use nullfs unconditionally as the real rootfs
  docs: mention nullfs
  fs: add immutable rootfs
  fs: add init_pivot_root()
  fs: ensure that internal tmpfs mount gets mount id zero
This commit is contained in:
Linus Torvalds 2026-02-09 13:41:34 -08:00
commit c84bb79f70
10 changed files with 215 additions and 73 deletions

View file

@ -76,10 +76,10 @@ What is rootfs?
---------------
Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
always present in 2.6 systems. You can't unmount rootfs for approximately the
same reason you can't kill the init process; rather than having special code
to check for and handle an empty list, it's smaller and simpler for the kernel
to just make sure certain lists can't become empty.
always present in Linux systems. The kernel uses an immutable empty filesystem
called nullfs as the true root of the VFS hierarchy, with the mutable rootfs
(tmpfs/ramfs) mounted on top of it. This allows pivot_root() and unmounting
of the initramfs to work normally.
Most systems just mount another filesystem over rootfs and ignore it. The
amount of space an empty instance of ramfs takes up is tiny.
@ -121,16 +121,14 @@ All this differs from the old initrd in several ways:
program. See the switch_root utility, below.)
- When switching to another root device, initrd would pivot_root and then
umount the ramdisk. But initramfs is rootfs: you can neither pivot_root
rootfs, nor unmount it. Instead delete everything out of rootfs to
free up the space (find -xdev / -exec rm '{}' ';'), overmount rootfs
with the new root (cd /newmount; mount --move . /; chroot .), attach
stdin/stdout/stderr to the new /dev/console, and exec the new init.
umount the ramdisk. With nullfs as the true root, pivot_root() works
normally from the initramfs. Userspace can simply do::
Since this is a remarkably persnickety process (and involves deleting
commands before you can run them), the klibc package introduced a helper
program (utils/run_init.c) to do all this for you. Most other packages
(such as busybox) have named this command "switch_root".
chdir(new_root);
pivot_root(".", ".");
umount2(".", MNT_DETACH);
This is the preferred method for switching root filesystems.
Populating initramfs:
---------------------

View file

@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
file_attr.o fserror.o
file_attr.o fserror.o nullfs.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o

View file

@ -13,6 +13,23 @@
#include <linux/security.h>
#include "internal.h"
/*
 * init_pivot_root - pivot_root for early-boot callers using kernel paths
 * @new_root: kernel-space pathname of the directory to become the new root
 * @put_old:  kernel-space pathname of the directory that receives the old root
 *
 * Both pathnames are resolved with kern_path(), following symlinks and
 * requiring a directory, and the resulting paths are handed to
 * path_pivot_root().
 *
 * Returns the kern_path() error if either lookup fails, otherwise whatever
 * path_pivot_root() returns. The two struct path locals carry the
 * __free(path_put) cleanup annotation.
 */
int __init init_pivot_root(const char *new_root, const char *put_old)
{
	const unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
	struct path new_path __free(path_put) = {};
	struct path old_path __free(path_put) = {};
	int err;

	/* Resolve new_root first; put_old is only looked up if that worked. */
	err = kern_path(new_root, lookup_flags, &new_path);
	if (!err)
		err = kern_path(put_old, lookup_flags, &old_path);
	if (err)
		return err;

	return path_pivot_root(&new_path, &old_path);
}
int __init init_mount(const char *dev_name, const char *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{

View file

@ -90,6 +90,7 @@ extern bool may_mount(void);
int path_mount(const char *dev_name, const struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(const struct path *path, int flags);
int path_pivot_root(struct path *new, struct path *old);
int show_path(struct seq_file *m, struct dentry *root);

View file

@ -5,6 +5,7 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
extern struct file_system_type nullfs_fs_type;
extern struct list_head notify_list;
struct mnt_namespace {

View file

@ -221,7 +221,7 @@ static int mnt_alloc_id(struct mount *mnt)
int res;
xa_lock(&mnt_id_xa);
res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
if (!res)
mnt->mnt_id_unique = ++mnt_id_ctr;
xa_unlock(&mnt_id_xa);
@ -4498,36 +4498,8 @@ bool path_is_under(const struct path *path1, const struct path *path2)
}
EXPORT_SYMBOL(path_is_under);
/*
* pivot_root Semantics:
* Moves the root file system of the current process to the directory put_old,
* makes new_root as the new root file system of the current process, and sets
* root/cwd of all processes which had them on the current root to new_root.
*
* Restrictions:
* The new_root and put_old must be directories, and must not be on the
* same file system as the current process root. The put_old must be
* underneath new_root, i.e. adding a non-zero number of /.. to the string
* pointed to by put_old must yield the same directory as new_root. No other
* file system may be mounted on put_old. After all, new_root is a mountpoint.
*
* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
* See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
* in this situation.
*
* Notes:
* - we don't move root/cwd if they are not at the root (reason: if something
* cared enough to change them, it's probably wrong to force them elsewhere)
* - it's okay to pick a root that isn't the root of a file system, e.g.
* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
* first.
*/
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
const char __user *, put_old)
int path_pivot_root(struct path *new, struct path *old)
{
struct path new __free(path_put) = {};
struct path old __free(path_put) = {};
struct path root __free(path_put) = {};
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
int error;
@ -4535,28 +4507,18 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!may_mount())
return -EPERM;
error = user_path_at(AT_FDCWD, new_root,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
if (error)
return error;
error = user_path_at(AT_FDCWD, put_old,
LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
if (error)
return error;
error = security_sb_pivotroot(&old, &new);
error = security_sb_pivotroot(old, new);
if (error)
return error;
get_fs_root(current->fs, &root);
LOCK_MOUNT(old_mp, &old);
LOCK_MOUNT(old_mp, old);
old_mnt = old_mp.parent;
if (IS_ERR(old_mnt))
return PTR_ERR(old_mnt);
new_mnt = real_mount(new.mnt);
new_mnt = real_mount(new->mnt);
root_mnt = real_mount(root.mnt);
ex_parent = new_mnt->mnt_parent;
root_parent = root_mnt->mnt_parent;
@ -4568,7 +4530,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
return -EINVAL;
if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
return -EINVAL;
if (d_unlinked(new.dentry))
if (d_unlinked(new->dentry))
return -ENOENT;
if (new_mnt == root_mnt || old_mnt == root_mnt)
return -EBUSY; /* loop, on the same file system */
@ -4576,15 +4538,15 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
return -EINVAL; /* absolute root */
if (!path_mounted(&new))
if (!path_mounted(new))
return -EINVAL; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
return -EINVAL; /* absolute root */
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new))
if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new))
return -EINVAL;
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
if (!is_path_reachable(new_mnt, new->dentry, &root))
return -EINVAL;
lock_mount_hash();
umount_mnt(new_mnt);
@ -4603,10 +4565,55 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
unlock_mount_hash();
mnt_notify_add(root_mnt);
mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
chroot_fs_refs(&root, new);
return 0;
}
/*
* pivot_root Semantics:
* Moves the root file system of the current process to the directory put_old,
* makes new_root the new root file system of the current process, and sets
* root/cwd of all processes which had them on the current root to new_root.
*
* Restrictions:
* The new_root and put_old must be directories, and must not be on the
* same file system as the current process root. The put_old must be at or
* underneath new_root, i.e. adding a nonnegative number of /.. to the string
* pointed to by put_old must yield the same directory as new_root. No other
* file system may be mounted on put_old. After all, new_root is a mountpoint.
*
* The immutable nullfs filesystem is mounted as the true root of the VFS
* hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this,
* allowing pivot_root() to work normally from initramfs.
*
* Notes:
* - we don't move root/cwd if they are not at the root (reason: if something
* cared enough to change them, it's probably wrong to force them elsewhere)
* - it's okay to pick a root that isn't the root of a file system, e.g.
* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
* first.
*/
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
		const char __user *, put_old)
{
	/* Both lookups follow symlinks and insist on a directory. */
	const unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
	struct path new __free(path_put) = {};
	struct path old __free(path_put) = {};
	int ret;

	/* Resolve the user-supplied pathnames relative to the cwd. */
	ret = user_path_at(AT_FDCWD, new_root, lookup_flags, &new);
	if (!ret)
		ret = user_path_at(AT_FDCWD, put_old, lookup_flags, &old);
	if (ret)
		return ret;

	/* The actual pivot is done on the resolved paths. */
	return path_pivot_root(&new, &old);
}
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
unsigned int flags = mnt->mnt.mnt_flags;
@ -5969,24 +5976,62 @@ struct mnt_namespace init_mnt_ns = {
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
struct mount *m;
struct vfsmount *mnt, *nullfs_mnt;
struct mount *mnt_root;
struct path root;
/*
* We create two mounts:
*
* (1) nullfs with mount id 1
* (2) mutable rootfs with mount id 2
*
* with (2) mounted on top of (1).
*/
nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL);
if (IS_ERR(nullfs_mnt))
panic("VFS: Failed to create nullfs");
mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
m = real_mount(mnt);
init_mnt_ns.root = m;
init_mnt_ns.nr_mounts = 1;
mnt_add_to_ns(&init_mnt_ns, m);
VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1);
VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2);
/* The namespace root is the nullfs mnt. */
mnt_root = real_mount(nullfs_mnt);
init_mnt_ns.root = mnt_root;
/* Mount mutable rootfs on top of nullfs. */
root.mnt = nullfs_mnt;
root.dentry = nullfs_mnt->mnt_root;
LOCK_MOUNT_EXACT(mp, &root);
if (unlikely(IS_ERR(mp.parent)))
panic("VFS: Failed to mount rootfs on nullfs");
scoped_guard(mount_writer)
attach_mnt(real_mount(mnt), mp.parent, mp.mp);
pr_info("VFS: Finished mounting rootfs on nullfs\n");
/*
* We've dropped all locks here but that's fine. Not just are we
* the only task that's running, there's no other mount
* namespace in existence and the initial mount namespace is
* completely empty until we add the mounts we just created.
*/
for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) {
mnt_add_to_ns(&init_mnt_ns, p);
init_mnt_ns.nr_mounts++;
}
init_task.nsproxy->mnt_ns = &init_mnt_ns;
get_mnt_ns(&init_mnt_ns);
/* The root and pwd always point to the mutable rootfs. */
root.mnt = mnt;
root.dentry = mnt->mnt_root;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);

70
fs/nullfs.c Normal file
View file

@ -0,0 +1,70 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2026 Christian Brauner <brauner@kernel.org> */
#include <linux/fs/super_types.h>
#include <linux/fs_context.h>
#include <linux/magic.h>
/* Superblock operations for nullfs: only statfs is provided. */
static const struct super_operations nullfs_super_operations = {
	.statfs		= simple_statfs,
};
/*
 * nullfs_fs_fill_super - set up the nullfs superblock and its root directory
 * @s:  superblock to initialise
 * @fc: filesystem context (not used here)
 *
 * Fills in the superblock parameters and creates the single root inode as
 * an empty directory flagged S_IMMUTABLE, then installs it as s->s_root.
 *
 * Returns 0 on success, or -ENOMEM if the inode or the root dentry cannot
 * be allocated.
 */
static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
	struct inode *root_inode;

	/* Superblock parameters. */
	s->s_maxbytes		= MAX_LFS_FILESIZE;
	s->s_blocksize		= PAGE_SIZE;
	s->s_blocksize_bits	= PAGE_SHIFT;
	s->s_magic		= NULL_FS_MAGIC;
	s->s_op			= &nullfs_super_operations;
	s->s_export_op		= NULL;
	s->s_xattr		= NULL;
	s->s_time_gran		= 1;
	s->s_d_flags		= 0;

	root_inode = new_inode(s);
	if (!root_inode)
		return -ENOMEM;

	/* The root directory never has any entries ... */
	make_empty_dir_inode(root_inode);
	simple_inode_init_ts(root_inode);
	root_inode->i_ino = 1;
	/* ... and is marked immutable. */
	root_inode->i_flags |= S_IMMUTABLE;

	s->s_root = d_make_root(root_inode);
	return s->s_root ? 0 : -ENOMEM;
}
/*
 * nullfs_fs_get_tree - obtain the nullfs superblock for a mount
 * @fc: filesystem context being mounted
 *
 * nullfs currently exists as one global instance, hence
 * get_tree_single(). Should it ever become mountable by userspace it
 * will have to be turned into a multi-instance filesystem.
 *
 * Returns the result of get_tree_single().
 */
static int nullfs_fs_get_tree(struct fs_context *fc)
{
	return get_tree_single(fc, nullfs_fs_fill_super);
}
/* Filesystem context operations for nullfs: only get_tree is implemented. */
static const struct fs_context_operations nullfs_fs_context_ops = {
	.get_tree	= nullfs_fs_get_tree,
};
/*
 * nullfs_init_fs_context - prepare a filesystem context for nullfs
 * @fc: filesystem context to initialise
 *
 * Installs the nullfs context operations and presets the superblock flags:
 * SB_NOUSER in sb_flags, and SB_I_NOEXEC | SB_I_NODEV in s_iflags. The
 * context is also marked global.
 *
 * Always returns 0.
 */
static int nullfs_init_fs_context(struct fs_context *fc)
{
	fc->ops		= &nullfs_fs_context_ops;
	fc->global	= true;

	/* Flags applied to the superblock created from this context. */
	fc->sb_flags	= SB_NOUSER;
	fc->s_iflags	= SB_I_NOEXEC | SB_I_NODEV;

	return 0;
}
/*
 * Filesystem type descriptor for nullfs. Not static: it is declared
 * extern for use by the mount code that creates the nullfs root mount.
 */
struct file_system_type nullfs_fs_type = {
	.name			= "nullfs",
	.init_fs_context	= nullfs_init_fs_context,
	.kill_sb		= kill_anon_super,
};

View file

@ -17,3 +17,4 @@ int __init init_mkdir(const char *pathname, umode_t mode);
int __init init_rmdir(const char *pathname);
int __init init_utimes(char *filename, struct timespec64 *ts);
int __init init_dup(struct file *file);
int __init init_pivot_root(const char *new_root, const char *put_old);

View file

@ -104,5 +104,6 @@
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
#define PID_FS_MAGIC 0x50494446 /* "PIDF" */
#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */
#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */
#endif /* __LINUX_MAGIC_H__ */

View file

@ -483,8 +483,16 @@ void __init prepare_namespace(void)
wait_for_root(saved_root_name);
mount_root(saved_root_name);
devtmpfs_mount();
init_mount(".", "/", NULL, MS_MOVE, NULL);
init_chroot(".");
if (init_pivot_root(".", ".")) {
pr_err("VFS: Failed to pivot into new rootfs\n");
return;
}
if (init_umount(".", MNT_DETACH)) {
pr_err("VFS: Failed to unmount old rootfs\n");
return;
}
pr_info("VFS: Pivoted into new rootfs\n");
}
static bool is_tmpfs;