Merge patch series "mount: add OPEN_TREE_NAMESPACE"

Christian Brauner <brauner@kernel.org> says:

When creating containers the setup usually involves using CLONE_NEWNS
via clone3() or unshare(). This copies the caller's complete mount
namespace. The runtime will also assemble a new rootfs and then use
pivot_root() to switch the old mount tree with the new rootfs. Afterward
it will recursively umount the old mount tree thereby getting rid of all
mounts.

On a basic system here where the mount table isn't particularly large
this still copies about 30 mounts. Copying all of these mounts only to
get rid of them later is pretty wasteful.

This is exacerbated if intermediary mount namespaces are used that only
exist for a very short amount of time and are immediately destroyed
again causing a ton of mounts to be copied and destroyed needlessly.

With a large mount table and a system where thousands or tens of
thousands of namespaces are spawned in parallel this quickly becomes a
bottleneck, increasing contention on the namespace semaphore.

Extend open_tree() with a new OPEN_TREE_NAMESPACE flag. Similar to
OPEN_TREE_CLONE only the indicated mount tree is copied. Instead of
returning a file descriptor referring to that mount tree
OPEN_TREE_NAMESPACE will cause open_tree() to return a file descriptor
to a new mount namespace. In that new mount namespace the copied mount
tree has been mounted on top of a copy of the real rootfs.

The caller can setns() into that mount namespace and perform any
additional setup such as move_mount()ing detached mounts in there.

This allows OPEN_TREE_NAMESPACE to function as a combined
unshare(CLONE_NEWNS) and pivot_root().

A caller may for example choose to create an extremely minimal rootfs:

fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

This will create a mount namespace where "wootwoot" has become the
rootfs mounted on top of the real rootfs. The caller can now setns()
into this new mount namespace and assemble additional mounts.

This also works with user namespaces:

unshare(CLONE_NEWUSER);
fd_mntns = open_tree(-EBADF, "/var/lib/containers/wootwoot", OPEN_TREE_NAMESPACE);

which creates a new mount namespace owned by the earlier created user
namespace with "wootwoot" as the rootfs mounted on top of the real
rootfs.

This will scale a lot better when creating tons of mount namespaces and
will make it possible to get rid of a lot of unnecessary mount and umount
cycles. It also allows creating mount namespaces without needing to spawn
throwaway helper processes.

* patches from https://patch.msgid.link/20251229-work-empty-namespace-v1-0-bfb24c7b061f@kernel.org:
  selftests/open_tree: add OPEN_TREE_NAMESPACE tests
  mount: add OPEN_TREE_NAMESPACE

Link: https://patch.msgid.link/20251229-work-empty-namespace-v1-0-bfb24c7b061f@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
Christian Brauner 2026-01-12 13:51:32 +01:00
commit 1bce1a664a
No known key found for this signature in database
GPG key ID: 91C61BC06578DCA2
9 changed files with 1231 additions and 17 deletions

View file

@ -246,6 +246,7 @@ extern void mnt_pin_kill(struct mount *m);
*/
extern const struct dentry_operations ns_dentry_operations;
int open_namespace(struct ns_common *ns);
struct file *open_namespace_file(struct ns_common *ns);
/*
* fs/stat.c:

View file

@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
__unlock_mount(m);
}
static void lock_mount_exact(const struct path *path,
struct pinned_mountpoint *mp);
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
do_lock_mount((path), &mp, (beneath))
@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
return check_anonymous_mnt(mnt);
}
static struct mount *__do_loopback(const struct path *old_path, int recurse)
static struct mount *__do_loopback(const struct path *old_path,
unsigned int flags, unsigned int copy_flags)
{
struct mount *old = real_mount(old_path->mnt);
bool recurse = flags & AT_RECURSIVE;
if (IS_MNT_UNBINDABLE(old))
return ERR_PTR(-EINVAL);
@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
if (!recurse && __has_locked_children(old, old_path->dentry))
return ERR_PTR(-EINVAL);
/*
* When creating a new mount namespace we don't want to copy over
* mounts of mount namespaces to avoid the risk of cycles and also to
* minimize the default complex interdependencies between mount
* namespaces.
*
* We could ofc just check whether all mount namespace files aren't
* creating cycles but really let's keep this simple.
*/
if (!(flags & OPEN_TREE_NAMESPACE))
copy_flags |= CL_COPY_MNT_NS_FILE;
if (recurse)
return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
else
return clone_mnt(old, old_path->dentry, 0);
return copy_tree(old, old_path->dentry, copy_flags);
return clone_mnt(old, old_path->dentry, copy_flags);
}
/*
@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
{
struct path old_path __free(path_put) = {};
struct mount *mnt = NULL;
unsigned int flags = recurse ? AT_RECURSIVE : 0;
int err;
if (!old_name || !*old_name)
return -EINVAL;
err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
if (!check_mnt(mp.parent))
return -EINVAL;
mnt = __do_loopback(&old_path, recurse);
mnt = __do_loopback(&old_path, flags, 0);
if (IS_ERR(mnt))
return PTR_ERR(mnt);
@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
return err;
}
static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
{
struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
struct user_namespace *user_ns = mnt_ns->user_ns;
@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
ns->seq_origin = src_mnt_ns->ns.ns_id;
}
mnt = __do_loopback(path, recursive);
mnt = __do_loopback(path, flags, 0);
if (IS_ERR(mnt)) {
emptied_ns = ns;
return ERR_CAST(mnt);
@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
return ns;
}
static struct file *open_detached_copy(struct path *path, bool recursive)
static struct file *open_detached_copy(struct path *path, unsigned int flags)
{
struct mnt_namespace *ns = get_detached_copy(path, recursive);
struct mnt_namespace *ns = get_detached_copy(path, flags);
struct file *file;
if (IS_ERR(ns))
@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
/*
 * Cleanup helper "put_empty_mnt_ns" for struct mnt_namespace pointers,
 * for use with __free(): calls free_mnt_ns() on the pointer unless it is
 * NULL or an error pointer.
 */
DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
/*
 * create_new_namespace - build a new mount namespace around a copy of @path
 * @path: location of the mount tree to copy
 * @flags: open_tree() flags, passed through to __do_loopback()
 *
 * Allocates a mount namespace for the caller's current user namespace,
 * clones the root mount of the caller's mount namespace, and attaches a
 * copy of the tree at @path on top of that clone. Every mount reachable
 * from the cloned root is then added to the new namespace, and the cloned
 * root becomes the namespace's root.
 *
 * Returns the new namespace, or an error pointer propagated from the step
 * that failed (alloc_mnt_ns(), clone_mnt(), locking the mountpoint, or
 * __do_loopback()).
 */
static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
{
struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
struct path to_path __free(path_put) = {};
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct user_namespace *user_ns = current_user_ns();
struct mount *new_ns_root;
struct mount *mnt;
unsigned int copy_flags = 0;
bool locked = false;
/*
 * The caller's user namespace is not the owner of its current mount
 * namespace: request CL_SLAVE for both copies made below.
 */
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
new_ns = alloc_mnt_ns(user_ns, false);
if (IS_ERR(new_ns))
return ERR_CAST(new_ns);
scoped_guard(namespace_excl) {
/* Copy only the root mount of the caller's namespace, not its tree. */
new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
if (IS_ERR(new_ns_root))
return ERR_CAST(new_ns_root);
/*
* If the real rootfs had a locked mount on top of it somewhere
* in the stack, lock the new mount tree as well so it can't be
* exposed.
*/
mnt = ns->root;
while (mnt->overmount) {
mnt = mnt->overmount;
if (mnt->mnt.mnt_flags & MNT_LOCKED)
locked = true;
}
}
/*
* We dropped the namespace semaphore so we can actually lock
* the copy for mounting. The copied mount isn't attached to any
* mount namespace and it is thus excluded from any propagation.
* So realistically we're isolated and the mount can't be
* overmounted.
*/
/* Borrow the reference from clone_mnt(). */
to_path.mnt = &new_ns_root->mnt;
to_path.dentry = dget(new_ns_root->mnt.mnt_root);
/* Now lock for actual mounting. */
LOCK_MOUNT_EXACT(mp, &to_path);
if (unlikely(IS_ERR(mp.parent)))
return ERR_CAST(mp.parent);
/*
* We don't emulate unshare()ing a mount namespace. We stick to the
* restrictions of creating detached bind-mounts. It has a lot
* saner and simpler semantics.
*/
mnt = __do_loopback(path, flags, copy_flags);
if (IS_ERR(mnt))
return ERR_CAST(mnt);
scoped_guard(mount_writer) {
/* Carry over the MNT_LOCKED state observed on the real root's overmounts. */
if (locked)
mnt->mnt.mnt_flags |= MNT_LOCKED;
/*
* Now mount the detached tree on top of the copy of the
* real rootfs we created.
*/
attach_mnt(mnt, new_ns_root, mp.mp);
/* Same condition that selected CL_SLAVE above. */
if (user_ns != ns->user_ns)
lock_mnt_tree(new_ns_root);
}
/* Add all mounts to the new namespace. */
for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
mnt_add_to_ns(new_ns, p);
new_ns->nr_mounts++;
}
/* to_path.mnt is the cloned root; take it out of to_path and make it the namespace root. */
new_ns->root = real_mount(no_free_ptr(to_path.mnt));
ns_tree_add_raw(new_ns);
/* Success: return the namespace to the caller instead of letting __free() release it. */
return no_free_ptr(new_ns);
}
/*
 * Create a mount namespace from the tree at @path and return an open file
 * for it. A failure from create_new_namespace() is handed back to the
 * caller as an error pointer.
 */
static struct file *open_new_namespace(struct path *path, unsigned int flags)
{
	struct mnt_namespace *mntns = create_new_namespace(path, flags);

	if (!IS_ERR(mntns))
		return open_namespace_file(to_ns_common(mntns));

	return ERR_CAST(mntns);
}
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
int ret;
struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
AT_RECURSIVE)
return ERR_PTR(-EINVAL);
if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
/*
* If we create a new mount namespace with the cloned mount tree we
* just care about being privileged over our current user namespace.
* The new mount namespace will be owned by it.
*/
if ((flags & OPEN_TREE_NAMESPACE) &&
!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if ((flags & OPEN_TREE_CLONE) && !may_mount())
return ERR_PTR(-EPERM);
ret = user_path_at(dfd, filename, lookup_flags, &path);
if (unlikely(ret))
return ERR_PTR(ret);
if (detached)
return open_detached_copy(&path, flags & AT_RECURSIVE);
if (flags & OPEN_TREE_NAMESPACE)
return open_new_namespace(&path, flags);
if (flags & OPEN_TREE_CLONE)
return open_detached_copy(&path, flags);
return dentry_open(&path, O_PATH, current_cred());
}

View file

@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
return ns_get_path_cb(path, ns_get_path_task, &args);
}
struct file *open_namespace_file(struct ns_common *ns)
{
struct path path __free(path_put) = {};
int err;
/* call first to consume reference */
err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
if (err < 0)
return ERR_PTR(err);
return dentry_open(&path, O_RDONLY, current_cred());
}
/**
* open_namespace - open a namespace
* @ns: the namespace to open

View file

@ -61,7 +61,8 @@
/*
* open_tree() flags.
*/
#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */
#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */
#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */
#define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */
/*

View file

@ -0,0 +1 @@
open_tree_ns_test

View file

@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
# Name of the test program; presumably consumed by ../../lib.mk (confirm there).
TEST_GEN_PROGS := open_tree_ns_test
# All warnings are errors. KHDR_INCLUDES is not set in this file; it is
# expected to come from the surrounding build (verify against lib.mk).
CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
# The test links against libcap.
LDLIBS := -lcap
include ../../lib.mk
# The binary is built from the test source plus the shared ../utils.c helpers.
$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c
$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)

File diff suppressed because it is too large Load diff

View file

@ -515,6 +515,32 @@ int setup_userns(void)
return 0;
}
/*
 * enter_userns - move the caller into a new user namespace
 *
 * Calls unshare(CLONE_NEWUSER) and then maps id 0 inside the new user
 * namespace to the uid and gid the caller had before the unshare, one id
 * each. "deny" is written to /proc/self/setgroups before gid_map is
 * written.
 *
 * Returns 0 on success. On failure returns the non-zero result of the
 * step that failed, or -1 if a map line could not be formatted into the
 * local buffer.
 */
int enter_userns(void)
{
	int ret;
	char buf[32];
	uid_t uid = getuid();
	gid_t gid = getgid();

	ret = unshare(CLONE_NEWUSER);
	if (ret)
		return ret;

	/*
	 * Ids are unsigned: format them with %u so values above INT_MAX are
	 * not written as negative numbers, and bound the write to buf.
	 */
	ret = snprintf(buf, sizeof(buf), "0 %u 1", (unsigned int)uid);
	if (ret < 0 || ret >= (int)sizeof(buf))
		return -1;

	ret = write_file("/proc/self/uid_map", buf);
	if (ret)
		return ret;

	ret = write_file("/proc/self/setgroups", "deny");
	if (ret)
		return ret;

	ret = snprintf(buf, sizeof(buf), "0 %u 1", (unsigned int)gid);
	if (ret < 0 || ret >= (int)sizeof(buf))
		return -1;

	ret = write_file("/proc/self/gid_map", buf);
	if (ret)
		return ret;

	return 0;
}
/* caps_down - lower all effective caps */
int caps_down(void)
{

View file

@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down);
extern bool switch_ids(uid_t uid, gid_t gid);
extern int setup_userns(void);
extern int enter_userns(void);
static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
{