vfs-7.0-rc1.namespace

Please consider pulling these changes from the signed vfs-7.0-rc1.namespace tag. Thanks! Christian -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCaYX49gAKCRCRxhvAZXjc ovzgAP9BpqMQhMy2VCurru8/T5VAd6eJdgXzEfXqMksL5BNm8gEAsLx666KJNKgm Sh/yVA2KBjf51gvcLZ4gHOISaMU8bAI= =RGLf -----END PGP SIGNATURE----- Merge tag 'vfs-7.0-rc1.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs Pull vfs mount updates from Christian Brauner: - statmount: accept fd as a parameter Extend struct mnt_id_req with a file descriptor field and a new STATMOUNT_BY_FD flag. When set, statmount() returns mount information for the mount the fd resides on — including detached mounts (unmounted via umount2(MNT_DETACH)). For detached mounts the STATMOUNT_MNT_POINT and STATMOUNT_MNT_NS_ID mask bits are cleared since neither is meaningful. The capability check is skipped for STATMOUNT_BY_FD since holding an fd already implies prior access to the mount and equivalent information is available through fstatfs() and /proc/pid/mountinfo without privilege. Includes comprehensive selftests covering both attached and detached mount cases. - fs: Remove internal old mount API code (1 patch) Now that every in-tree filesystem has been converted to the new mount API, remove all the legacy shim code in fs_context.c that handled unconverted filesystems. This deletes ~280 lines including legacy_init_fs_context(), the legacy_fs_context struct, and associated wrappers. The mount(2) syscall path for userspace remains untouched. Documentation references to the legacy callbacks are cleaned up. - mount: add OPEN_TREE_NAMESPACE to open_tree() Container runtimes currently use CLONE_NEWNS to copy the caller's entire mount namespace — only to then pivot_root() and recursively unmount everything they just copied. With large mount tables and thousands of parallel container launches this creates significant contention on the namespace semaphore. OPEN_TREE_NAMESPACE copies only the specified mount tree (like OPEN_TREE_CLONE) but returns a mount namespace fd instead of a detached mount fd. The new namespace contains the copied tree mounted on top of a clone of the real rootfs. This functions as a combined unshare(CLONE_NEWNS) + pivot_root() in a single syscall. Works with user namespaces: an unshare(CLONE_NEWUSER) followed by OPEN_TREE_NAMESPACE creates a mount namespace owned by the new user namespace. Mount namespace file mounts are excluded from the copy to prevent cycles. Includes ~1000 lines of selftests" * tag 'vfs-7.0-rc1.namespace' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: selftests/open_tree: add OPEN_TREE_NAMESPACE tests mount: add OPEN_TREE_NAMESPACE fs: Remove internal old mount API code selftests: statmount: tests for STATMOUNT_BY_FD statmount: accept fd as a parameter statmount: permission check should return EPERM
2026-03-08 01:04:41 +01:00 · 2026-02-09 14:43:47 -08:00 · 2026-02-09 14:43:47 -08:00 · 157d3d6efd
commit 157d3d6efd
parent 8113b3998d 1bce1a664a
20 changed files with 1669 additions and 365 deletions
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@ -180,7 +180,6 @@ prototypes::
 	int (*freeze_fs) (struct super_block *);
 	int (*unfreeze_fs) (struct super_block *);
 	int (*statfs) (struct dentry *, struct kstatfs *);
-	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin) (struct super_block *);
 	int (*show_options)(struct seq_file *, struct dentry *);
 	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
@ -204,7 +203,6 @@ sync_fs:		read
 freeze_fs:		write
 unfreeze_fs:		write
 statfs:			maybe(read)	(see below)
-remount_fs:		write
 umount_begin:		no
 show_options:		no		(namespace_sem)
 quota_read:		no		(see below)
@ -229,8 +227,6 @@ file_system_type

 prototypes::

-	struct dentry *(*mount) (struct file_system_type *, int,
-		       const char *, void *);
 	void (*kill_sb) (struct super_block *);

 locking rules:
@ -238,13 +234,9 @@ locking rules:
 =======		=========
 ops		may block
 =======		=========
-mount		yes
 kill_sb		yes
 =======		=========

->mount() returns ERR_PTR or the root dentry; its superblock should be locked
-on return.
-
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.

--- a/Documentation/filesystems/mount_api.rst
+++ b/Documentation/filesystems/mount_api.rst
@ -299,8 +299,6 @@ manage the filesystem context.  They are as follows:
     On success it should return 0.  In the case of an error, it should return
     a negative error code.

-     .. Note:: reconfigure is intended as a replacement for remount_fs.
-

 Filesystem context Security
 ===========================
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@ -448,11 +448,8 @@ a file off.

 **mandatory**

->get_sb() is gone.  Switch to use of ->mount().  Typically it's just
-a matter of switching from calling ``get_sb_``... to ``mount_``... and changing
-the function type.  If you were doing it manually, just switch from setting
->mnt_root to some pointer to returning that pointer.  On errors return
-ERR_PTR(...).
+->get_sb() and ->mount() are gone. Switch to using the new mount API. See
+Documentation/filesystems/mount_api.rst for more details.

 ---

--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@ -94,11 +94,9 @@ functions:

 The passed struct file_system_type describes your filesystem.  When a
 request is made to mount a filesystem onto a directory in your
-namespace, the VFS will call the appropriate mount() method for the
-specific filesystem.  New vfsmount referring to the tree returned by
->mount() will be attached to the mountpoint, so that when pathname
-resolution reaches the mountpoint it will jump into the root of that
-vfsmount.
+namespace, the VFS will call the appropriate get_tree() method for the
+specific filesystem.  See Documentation/filesystems/mount_api.rst
+for more details.

 You can see all filesystems that are registered to the kernel in the
 file /proc/filesystems.
@ -117,8 +115,6 @@ members are defined:
 		int fs_flags;
 		int (*init_fs_context)(struct fs_context *);
 		const struct fs_parameter_spec *parameters;
-		struct dentry *(*mount) (struct file_system_type *, int,
-			const char *, void *);
 		void (*kill_sb) (struct super_block *);
 		struct module *owner;
 		struct file_system_type * next;
@ -151,10 +147,6 @@ members are defined:
 	'struct fs_parameter_spec'.
 	More info in Documentation/filesystems/mount_api.rst.

-``mount``
-	the method to call when a new instance of this filesystem should
-	be mounted
-
 ``kill_sb``
 	the method to call when an instance of this filesystem should be
 	shut down
@ -173,45 +165,6 @@ members are defined:
  s_lock_key, s_umount_key, s_vfs_rename_key, s_writers_key,
  i_lock_key, i_mutex_key, invalidate_lock_key, i_mutex_dir_key: lockdep-specific

-The mount() method has the following arguments:
-
-``struct file_system_type *fs_type``
-	describes the filesystem, partly initialized by the specific
-	filesystem code
-
-``int flags``
-	mount flags
-
-``const char *dev_name``
-	the device name we are mounting.
-
-``void *data``
-	arbitrary mount options, usually comes as an ASCII string (see
-	"Mount Options" section)
-
-The mount() method must return the root dentry of the tree requested by
-caller.  An active reference to its superblock must be grabbed and the
-superblock must be locked.  On failure it should return ERR_PTR(error).
-
-The arguments match those of mount(2) and their interpretation depends
-on filesystem type.  E.g. for block filesystems, dev_name is interpreted
-as block device name, that device is opened and if it contains a
-suitable filesystem image the method creates and initializes struct
-super_block accordingly, returning its root dentry to caller.
-
->mount() may choose to return a subtree of existing filesystem - it
-doesn't have to create a new one.  The main result from the caller's
-point of view is a reference to dentry at the root of (sub)tree to be
-attached; creation of new superblock is a common side effect.
-
-The most interesting member of the superblock structure that the mount()
-method fills in is the "s_op" field.  This is a pointer to a "struct
-super_operations" which describes the next level of the filesystem
-implementation.
-
-For more information on mounting (and the new mount API), see
-Documentation/filesystems/mount_api.rst.
-
 The Superblock Object
 =====================

@ -244,7 +197,6 @@ filesystem.  The following members are defined:
 					enum freeze_wholder who);
 		int (*unfreeze_fs) (struct super_block *);
 		int (*statfs) (struct dentry *, struct kstatfs *);
-		int (*remount_fs) (struct super_block *, int *, char *);
 		void (*umount_begin) (struct super_block *);

 		int (*show_options)(struct seq_file *, struct dentry *);
@ -351,10 +303,6 @@ or bottom half).
 ``statfs``
 	called when the VFS needs to get filesystem statistics.

-``remount_fs``
-	called when the filesystem is remounted.  This is called with
-	the kernel lock held
-
 ``umount_begin``
 	called when the VFS is unmounting a filesystem.

--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@ -24,20 +24,6 @@
 #include "mount.h"
 #include "internal.h"

-enum legacy_fs_param {
-	LEGACY_FS_UNSET_PARAMS,
-	LEGACY_FS_MONOLITHIC_PARAMS,
-	LEGACY_FS_INDIVIDUAL_PARAMS,
-};
-
-struct legacy_fs_context {
-	char			*legacy_data;	/* Data page for legacy filesystems */
-	size_t			data_size;
-	enum legacy_fs_param	param_type;
-};
-
-static int legacy_init_fs_context(struct fs_context *fc);
-
 static const struct constant_table common_set_sb_flag[] = {
 	{ "dirsync",	SB_DIRSYNC },
 	{ "lazytime",	SB_LAZYTIME },
@ -275,7 +261,6 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 				      unsigned int sb_flags_mask,
 				      enum fs_context_purpose purpose)
 {
-	int (*init_fs_context)(struct fs_context *);
 	struct fs_context *fc;
 	int ret = -ENOMEM;

@ -307,12 +292,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type,
 		break;
 	}

-	/* TODO: Make all filesystems support this unconditionally */
-	init_fs_context = fc->fs_type->init_fs_context;
-	if (!init_fs_context)
-		init_fs_context = legacy_init_fs_context;
-
-	ret = init_fs_context(fc);
+	ret = fc->fs_type->init_fs_context(fc);
 	if (ret < 0)
 		goto err_fc;
 	fc->need_free = true;
@ -376,8 +356,6 @@ void fc_drop_locked(struct fs_context *fc)
 	deactivate_locked_super(sb);
 }

-static void legacy_fs_context_free(struct fs_context *fc);
-
 /**
 * vfs_dup_fs_context - Duplicate a filesystem context.
 * @src_fc: The context to copy.
@ -531,184 +509,6 @@ void put_fs_context(struct fs_context *fc)
 }
 EXPORT_SYMBOL(put_fs_context);

-/*
- * Free the config for a filesystem that doesn't support fs_context.
- */
-static void legacy_fs_context_free(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-
-	if (ctx) {
-		if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS)
-			kfree(ctx->legacy_data);
-		kfree(ctx);
-	}
-}
-
-/*
- * Duplicate a legacy config.
- */
-static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
-{
-	struct legacy_fs_context *ctx;
-	struct legacy_fs_context *src_ctx = src_fc->fs_private;
-
-	ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL);
-	if (!ctx)
-		return -ENOMEM;
-
-	if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) {
-		ctx->legacy_data = kmemdup(src_ctx->legacy_data,
-					   src_ctx->data_size, GFP_KERNEL);
-		if (!ctx->legacy_data) {
-			kfree(ctx);
-			return -ENOMEM;
-		}
-	}
-
-	fc->fs_private = ctx;
-	return 0;
-}
-
-/*
- * Add a parameter to a legacy config.  We build up a comma-separated list of
- * options.
- */
-static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	unsigned int size = ctx->data_size;
-	size_t len = 0;
-	int ret;
-
-	ret = vfs_parse_fs_param_source(fc, param);
-	if (ret != -ENOPARAM)
-		return ret;
-
-	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
-		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
-
-	switch (param->type) {
-	case fs_value_is_string:
-		len = 1 + param->size;
-		fallthrough;
-	case fs_value_is_flag:
-		len += strlen(param->key);
-		break;
-	default:
-		return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported",
-			      param->key);
-	}
-
-	if (size + len + 2 > PAGE_SIZE)
-		return invalf(fc, "VFS: Legacy: Cumulative options too large");
-	if (strchr(param->key, ',') ||
-	    (param->type == fs_value_is_string &&
-	     memchr(param->string, ',', param->size)))
-		return invalf(fc, "VFS: Legacy: Option '%s' contained comma",
-			      param->key);
-	if (!ctx->legacy_data) {
-		ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
-		if (!ctx->legacy_data)
-			return -ENOMEM;
-	}
-
-	if (size)
-		ctx->legacy_data[size++] = ',';
-	len = strlen(param->key);
-	memcpy(ctx->legacy_data + size, param->key, len);
-	size += len;
-	if (param->type == fs_value_is_string) {
-		ctx->legacy_data[size++] = '=';
-		memcpy(ctx->legacy_data + size, param->string, param->size);
-		size += param->size;
-	}
-	ctx->legacy_data[size] = '\0';
-	ctx->data_size = size;
-	ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS;
-	return 0;
-}
-
-/*
- * Add monolithic mount data.
- */
-static int legacy_parse_monolithic(struct fs_context *fc, void *data)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-
-	if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) {
-		pr_warn("VFS: Can't mix monolithic and individual options\n");
-		return -EINVAL;
-	}
-
-	ctx->legacy_data = data;
-	ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS;
-	if (!ctx->legacy_data)
-		return 0;
-
-	if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA)
-		return 0;
-	return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security);
-}
-
-/*
- * Get a mountable root with the legacy mount command.
- */
-static int legacy_get_tree(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	struct super_block *sb;
-	struct dentry *root;
-
-	root = fc->fs_type->mount(fc->fs_type, fc->sb_flags,
-				      fc->source, ctx->legacy_data);
-	if (IS_ERR(root))
-		return PTR_ERR(root);
-
-	sb = root->d_sb;
-	BUG_ON(!sb);
-
-	fc->root = root;
-	return 0;
-}
-
-/*
- * Handle remount.
- */
-static int legacy_reconfigure(struct fs_context *fc)
-{
-	struct legacy_fs_context *ctx = fc->fs_private;
-	struct super_block *sb = fc->root->d_sb;
-
-	if (!sb->s_op->remount_fs)
-		return 0;
-
-	return sb->s_op->remount_fs(sb, &fc->sb_flags,
-				    ctx ? ctx->legacy_data : NULL);
-}
-
-const struct fs_context_operations legacy_fs_context_ops = {
-	.free			= legacy_fs_context_free,
-	.dup			= legacy_fs_context_dup,
-	.parse_param		= legacy_parse_param,
-	.parse_monolithic	= legacy_parse_monolithic,
-	.get_tree		= legacy_get_tree,
-	.reconfigure		= legacy_reconfigure,
-};
-
-/*
- * Initialise a legacy context for a filesystem that doesn't support
- * fs_context.
- */
-static int legacy_init_fs_context(struct fs_context *fc)
-{
-	fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT);
-	if (!fc->fs_private)
-		return -ENOMEM;
-	fc->ops = &legacy_fs_context_ops;
-	return 0;
-}
-
 int parse_monolithic_mount_data(struct fs_context *fc, void *data)
 {
 	int (*monolithic_mount_data)(struct fs_context *, void *);
@ -757,10 +557,8 @@ int finish_clean_context(struct fs_context *fc)
 	if (fc->phase != FS_CONTEXT_AWAITING_RECONF)
 		return 0;

-	if (fc->fs_type->init_fs_context)
-		error = fc->fs_type->init_fs_context(fc);
-	else
-		error = legacy_init_fs_context(fc);
+	error = fc->fs_type->init_fs_context(fc);
+
 	if (unlikely(error)) {
 		fc->phase = FS_CONTEXT_FAILED;
 		return error;
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@ -404,16 +404,6 @@ SYSCALL_DEFINE5(fsconfig,
 		return -EINVAL;

 	fc = fd_file(f)->private_data;
-	if (fc->ops == &legacy_fs_context_ops) {
-		switch (cmd) {
-		case FSCONFIG_SET_BINARY:
-		case FSCONFIG_SET_PATH:
-		case FSCONFIG_SET_PATH_EMPTY:
-		case FSCONFIG_SET_FD:
-		case FSCONFIG_CMD_CREATE_EXCL:
-			return -EOPNOTSUPP;
-		}
-	}

 	if (_key) {
 		param.key = strndup_user(_key, 256);
--- a/fs/internal.h
+++ b/fs/internal.h
@ -44,7 +44,6 @@ extern void __init chrdev_init(void);
 /*
 * fs_context.c
 */
-extern const struct fs_context_operations legacy_fs_context_ops;
 extern int parse_monolithic_mount_data(struct fs_context *, void *);
 extern void vfs_clean_context(struct fs_context *fc);
 extern int finish_clean_context(struct fs_context *fc);
@ -249,6 +248,7 @@ extern void mnt_pin_kill(struct mount *m);
 */
 extern const struct dentry_operations ns_dentry_operations;
 int open_namespace(struct ns_common *ns);
+struct file *open_namespace_file(struct ns_common *ns);

 /*
 * fs/stat.c:
--- a/fs/namespace.c
+++ b/fs/namespace.c
@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m)
 		__unlock_mount(m);
 }

+static void lock_mount_exact(const struct path *path,
+			     struct pinned_mountpoint *mp);
+
 #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
 	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
 	do_lock_mount((path), &mp, (beneath))
@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path)
 	return check_anonymous_mnt(mnt);
 }

-
-static struct mount *__do_loopback(const struct path *old_path, int recurse)
+static struct mount *__do_loopback(const struct path *old_path,
+				   unsigned int flags, unsigned int copy_flags)
 {
 	struct mount *old = real_mount(old_path->mnt);
+	bool recurse = flags & AT_RECURSIVE;

 	if (IS_MNT_UNBINDABLE(old))
 		return ERR_PTR(-EINVAL);
@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse)
 	if (!recurse && __has_locked_children(old, old_path->dentry))
 		return ERR_PTR(-EINVAL);

+	/*
+	 * When creating a new mount namespace we don't want to copy over
+	 * mounts of mount namespaces to avoid the risk of cycles and also to
+	 * minimize the default complex interdependencies between mount
+	 * namespaces.
+	 *
+	 * We could ofc just check whether all mount namespace files aren't
+	 * creating cycles but really let's keep this simple.
+	 */
+	if (!(flags & OPEN_TREE_NAMESPACE))
+		copy_flags |= CL_COPY_MNT_NS_FILE;
+
 	if (recurse)
-		return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
-	else
-		return clone_mnt(old, old_path->dentry, 0);
+		return copy_tree(old, old_path->dentry, copy_flags);
+
+	return clone_mnt(old, old_path->dentry, copy_flags);
 }

 /*
@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name,
 {
 	struct path old_path __free(path_put) = {};
 	struct mount *mnt = NULL;
+	unsigned int flags = recurse ? AT_RECURSIVE : 0;
 	int err;
+
 	if (!old_name || !*old_name)
 		return -EINVAL;
 	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	if (!check_mnt(mp.parent))
 		return -EINVAL;

-	mnt = __do_loopback(&old_path, recurse);
+	mnt = __do_loopback(&old_path, flags, 0);
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);

@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name,
 	return err;
 }

-static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive)
+static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
 {
 	struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
 	struct user_namespace *user_ns = mnt_ns->user_ns;
@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 			ns->seq_origin = src_mnt_ns->ns.ns_id;
 	}

-	mnt = __do_loopback(path, recursive);
+	mnt = __do_loopback(path, flags, 0);
 	if (IS_ERR(mnt)) {
 		emptied_ns = ns;
 		return ERR_CAST(mnt);
@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec
 	return ns;
 }

-static struct file *open_detached_copy(struct path *path, bool recursive)
+static struct file *open_detached_copy(struct path *path, unsigned int flags)
 {
-	struct mnt_namespace *ns = get_detached_copy(path, recursive);
+	struct mnt_namespace *ns = get_detached_copy(path, flags);
 	struct file *file;

 	if (IS_ERR(ns))
@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
 	return file;
 }

+DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *,
+	    if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T))
+
+static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL;
+	struct path to_path __free(path_put) = {};
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+	struct user_namespace *user_ns = current_user_ns();
+	struct mount *new_ns_root;
+	struct mount *mnt;
+	unsigned int copy_flags = 0;
+	bool locked = false;
+
+	if (user_ns != ns->user_ns)
+		copy_flags |= CL_SLAVE;
+
+	new_ns = alloc_mnt_ns(user_ns, false);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+
+	scoped_guard(namespace_excl) {
+		new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags);
+		if (IS_ERR(new_ns_root))
+			return ERR_CAST(new_ns_root);
+
+		/*
+		 * If the real rootfs had a locked mount on top of it somewhere
+		 * in the stack, lock the new mount tree as well so it can't be
+		 * exposed.
+		 */
+		mnt = ns->root;
+		while (mnt->overmount) {
+			mnt = mnt->overmount;
+			if (mnt->mnt.mnt_flags & MNT_LOCKED)
+				locked = true;
+		}
+	}
+
+	/*
+	 * We dropped the namespace semaphore so we can actually lock
+	 * the copy for mounting. The copied mount isn't attached to any
+	 * mount namespace and it is thus excluded from any propagation.
+	 * So realistically we're isolated and the mount can't be
+	 * overmounted.
+	 */
+
+	/* Borrow the reference from clone_mnt(). */
+	to_path.mnt = &new_ns_root->mnt;
+	to_path.dentry = dget(new_ns_root->mnt.mnt_root);
+
+	/* Now lock for actual mounting. */
+	LOCK_MOUNT_EXACT(mp, &to_path);
+	if (unlikely(IS_ERR(mp.parent)))
+		return ERR_CAST(mp.parent);
+
+	/*
+	 * We don't emulate unshare()ing a mount namespace. We stick to the
+	 * restrictions of creating detached bind-mounts. It has a lot
+	 * saner and simpler semantics.
+	 */
+	mnt = __do_loopback(path, flags, copy_flags);
+	if (IS_ERR(mnt))
+		return ERR_CAST(mnt);
+
+	scoped_guard(mount_writer) {
+		if (locked)
+			mnt->mnt.mnt_flags |= MNT_LOCKED;
+		/*
+		 * Now mount the detached tree on top of the copy of the
+		 * real rootfs we created.
+		 */
+		attach_mnt(mnt, new_ns_root, mp.mp);
+		if (user_ns != ns->user_ns)
+			lock_mnt_tree(new_ns_root);
+	}
+
+	/* Add all mounts to the new namespace. */
+	for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) {
+		mnt_add_to_ns(new_ns, p);
+		new_ns->nr_mounts++;
+	}
+
+	new_ns->root = real_mount(no_free_ptr(to_path.mnt));
+	ns_tree_add_raw(new_ns);
+	return no_free_ptr(new_ns);
+}
+
+static struct file *open_new_namespace(struct path *path, unsigned int flags)
+{
+	struct mnt_namespace *new_ns;
+
+	new_ns = create_new_namespace(path, flags);
+	if (IS_ERR(new_ns))
+		return ERR_CAST(new_ns);
+	return open_namespace_file(to_ns_common(new_ns));
+}
+
 static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
 {
 	int ret;
 	struct path path __free(path_put) = {};
 	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-	bool detached = flags & OPEN_TREE_CLONE;

 	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

 	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
 		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
-		      OPEN_TREE_CLOEXEC))
+		      OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
 		return ERR_PTR(-EINVAL);

-	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
+	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
+	    AT_RECURSIVE)
+		return ERR_PTR(-EINVAL);
+
+	if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
 		return ERR_PTR(-EINVAL);

 	if (flags & AT_NO_AUTOMOUNT)
@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned
 	if (flags & AT_EMPTY_PATH)
 		lookup_flags |= LOOKUP_EMPTY;

-	if (detached && !may_mount())
+	/*
+	 * If we create a new mount namespace with the cloned mount tree we
+	 * just care about being privileged over our current user namespace.
+	 * The new mount namespace will be owned by it.
+	 */
+	if ((flags & OPEN_TREE_NAMESPACE) &&
+	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if ((flags & OPEN_TREE_CLONE) && !may_mount())
 		return ERR_PTR(-EPERM);

 	ret = user_path_at(dfd, filename, lookup_flags, &path);
 	if (unlikely(ret))
 		return ERR_PTR(ret);

-	if (detached)
-		return open_detached_copy(&path, flags & AT_RECURSIVE);
+	if (flags & OPEN_TREE_NAMESPACE)
+		return open_new_namespace(&path, flags);
+
+	if (flags & OPEN_TREE_CLONE)
+		return open_detached_copy(&path, flags);

 	return dentry_open(&path, O_PATH, current_cred());
 }
@ -5554,31 +5685,49 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)

 /* locks: namespace_shared */
 static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
-			struct mnt_namespace *ns)
+                        struct file *mnt_file, struct mnt_namespace *ns)
 {
-	struct mount *m;
 	int err;

-	/* Has the namespace already been emptied? */
-	if (mnt_ns_id && mnt_ns_empty(ns))
-		return -ENOENT;
+	if (mnt_file) {
+		WARN_ON_ONCE(ns != NULL);

-	s->mnt = lookup_mnt_in_ns(mnt_id, ns);
-	if (!s->mnt)
-		return -ENOENT;
+		s->mnt = mnt_file->f_path.mnt;
+		ns = real_mount(s->mnt)->mnt_ns;
+		if (!ns)
+			/*
+			 * We can't set mount point and mnt_ns_id since we don't have a
+			 * ns for the mount. This can happen if the mount is unmounted
+			 * with MNT_DETACH.
+			 */
+			s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID);
+	} else {
+		/* Has the namespace already been emptied? */
+		if (mnt_ns_id && mnt_ns_empty(ns))
+			return -ENOENT;

-	err = grab_requested_root(ns, &s->root);
-	if (err)
-		return err;
+		s->mnt = lookup_mnt_in_ns(mnt_id, ns);
+		if (!s->mnt)
+			return -ENOENT;
+	}

-	/*
-	 * Don't trigger audit denials. We just want to determine what
-	 * mounts to show users.
-	 */
-	m = real_mount(s->mnt);
-	if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
-	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
-		return -EPERM;
+	if (ns) {
+		err = grab_requested_root(ns, &s->root);
+		if (err)
+			return err;
+
+		if (!mnt_file) {
+			struct mount *m;
+			/*
+			 * Don't trigger audit denials. We just want to determine what
+			 * mounts to show users.
+			 */
+			m = real_mount(s->mnt);
+			if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
+			    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+				return -EPERM;
+		}
+	}

 	err = security_sb_statfs(s->mnt->mnt_root);
 	if (err)
@ -5700,7 +5849,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
 }

 static int copy_mnt_id_req(const struct mnt_id_req __user *req,
-			   struct mnt_id_req *kreq)
+			   struct mnt_id_req *kreq, unsigned int flags)
 {
 	int ret;
 	size_t usize;
@ -5718,11 +5867,17 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req,
 	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
 	if (ret)
 		return ret;
-	if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
-		return -EINVAL;
-	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
-	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
-		return -EINVAL;
+
+	if (flags & STATMOUNT_BY_FD) {
+		if (kreq->mnt_id || kreq->mnt_ns_id)
+			return -EINVAL;
+	} else {
+		if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id)
+			return -EINVAL;
+		/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+		if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
+			return -EINVAL;
+	}
 	return 0;
 }

@ -5769,25 +5924,33 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
 {
 	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
 	struct kstatmount *ks __free(kfree) = NULL;
+	struct file *mnt_file __free(fput) = NULL;
 	struct mnt_id_req kreq;
 	/* We currently support retrieval of 3 strings. */
 	size_t seq_size = 3 * PATH_MAX;
 	int ret;

-	if (flags)
+	if (flags & ~STATMOUNT_BY_FD)
 		return -EINVAL;

-	ret = copy_mnt_id_req(req, &kreq);
+	ret = copy_mnt_id_req(req, &kreq, flags);
 	if (ret)
 		return ret;

-	ns = grab_requested_mnt_ns(&kreq);
-	if (IS_ERR(ns))
-		return PTR_ERR(ns);
+	if (flags & STATMOUNT_BY_FD) {
+		mnt_file = fget_raw(kreq.mnt_fd);
+		if (!mnt_file)
+			return -EBADF;
+		/* do_statmount sets ns in case of STATMOUNT_BY_FD */
+	} else {
+		ns = grab_requested_mnt_ns(&kreq);
+		if (IS_ERR(ns))
+			return PTR_ERR(ns);

-	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
-	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
-		return -ENOENT;
+		if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
+		    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+			return -EPERM;
+	}

 	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
 	if (!ks)
@ -5799,7 +5962,7 @@ retry:
 		return ret;

 	scoped_guard(namespace_shared)
-		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
+		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns);

 	if (!ret)
 		ret = copy_statmount_to_user(ks);
@ -5939,7 +6102,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
 	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
 		return -EFAULT;

-	ret = copy_mnt_id_req(req, &kreq);
+	ret = copy_mnt_id_req(req, &kreq, 0);
 	if (ret)
 		return ret;

--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task,
 	return ns_get_path_cb(path, ns_get_path_task, &args);
 }

+struct file *open_namespace_file(struct ns_common *ns)
+{
+	struct path path __free(path_put) = {};
+	int err;
+
+	/* call first to consume reference */
+	err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	return dentry_open(&path, O_RDONLY, current_cred());
+}
+
 /**
 * open_namespace - open a namespace
 * @ns: the namespace to open
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -2282,8 +2282,6 @@ struct file_system_type {
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
-	struct dentry *(*mount) (struct file_system_type *, int,
-		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
 	struct module *owner;
 	struct file_system_type * next;
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@ -97,7 +97,6 @@ struct super_operations {
 			  const void *owner);
 	int (*unfreeze_fs)(struct super_block *sb);
 	int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs);
-	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*umount_begin)(struct super_block *sb);

 	int (*show_options)(struct seq_file *seq, struct dentry *dentry);
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@ -61,7 +61,8 @@
 /*
 * open_tree() flags.
 */
-#define OPEN_TREE_CLONE		1		/* Clone the target tree and attach the clone */
+#define OPEN_TREE_CLONE		(1 << 0)	/* Clone the target tree and attach the clone */
+#define OPEN_TREE_NAMESPACE	(1 << 1)	/* Clone the target tree into a new mount namespace */
 #define OPEN_TREE_CLOEXEC	O_CLOEXEC	/* Close the file on execve() */

 /*
@ -197,7 +198,10 @@ struct statmount {
 */
 struct mnt_id_req {
 	__u32 size;
-	__u32 mnt_ns_fd;
+	union {
+		__u32 mnt_ns_fd;
+		__u32 mnt_fd;
+	};
 	__u64 mnt_id;
 	__u64 param;
 	__u64 mnt_ns_id;
@ -232,4 +236,9 @@ struct mnt_id_req {
 #define LSMT_ROOT		0xffffffffffffffff	/* root mount */
 #define LISTMOUNT_REVERSE	(1 << 0) /* List later mounts first */

+/*
+ * @flag bits for statmount(2)
+ */
+#define STATMOUNT_BY_FD		0x00000001U	/* want mountinfo for given fd */
+
 #endif /* _UAPI_LINUX_MOUNT_H */
--- a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore
+++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore
@ -0,0 +1 @@
+open_tree_ns_test
--- a/tools/testing/selftests/filesystems/open_tree_ns/Makefile
+++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile
@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := open_tree_ns_test
+
+CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES)
+LDLIBS := -lcap
+
+include ../../lib.mk
+
+$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c
+	$(CC) $(CFLAGS) -o $@ $^ $(LDLIBS)
--- a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
+++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c
--- a/tools/testing/selftests/filesystems/statmount/statmount.h
+++ b/tools/testing/selftests/filesystems/statmount/statmount.h
@ -43,19 +43,24 @@
 	#endif
 #endif

-static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
-			    struct statmount *buf, size_t bufsize,
+static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd,
+			    uint64_t mask, struct statmount *buf, size_t bufsize,
 			    unsigned int flags)
 {
 	struct mnt_id_req req = {
 		.size = MNT_ID_REQ_SIZE_VER0,
-		.mnt_id = mnt_id,
 		.param = mask,
 	};

-	if (mnt_ns_id) {
+	if (flags & STATMOUNT_BY_FD) {
 		req.size = MNT_ID_REQ_SIZE_VER1;
-		req.mnt_ns_id = mnt_ns_id;
+		req.mnt_fd = fd;
+	} else {
+		req.mnt_id = mnt_id;
+		if (mnt_ns_id) {
+			req.size = MNT_ID_REQ_SIZE_VER1;
+			req.mnt_ns_id = mnt_ns_id;
+		}
 	}

 	return syscall(__NR_statmount, &req, buf, bufsize, flags);
--- a/tools/testing/selftests/filesystems/statmount/statmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c
@ -33,15 +33,24 @@ static const char *const known_fs[] = {
 	"sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf",
 	"vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL };

-static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags)
+static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags)
 {
 	size_t bufsize = 1 << 15;
-	struct statmount *buf = NULL, *tmp = alloca(bufsize);
+	struct statmount *buf = NULL, *tmp = NULL;
 	int tofree = 0;
 	int ret;

+	if (flags & STATMOUNT_BY_FD && fd < 0)
+		return NULL;
+
+	tmp = alloca(bufsize);
+
 	for (;;) {
-		ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags);
+		if (flags & STATMOUNT_BY_FD)
+			ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags);
+		else
+			ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags);
+
 		if (ret != -1)
 			break;
 		if (tofree)
@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void)
 	struct statmount sm;
 	int ret;

-	ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount zero mask: %s\n",
 				      strerror(errno));
@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void)
 	int ret;
 	uint64_t mask = STATMOUNT_MNT_BASIC;

-	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount mnt basic: %s\n",
 				      strerror(errno));
@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void)
 	struct statx sx;
 	struct statfs sf;

-	ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount sb basic: %s\n",
 				      strerror(errno));
@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void)
 {
 	struct statmount *sm;

-	sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mount point: %s\n",
 				      strerror(errno));
@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void)
 	assert(last_dir);
 	last_dir++;

-	sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mount root: %s\n",
 				      strerror(errno));
@ -438,7 +447,7 @@ static void test_statmount_fs_type(void)
 	const char *fs_type;
 	const char *const *s;

-	sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0);
+	sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount fs type: %s\n",
 				      strerror(errno));
@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void)
 	char *line = NULL;
 	size_t len = 0;

-	sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
+	sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS,
 			     0);
 	if (!sm) {
 		ksft_test_result_fail("statmount mnt opts: %s\n",
@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 	uint32_t start, i;
 	int ret;

-	sm = statmount_alloc(root_id, mask, 0);
+	sm = statmount_alloc(root_id, 0, mask, 0);
 	if (!sm) {
 		ksft_test_result_fail("statmount %s: %s\n", name,
 				      strerror(errno));
@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name)
 	exactsize = sm->size;
 	shortsize = sizeof(*sm) + i;

-	ret = statmount(root_id, 0, mask, sm, exactsize, 0);
+	ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0);
 	if (ret == -1) {
 		ksft_test_result_fail("statmount exact size: %s\n",
 				      strerror(errno));
 		goto out;
 	}
 	errno = 0;
-	ret = statmount(root_id, 0, mask, sm, shortsize, 0);
+	ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0);
 	if (ret != -1 || errno != EOVERFLOW) {
 		ksft_test_result_fail("should have failed with EOVERFLOW: %s\n",
 				      strerror(errno));
@ -658,6 +667,226 @@ static void test_listmount_tree(void)
 	ksft_test_result_pass("listmount tree\n");
 }

+static void test_statmount_by_fd(void)
+{
+	struct statmount *sm = NULL;
+	char tmpdir[] = "/statmount.fd.XXXXXX";
+	const char root[] = "/test";
+	char subdir[PATH_MAX], tmproot[PATH_MAX];
+	int fd;
+
+	if (!mkdtemp(tmpdir)) {
+		ksft_perror("mkdtemp");
+		return;
+	}
+
+	if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) {
+		ksft_perror("mount");
+		rmdir(tmpdir);
+		return;
+	}
+
+	snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root);
+	snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot");
+
+	if (mkdir(subdir, 0755)) {
+		ksft_perror("mkdir");
+		goto err_tmpdir;
+	}
+
+	if (mount(subdir, subdir, NULL, MS_BIND, 0)) {
+		ksft_perror("mount");
+		goto err_subdir;
+	}
+
+	if (mkdir(tmproot, 0755)) {
+		ksft_perror("mkdir");
+		goto err_subdir;
+	}
+
+	fd = open(subdir, O_PATH);
+	if (fd < 0) {
+		ksft_perror("open");
+		goto err_tmproot;
+	}
+
+	if (chroot(tmproot)) {
+		ksft_perror("chroot");
+		goto err_fd;
+	}
+
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
+		goto err_chroot;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto err_chroot;
+	}
+
+	if (sm->mask & STATMOUNT_MNT_POINT) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n");
+		goto err_chroot;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n");
+		goto err_chroot;
+	}
+
+	if (strcmp(root, sm->str + sm->mnt_root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n",
+			sm->str + sm->mnt_root, root);
+		goto err_chroot;
+	}
+
+	if (chroot(".")) {
+		ksft_perror("chroot");
+		goto out;
+	}
+
+	free(sm);
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno));
+		goto err_fd;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto out;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_POINT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n");
+		goto out;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n");
+		goto out;
+	}
+
+	if (strcmp(subdir, sm->str + sm->mnt_point) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_point,"
+			"statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir);
+		goto out;
+	}
+
+	if (strcmp(root, sm->str + sm->mnt_root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root);
+		goto out;
+	}
+
+	ksft_test_result_pass("statmount by fd\n");
+	goto out;
+err_chroot:
+	chroot(".");
+out:
+	free(sm);
+err_fd:
+	close(fd);
+err_tmproot:
+	rmdir(tmproot);
+err_subdir:
+	umount2(subdir, MNT_DETACH);
+	rmdir(subdir);
+err_tmpdir:
+	umount2(tmpdir, MNT_DETACH);
+	rmdir(tmpdir);
+}
+
+static void test_statmount_by_fd_unmounted(void)
+{
+	const char root[] = "/test.unmounted";
+	char tmpdir[] = "/statmount.fd.XXXXXX";
+	char subdir[PATH_MAX];
+	int fd;
+	struct statmount *sm = NULL;
+
+	if (!mkdtemp(tmpdir)) {
+		ksft_perror("mkdtemp");
+		return;
+	}
+
+	if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) {
+		ksft_perror("mount");
+		rmdir(tmpdir);
+		return;
+	}
+
+	snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root);
+
+	if (mkdir(subdir, 0755)) {
+		ksft_perror("mkdir");
+		goto err_tmpdir;
+	}
+
+	if (mount(subdir, subdir, 0, MS_BIND, NULL)) {
+		ksft_perror("mount");
+		goto err_subdir;
+	}
+
+	fd = open(subdir, O_PATH);
+	if (fd < 0) {
+		ksft_perror("open");
+		goto err_subdir;
+	}
+
+	if (umount2(tmpdir, MNT_DETACH)) {
+		ksft_perror("umount2");
+		goto err_fd;
+	}
+
+	sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD);
+	if (!sm) {
+		ksft_test_result_fail("statmount by fd unmounted: %s\n",
+				      strerror(errno));
+		goto err_sm;
+	}
+
+	if (sm->size < sizeof(*sm)) {
+		ksft_test_result_fail("unexpected size: %u < %u\n",
+				      sm->size, (uint32_t) sizeof(*sm));
+		goto err_sm;
+	}
+
+	if (sm->mask & STATMOUNT_MNT_POINT) {
+		ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n");
+		goto err_sm;
+	}
+
+	if (!(sm->mask & STATMOUNT_MNT_ROOT)) {
+		ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n");
+		goto err_sm;
+	}
+
+	if (strcmp(sm->str + sm->mnt_root, root) != 0) {
+		ksft_test_result_fail("statmount returned incorrect mnt_root,"
+			"statmount mnt_root: %s != %s\n",
+			sm->str + sm->mnt_root, root);
+		goto err_sm;
+	}
+
+	ksft_test_result_pass("statmount by fd on unmounted mount\n");
+err_sm:
+	free(sm);
+err_fd:
+	close(fd);
+err_subdir:
+	umount2(subdir, MNT_DETACH);
+	rmdir(subdir);
+err_tmpdir:
+	umount2(tmpdir, MNT_DETACH);
+	rmdir(tmpdir);
+}
+
 #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t))

 int main(void)
@ -669,14 +898,14 @@ int main(void)

 	ksft_print_header();

-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	assert(ret == -1);
 	if (errno == ENOSYS)
 		ksft_exit_skip("statmount() syscall not supported\n");

 	setup_namespace();

-	ksft_set_plan(15);
+	ksft_set_plan(17);
 	test_listmount_empty_root();
 	test_statmount_zero_mask();
 	test_statmount_mnt_basic();
@ -693,6 +922,8 @@ int main(void)
 	test_statmount_string(all_mask, str_off(fs_type), "fs type & all");

 	test_listmount_tree();
+	test_statmount_by_fd_unmounted();
+	test_statmount_by_fd();


 	if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0)
--- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
+++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c
@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void)
 	if (!root_id)
 		return NSID_ERROR;

-	ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
+	ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0);
 	if (ret == -1) {
 		ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
 		return NSID_ERROR;
@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void)
 	return NSID_PASS;
 }

+static int _test_statmount_mnt_ns_id_by_fd(void)
+{
+	struct statmount sm;
+	uint64_t mnt_ns_id;
+	int ret, fd, mounted = 1, status = NSID_ERROR;
+	char mnt[] = "/statmount.fd.XXXXXX";
+
+	ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id);
+	if (ret != NSID_PASS)
+		return ret;
+
+	if (!mkdtemp(mnt)) {
+		ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno));
+		return NSID_ERROR;
+	}
+
+	if (mount(mnt, mnt, NULL, MS_BIND, 0)) {
+		ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto err;
+	}
+
+	fd = open(mnt, O_PATH);
+	if (fd < 0) {
+		ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno));
+		goto err;
+	}
+
+	ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD);
+	if (ret == -1) {
+		ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto out;
+	}
+
+	if (sm.size != sizeof(sm)) {
+		ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+			       (uint32_t)sizeof(sm));
+		status = NSID_FAIL;
+		goto out;
+	}
+	if (sm.mask != STATMOUNT_MNT_NS_ID) {
+		ksft_print_msg("statmount mnt ns id unavailable\n");
+		status = NSID_SKIP;
+		goto out;
+	}
+
+	if (sm.mnt_ns_id != mnt_ns_id) {
+		ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n",
+			       (unsigned long long)sm.mnt_ns_id,
+			       (unsigned long long)mnt_ns_id);
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	mounted = 0;
+	if (umount2(mnt, MNT_DETACH)) {
+		ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno));
+		goto out;
+	}
+
+	ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD);
+	if (ret == -1) {
+		ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno));
+		status = NSID_ERROR;
+		goto out;
+	}
+
+	if (sm.size != sizeof(sm)) {
+		ksft_print_msg("unexpected size: %u != %u\n", sm.size,
+			       (uint32_t)sizeof(sm));
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	if (sm.mask == STATMOUNT_MNT_NS_ID) {
+		ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n");
+		status = NSID_FAIL;
+		goto out;
+	}
+
+	status = NSID_PASS;
+out:
+	close(fd);
+	if (mounted)
+		umount2(mnt, MNT_DETACH);
+err:
+	rmdir(mnt);
+	return status;
+}
+
+
 static void test_statmount_mnt_ns_id(void)
 {
 	pid_t pid;
@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void)
 	if (ret != NSID_PASS)
 		exit(ret);
 	ret = _test_statmount_mnt_ns_id();
+	if (ret != NSID_PASS)
+		exit(ret);
+	ret = _test_statmount_mnt_ns_id_by_fd();
 	exit(ret);
 }

@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts)
 	for (int i = 0; i < nr_mounts; i++) {
 		struct statmount sm;

-		ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm,
+		ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm,
 				sizeof(sm), 0);
 		if (ret < 0) {
 			ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno));
@ -275,7 +370,7 @@ int main(void)
 	int ret;

 	ksft_print_header();
-	ret = statmount(0, 0, 0, NULL, 0, 0);
+	ret = statmount(0, 0, 0, 0, NULL, 0, 0);
 	assert(ret == -1);
 	if (errno == ENOSYS)
 		ksft_exit_skip("statmount() syscall not supported\n");
--- a/tools/testing/selftests/filesystems/utils.c
+++ b/tools/testing/selftests/filesystems/utils.c
@ -515,6 +515,32 @@ int setup_userns(void)
 	return 0;
 }

+int enter_userns(void)
+{
+	int ret;
+	char buf[32];
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+
+	ret = unshare(CLONE_NEWUSER);
+	if (ret)
+		return ret;
+
+	sprintf(buf, "0 %d 1", uid);
+	ret = write_file("/proc/self/uid_map", buf);
+	if (ret)
+		return ret;
+	ret = write_file("/proc/self/setgroups", "deny");
+	if (ret)
+		return ret;
+	sprintf(buf, "0 %d 1", gid);
+	ret = write_file("/proc/self/gid_map", buf);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
 /* caps_down - lower all effective caps */
 int caps_down(void)
 {
--- a/tools/testing/selftests/filesystems/utils.h
+++ b/tools/testing/selftests/filesystems/utils.h
@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down);

 extern bool switch_ids(uid_t uid, gid_t gid);
 extern int setup_userns(void);
+extern int enter_userns(void);

 static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps)
 {