diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 3837891e933d..8025df6e6499 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -180,7 +180,6 @@ prototypes:: int (*freeze_fs) (struct super_block *); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); @@ -204,7 +203,6 @@ sync_fs: read freeze_fs: write unfreeze_fs: write statfs: maybe(read) (see below) -remount_fs: write umount_begin: no show_options: no (namespace_sem) quota_read: no (see below) @@ -229,8 +227,6 @@ file_system_type prototypes:: - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); void (*kill_sb) (struct super_block *); locking rules: @@ -238,13 +234,9 @@ locking rules: ======= ========= ops may block ======= ========= -mount yes kill_sb yes ======= ========= -->mount() returns ERR_PTR or the root dentry; its superblock should be locked -on return. - ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index c99ab1f7fea4..a064234fed5b 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -299,8 +299,6 @@ manage the filesystem context. They are as follows: On success it should return 0. In the case of an error, it should return a negative error code. - .. Note:: reconfigure is intended as a replacement for remount_fs. 
- Filesystem context Security =========================== diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index c0f7103628ab..ed3ac56e3c76 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -448,11 +448,8 @@ a file off. **mandatory** -->get_sb() is gone. Switch to use of ->mount(). Typically it's just -a matter of switching from calling ``get_sb_``... to ``mount_``... and changing -the function type. If you were doing it manually, just switch from setting -->mnt_root to some pointer to returning that pointer. On errors return -ERR_PTR(...). +->get_sb() and ->mount() are gone. Switch to using the new mount API. See +Documentation/filesystems/mount_api.rst for more details. --- diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 85654eb91594..7c753148af88 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -94,11 +94,9 @@ functions: The passed struct file_system_type describes your filesystem. When a request is made to mount a filesystem onto a directory in your -namespace, the VFS will call the appropriate mount() method for the -specific filesystem. New vfsmount referring to the tree returned by -->mount() will be attached to the mountpoint, so that when pathname -resolution reaches the mountpoint it will jump into the root of that -vfsmount. +namespace, the VFS will call the appropriate get_tree() method for the +specific filesystem. See Documentation/filesystems/mount_api.rst +for more details. You can see all filesystems that are registered to the kernel in the file /proc/filesystems. 
@@ -117,8 +115,6 @@ members are defined: int fs_flags; int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; @@ -151,10 +147,6 @@ members are defined: 'struct fs_parameter_spec'. More info in Documentation/filesystems/mount_api.rst. -``mount`` - the method to call when a new instance of this filesystem should - be mounted - ``kill_sb`` the method to call when an instance of this filesystem should be shut down @@ -173,45 +165,6 @@ members are defined: s_lock_key, s_umount_key, s_vfs_rename_key, s_writers_key, i_lock_key, i_mutex_key, invalidate_lock_key, i_mutex_dir_key: lockdep-specific -The mount() method has the following arguments: - -``struct file_system_type *fs_type`` - describes the filesystem, partly initialized by the specific - filesystem code - -``int flags`` - mount flags - -``const char *dev_name`` - the device name we are mounting. - -``void *data`` - arbitrary mount options, usually comes as an ASCII string (see - "Mount Options" section) - -The mount() method must return the root dentry of the tree requested by -caller. An active reference to its superblock must be grabbed and the -superblock must be locked. On failure it should return ERR_PTR(error). - -The arguments match those of mount(2) and their interpretation depends -on filesystem type. E.g. for block filesystems, dev_name is interpreted -as block device name, that device is opened and if it contains a -suitable filesystem image the method creates and initializes struct -super_block accordingly, returning its root dentry to caller. - -->mount() may choose to return a subtree of existing filesystem - it -doesn't have to create a new one. 
The main result from the caller's -point of view is a reference to dentry at the root of (sub)tree to be -attached; creation of new superblock is a common side effect. - -The most interesting member of the superblock structure that the mount() -method fills in is the "s_op" field. This is a pointer to a "struct -super_operations" which describes the next level of the filesystem -implementation. - -For more information on mounting (and the new mount API), see -Documentation/filesystems/mount_api.rst. - The Superblock Object ===================== @@ -244,7 +197,6 @@ filesystem. The following members are defined: enum freeze_wholder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); @@ -351,10 +303,6 @@ or bottom half). ``statfs`` called when the VFS needs to get filesystem statistics. -``remount_fs`` - called when the filesystem is remounted. This is called with - the kernel lock held - ``umount_begin`` called when the VFS is unmounting a filesystem. 
diff --git a/fs/fs_context.c b/fs/fs_context.c index 93b7ebf8d927..81ed94f46cac 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -24,20 +24,6 @@ #include "mount.h" #include "internal.h" -enum legacy_fs_param { - LEGACY_FS_UNSET_PARAMS, - LEGACY_FS_MONOLITHIC_PARAMS, - LEGACY_FS_INDIVIDUAL_PARAMS, -}; - -struct legacy_fs_context { - char *legacy_data; /* Data page for legacy filesystems */ - size_t data_size; - enum legacy_fs_param param_type; -}; - -static int legacy_init_fs_context(struct fs_context *fc); - static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, @@ -275,7 +261,6 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, unsigned int sb_flags_mask, enum fs_context_purpose purpose) { - int (*init_fs_context)(struct fs_context *); struct fs_context *fc; int ret = -ENOMEM; @@ -307,12 +292,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, break; } - /* TODO: Make all filesystems support this unconditionally */ - init_fs_context = fc->fs_type->init_fs_context; - if (!init_fs_context) - init_fs_context = legacy_init_fs_context; - - ret = init_fs_context(fc); + ret = fc->fs_type->init_fs_context(fc); if (ret < 0) goto err_fc; fc->need_free = true; @@ -376,8 +356,6 @@ void fc_drop_locked(struct fs_context *fc) deactivate_locked_super(sb); } -static void legacy_fs_context_free(struct fs_context *fc); - /** * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. @@ -531,184 +509,6 @@ void put_fs_context(struct fs_context *fc) } EXPORT_SYMBOL(put_fs_context); -/* - * Free the config for a filesystem that doesn't support fs_context. - */ -static void legacy_fs_context_free(struct fs_context *fc) -{ - struct legacy_fs_context *ctx = fc->fs_private; - - if (ctx) { - if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) - kfree(ctx->legacy_data); - kfree(ctx); - } -} - -/* - * Duplicate a legacy config. 
- */ -static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) -{ - struct legacy_fs_context *ctx; - struct legacy_fs_context *src_ctx = src_fc->fs_private; - - ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { - ctx->legacy_data = kmemdup(src_ctx->legacy_data, - src_ctx->data_size, GFP_KERNEL); - if (!ctx->legacy_data) { - kfree(ctx); - return -ENOMEM; - } - } - - fc->fs_private = ctx; - return 0; -} - -/* - * Add a parameter to a legacy config. We build up a comma-separated list of - * options. - */ -static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct legacy_fs_context *ctx = fc->fs_private; - unsigned int size = ctx->data_size; - size_t len = 0; - int ret; - - ret = vfs_parse_fs_param_source(fc, param); - if (ret != -ENOPARAM) - return ret; - - if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) - return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); - - switch (param->type) { - case fs_value_is_string: - len = 1 + param->size; - fallthrough; - case fs_value_is_flag: - len += strlen(param->key); - break; - default: - return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", - param->key); - } - - if (size + len + 2 > PAGE_SIZE) - return invalf(fc, "VFS: Legacy: Cumulative options too large"); - if (strchr(param->key, ',') || - (param->type == fs_value_is_string && - memchr(param->string, ',', param->size))) - return invalf(fc, "VFS: Legacy: Option '%s' contained comma", - param->key); - if (!ctx->legacy_data) { - ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!ctx->legacy_data) - return -ENOMEM; - } - - if (size) - ctx->legacy_data[size++] = ','; - len = strlen(param->key); - memcpy(ctx->legacy_data + size, param->key, len); - size += len; - if (param->type == fs_value_is_string) { - ctx->legacy_data[size++] = '='; - memcpy(ctx->legacy_data + size, 
param->string, param->size); - size += param->size; - } - ctx->legacy_data[size] = '\0'; - ctx->data_size = size; - ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; - return 0; -} - -/* - * Add monolithic mount data. - */ -static int legacy_parse_monolithic(struct fs_context *fc, void *data) -{ - struct legacy_fs_context *ctx = fc->fs_private; - - if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { - pr_warn("VFS: Can't mix monolithic and individual options\n"); - return -EINVAL; - } - - ctx->legacy_data = data; - ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; - if (!ctx->legacy_data) - return 0; - - if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) - return 0; - return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); -} - -/* - * Get a mountable root with the legacy mount command. - */ -static int legacy_get_tree(struct fs_context *fc) -{ - struct legacy_fs_context *ctx = fc->fs_private; - struct super_block *sb; - struct dentry *root; - - root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, - fc->source, ctx->legacy_data); - if (IS_ERR(root)) - return PTR_ERR(root); - - sb = root->d_sb; - BUG_ON(!sb); - - fc->root = root; - return 0; -} - -/* - * Handle remount. - */ -static int legacy_reconfigure(struct fs_context *fc) -{ - struct legacy_fs_context *ctx = fc->fs_private; - struct super_block *sb = fc->root->d_sb; - - if (!sb->s_op->remount_fs) - return 0; - - return sb->s_op->remount_fs(sb, &fc->sb_flags, - ctx ? ctx->legacy_data : NULL); -} - -const struct fs_context_operations legacy_fs_context_ops = { - .free = legacy_fs_context_free, - .dup = legacy_fs_context_dup, - .parse_param = legacy_parse_param, - .parse_monolithic = legacy_parse_monolithic, - .get_tree = legacy_get_tree, - .reconfigure = legacy_reconfigure, -}; - -/* - * Initialise a legacy context for a filesystem that doesn't support - * fs_context. 
- */ -static int legacy_init_fs_context(struct fs_context *fc) -{ - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); - if (!fc->fs_private) - return -ENOMEM; - fc->ops = &legacy_fs_context_ops; - return 0; -} - int parse_monolithic_mount_data(struct fs_context *fc, void *data) { int (*monolithic_mount_data)(struct fs_context *, void *); @@ -757,10 +557,8 @@ int finish_clean_context(struct fs_context *fc) if (fc->phase != FS_CONTEXT_AWAITING_RECONF) return 0; - if (fc->fs_type->init_fs_context) - error = fc->fs_type->init_fs_context(fc); - else - error = legacy_init_fs_context(fc); + error = fc->fs_type->init_fs_context(fc); + if (unlikely(error)) { fc->phase = FS_CONTEXT_FAILED; return error; diff --git a/fs/fsopen.c b/fs/fsopen.c index f645c99204eb..622ee3926cd5 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -404,16 +404,6 @@ SYSCALL_DEFINE5(fsconfig, return -EINVAL; fc = fd_file(f)->private_data; - if (fc->ops == &legacy_fs_context_ops) { - switch (cmd) { - case FSCONFIG_SET_BINARY: - case FSCONFIG_SET_PATH: - case FSCONFIG_SET_PATH_EMPTY: - case FSCONFIG_SET_FD: - case FSCONFIG_CMD_CREATE_EXCL: - return -EOPNOTSUPP; - } - } if (_key) { param.key = strndup_user(_key, 256); diff --git a/fs/internal.h b/fs/internal.h index 9514d80ef5c4..5ec0dd514185 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -44,7 +44,6 @@ extern void __init chrdev_init(void); /* * fs_context.c */ -extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); @@ -249,6 +248,7 @@ extern void mnt_pin_kill(struct mount *m); */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); +struct file *open_namespace_file(struct ns_common *ns); /* * fs/stat.c: diff --git a/fs/namespace.c b/fs/namespace.c index 53d1055c1825..1d2089ffb6ab 100644 --- 
a/fs/namespace.c +++ b/fs/namespace.c @@ -2796,6 +2796,9 @@ static inline void unlock_mount(struct pinned_mountpoint *m) __unlock_mount(m); } +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp); + #define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ do_lock_mount((path), &mp, (beneath)) @@ -2946,10 +2949,11 @@ static inline bool may_copy_tree(const struct path *path) return check_anonymous_mnt(mnt); } - -static struct mount *__do_loopback(const struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, + unsigned int flags, unsigned int copy_flags) { struct mount *old = real_mount(old_path->mnt); + bool recurse = flags & AT_RECURSIVE; if (IS_MNT_UNBINDABLE(old)) return ERR_PTR(-EINVAL); @@ -2960,10 +2964,22 @@ static struct mount *__do_loopback(const struct path *old_path, int recurse) if (!recurse && __has_locked_children(old, old_path->dentry)) return ERR_PTR(-EINVAL); + /* + * When creating a new mount namespace we don't want to copy over + * mounts of mount namespaces to avoid the risk of cycles and also to + * minimize the default complex interdependencies between mount + * namespaces. + * + * We could ofc just check whether all mount namespace files aren't + * creating cycles but really let's keep this simple. + */ + if (!(flags & OPEN_TREE_NAMESPACE)) + copy_flags |= CL_COPY_MNT_NS_FILE; + if (recurse) - return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); - else - return clone_mnt(old, old_path->dentry, 0); + return copy_tree(old, old_path->dentry, copy_flags); + + return clone_mnt(old, old_path->dentry, copy_flags); } /* @@ -2974,7 +2990,9 @@ static int do_loopback(const struct path *path, const char *old_name, { struct path old_path __free(path_put) = {}; struct mount *mnt = NULL; + unsigned int flags = recurse ? 
AT_RECURSIVE : 0; int err; + if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); @@ -2991,7 +3009,7 @@ static int do_loopback(const struct path *path, const char *old_name, if (!check_mnt(mp.parent)) return -EINVAL; - mnt = __do_loopback(&old_path, recurse); + mnt = __do_loopback(&old_path, flags, 0); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -3004,7 +3022,7 @@ static int do_loopback(const struct path *path, const char *old_name, return err; } -static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; @@ -3029,7 +3047,7 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec ns->seq_origin = src_mnt_ns->ns.ns_id; } - mnt = __do_loopback(path, recursive); + mnt = __do_loopback(path, flags, 0); if (IS_ERR(mnt)) { emptied_ns = ns; return ERR_CAST(mnt); @@ -3043,9 +3061,9 @@ static struct mnt_namespace *get_detached_copy(const struct path *path, bool rec return ns; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct file *open_detached_copy(struct path *path, unsigned int flags) { - struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct mnt_namespace *ns = get_detached_copy(path, flags); struct file *file; if (IS_ERR(ns)) @@ -3061,21 +3079,122 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } +DEFINE_FREE(put_empty_mnt_ns, struct mnt_namespace *, + if (!IS_ERR_OR_NULL(_T)) free_mnt_ns(_T)) + +static struct mnt_namespace *create_new_namespace(struct path *path, unsigned int flags) +{ + struct mnt_namespace *new_ns __free(put_empty_mnt_ns) = NULL; + struct path to_path __free(path_put) = {}; + struct mnt_namespace *ns = 
current->nsproxy->mnt_ns; + struct user_namespace *user_ns = current_user_ns(); + struct mount *new_ns_root; + struct mount *mnt; + unsigned int copy_flags = 0; + bool locked = false; + + if (user_ns != ns->user_ns) + copy_flags |= CL_SLAVE; + + new_ns = alloc_mnt_ns(user_ns, false); + if (IS_ERR(new_ns)) + return ERR_CAST(new_ns); + + scoped_guard(namespace_excl) { + new_ns_root = clone_mnt(ns->root, ns->root->mnt.mnt_root, copy_flags); + if (IS_ERR(new_ns_root)) + return ERR_CAST(new_ns_root); + + /* + * If the real rootfs had a locked mount on top of it somewhere + * in the stack, lock the new mount tree as well so it can't be + * exposed. + */ + mnt = ns->root; + while (mnt->overmount) { + mnt = mnt->overmount; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + locked = true; + } + } + + /* + * We dropped the namespace semaphore so we can actually lock + * the copy for mounting. The copied mount isn't attached to any + * mount namespace and it is thus excluded from any propagation. + * So realistically we're isolated and the mount can't be + * overmounted. + */ + + /* Borrow the reference from clone_mnt(). */ + to_path.mnt = &new_ns_root->mnt; + to_path.dentry = dget(new_ns_root->mnt.mnt_root); + + /* Now lock for actual mounting. */ + LOCK_MOUNT_EXACT(mp, &to_path); + if (unlikely(IS_ERR(mp.parent))) + return ERR_CAST(mp.parent); + + /* + * We don't emulate unshare()ing a mount namespace. We stick to the + * restrictions of creating detached bind-mounts. It has a lot + * saner and simpler semantics. + */ + mnt = __do_loopback(path, flags, copy_flags); + if (IS_ERR(mnt)) + return ERR_CAST(mnt); + + scoped_guard(mount_writer) { + if (locked) + mnt->mnt.mnt_flags |= MNT_LOCKED; + /* + * Now mount the detached tree on top of the copy of the + * real rootfs we created. + */ + attach_mnt(mnt, new_ns_root, mp.mp); + if (user_ns != ns->user_ns) + lock_mnt_tree(new_ns_root); + } + + /* Add all mounts to the new namespace. 
*/ + for (struct mount *p = new_ns_root; p; p = next_mnt(p, new_ns_root)) { + mnt_add_to_ns(new_ns, p); + new_ns->nr_mounts++; + } + + new_ns->root = real_mount(no_free_ptr(to_path.mnt)); + ns_tree_add_raw(new_ns); + return no_free_ptr(new_ns); +} + +static struct file *open_new_namespace(struct path *path, unsigned int flags) +{ + struct mnt_namespace *new_ns; + + new_ns = create_new_namespace(path, flags); + if (IS_ERR(new_ns)) + return ERR_CAST(new_ns); + return open_namespace_file(to_ns_common(new_ns)); +} + static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { int ret; struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - bool detached = flags & OPEN_TREE_CLONE; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | - OPEN_TREE_CLOEXEC)) + OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE)) return ERR_PTR(-EINVAL); - if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) == + AT_RECURSIVE) + return ERR_PTR(-EINVAL); + + if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1) return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) @@ -3085,15 +3204,27 @@ static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; - if (detached && !may_mount()) + /* + * If we create a new mount namespace with the cloned mount tree we + * just care about being privileged over our current user namespace. + * The new mount namespace will be owned by it. 
+ */ + if ((flags & OPEN_TREE_NAMESPACE) && + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if ((flags & OPEN_TREE_CLONE) && !may_mount()) return ERR_PTR(-EPERM); ret = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(ret)) return ERR_PTR(ret); - if (detached) - return open_detached_copy(&path, flags & AT_RECURSIVE); + if (flags & OPEN_TREE_NAMESPACE) + return open_new_namespace(&path, flags); + + if (flags & OPEN_TREE_CLONE) + return open_detached_copy(&path, flags); return dentry_open(&path, O_PATH, current_cred()); } @@ -5554,31 +5685,49 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) /* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, - struct mnt_namespace *ns) + struct file *mnt_file, struct mnt_namespace *ns) { - struct mount *m; int err; - /* Has the namespace already been emptied? */ - if (mnt_ns_id && mnt_ns_empty(ns)) - return -ENOENT; + if (mnt_file) { + WARN_ON_ONCE(ns != NULL); - s->mnt = lookup_mnt_in_ns(mnt_id, ns); - if (!s->mnt) - return -ENOENT; + s->mnt = mnt_file->f_path.mnt; + ns = real_mount(s->mnt)->mnt_ns; + if (!ns) + /* + * We can't set mount point and mnt_ns_id since we don't have a + * ns for the mount. This can happen if the mount is unmounted + * with MNT_DETACH. + */ + s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID); + } else { + /* Has the namespace already been emptied? */ + if (mnt_ns_id && mnt_ns_empty(ns)) + return -ENOENT; - err = grab_requested_root(ns, &s->root); - if (err) - return err; + s->mnt = lookup_mnt_in_ns(mnt_id, ns); + if (!s->mnt) + return -ENOENT; + } - /* - * Don't trigger audit denials. We just want to determine what - * mounts to show users. 
- */ - m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) - return -EPERM; + if (ns) { + err = grab_requested_root(ns, &s->root); + if (err) + return err; + + if (!mnt_file) { + struct mount *m; + /* + * Don't trigger audit denials. We just want to determine what + * mounts to show users. + */ + m = real_mount(s->mnt); + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + } + } err = security_sb_statfs(s->mnt->mnt_root); if (err) @@ -5700,7 +5849,7 @@ static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, } static int copy_mnt_id_req(const struct mnt_id_req __user *req, - struct mnt_id_req *kreq) + struct mnt_id_req *kreq, unsigned int flags) { int ret; size_t usize; @@ -5718,11 +5867,17 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); if (ret) return ret; - if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) - return -EINVAL; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; + + if (flags & STATMOUNT_BY_FD) { + if (kreq->mnt_id || kreq->mnt_ns_id) + return -EINVAL; + } else { + if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) + return -EINVAL; + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + } return 0; } @@ -5769,25 +5924,33 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, { struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct kstatmount *ks __free(kfree) = NULL; + struct file *mnt_file __free(fput) = NULL; struct mnt_id_req kreq; /* We currently support retrieval of 3 strings. 
*/ + size_t seq_size = 3 * PATH_MAX; int ret; - if (flags) + if (flags & ~STATMOUNT_BY_FD) return -EINVAL; - ret = copy_mnt_id_req(req, &kreq); + ret = copy_mnt_id_req(req, &kreq, flags); if (ret) return ret; - ns = grab_requested_mnt_ns(&kreq); - if (IS_ERR(ns)) - return PTR_ERR(ns); + if (flags & STATMOUNT_BY_FD) { + mnt_file = fget_raw(kreq.mnt_fd); + if (!mnt_file) + return -EBADF; + /* do_statmount sets ns in case of STATMOUNT_BY_FD */ + } else { + ns = grab_requested_mnt_ns(&kreq); + if (IS_ERR(ns)) + return PTR_ERR(ns); - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) - return -ENOENT; + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + } ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); if (!ks) @@ -5799,7 +5962,7 @@ retry: return ret; scoped_guard(namespace_shared) - ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); + ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns); if (!ret) ret = copy_statmount_to_user(ks); @@ -5939,7 +6102,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) return -EFAULT; - ret = copy_mnt_id_req(req, &kreq); + ret = copy_mnt_id_req(req, &kreq, 0); if (ret) return ret; diff --git a/fs/nsfs.c b/fs/nsfs.c index bf27d5da91f1..db91de208645 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -99,6 +99,19 @@ int ns_get_path(struct path *path, struct task_struct *task, return ns_get_path_cb(path, ns_get_path_task, &args); } +struct file *open_namespace_file(struct ns_common *ns) +{ + struct path path __free(path_put) = {}; + int err; + + /* call first to consume reference */ + err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (err < 0) + return ERR_PTR(err); + + return dentry_open(&path, O_RDONLY, current_cred()); +} + /** * open_namespace - open a namespace * @ns: the namespace to open diff --git 
a/include/linux/fs.h b/include/linux/fs.h index 01f73f1eeb7b..ca31bc9308a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2282,8 +2282,6 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; - struct dentry *(*mount) (struct file_system_type *, int, - const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 97a8552d8f2b..fa7638b81246 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -97,7 +97,6 @@ struct super_operations { const void *owner); int (*unfreeze_fs)(struct super_block *sb); int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs); - int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin)(struct super_block *sb); int (*show_options)(struct seq_file *seq, struct dentry *dentry); diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 5d3f8c9e3a62..d9d86598d100 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -61,7 +61,8 @@ /* * open_tree() flags. 
*/ -#define OPEN_TREE_CLONE 1 /* Clone the target tree and attach the clone */ +#define OPEN_TREE_CLONE (1 << 0) /* Clone the target tree and attach the clone */ +#define OPEN_TREE_NAMESPACE (1 << 1) /* Clone the target tree into a new mount namespace */ #define OPEN_TREE_CLOEXEC O_CLOEXEC /* Close the file on execve() */ /* @@ -197,7 +198,10 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 mnt_ns_fd; + union { + __u32 mnt_ns_fd; + __u32 mnt_fd; + }; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; @@ -232,4 +236,9 @@ struct mnt_id_req { #define LSMT_ROOT 0xffffffffffffffff /* root mount */ #define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ +/* + * @flag bits for statmount(2) + */ +#define STATMOUNT_BY_FD 0x00000001U /* want mountinfo for given fd */ + #endif /* _UAPI_LINUX_MOUNT_H */ diff --git a/tools/testing/selftests/filesystems/open_tree_ns/.gitignore b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore new file mode 100644 index 000000000000..fb12b93fbcaa --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/.gitignore @@ -0,0 +1 @@ +open_tree_ns_test diff --git a/tools/testing/selftests/filesystems/open_tree_ns/Makefile b/tools/testing/selftests/filesystems/open_tree_ns/Makefile new file mode 100644 index 000000000000..73c03c4a7ef6 --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +TEST_GEN_PROGS := open_tree_ns_test + +CFLAGS := -Wall -Werror -g $(KHDR_INCLUDES) +LDLIBS := -lcap + +include ../../lib.mk + +$(OUTPUT)/open_tree_ns_test: open_tree_ns_test.c ../utils.c + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) diff --git a/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c new file mode 100644 index 000000000000..9711556280ae --- /dev/null +++ b/tools/testing/selftests/filesystems/open_tree_ns/open_tree_ns_test.c @@ -0,0 +1,1030 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Test for OPEN_TREE_NAMESPACE flag. + * + * Test that open_tree() with OPEN_TREE_NAMESPACE creates a new mount + * namespace containing the specified mount tree. + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../wrappers.h" +#include "../statmount/statmount.h" +#include "../utils.h" +#include "../../kselftest_harness.h" + +#ifndef OPEN_TREE_NAMESPACE +#define OPEN_TREE_NAMESPACE (1 << 1) +#endif + +static int get_mnt_ns_id(int fd, uint64_t *mnt_ns_id) +{ + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) + return -errno; + return 0; +} + +static int get_mnt_ns_id_from_path(const char *path, uint64_t *mnt_ns_id) +{ + int fd, ret; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -errno; + + ret = get_mnt_ns_id(fd, mnt_ns_id); + close(fd); + return ret; +} + +#define STATMOUNT_BUFSIZE (1 << 15) + +static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask) +{ + struct statmount *buf; + size_t bufsize = STATMOUNT_BUFSIZE; + int ret; + + for (;;) { + buf = malloc(bufsize); + if (!buf) + return NULL; + + ret = statmount(mnt_id, mnt_ns_id, mask, buf, bufsize, 0); + if (ret == 0) + return buf; + + free(buf); + if (errno != EOVERFLOW) + return NULL; + + bufsize <<= 1; + } +} + +static void log_mount(struct __test_metadata *_metadata, struct statmount *sm) +{ + const char *fs_type = ""; + const char *mnt_root = ""; + const char *mnt_point = ""; + + if (sm->mask & STATMOUNT_FS_TYPE) + fs_type = sm->str + sm->fs_type; + if (sm->mask & STATMOUNT_MNT_ROOT) + mnt_root = sm->str + sm->mnt_root; + if (sm->mask & STATMOUNT_MNT_POINT) + mnt_point = sm->str + sm->mnt_point; + + TH_LOG(" mnt_id: %llu, parent_id: %llu, fs_type: %s, root: %s, point: %s", + (unsigned long long)sm->mnt_id, + (unsigned long long)sm->mnt_parent_id, + fs_type, mnt_root, mnt_point); +} + +static void 
dump_mounts(struct __test_metadata *_metadata, uint64_t mnt_ns_id) +{ + uint64_t list[256]; + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) { + TH_LOG("listmount failed: %s", strerror(errno)); + return; + } + + TH_LOG("Mount namespace %llu contains %zd mount(s):", + (unsigned long long)mnt_ns_id, nr_mounts); + + for (ssize_t i = 0; i < nr_mounts; i++) { + struct statmount *sm; + + sm = statmount_alloc(list[i], mnt_ns_id, + STATMOUNT_MNT_BASIC | + STATMOUNT_FS_TYPE | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT); + if (!sm) { + TH_LOG(" [%zd] mnt_id %llu: statmount failed: %s", + i, (unsigned long long)list[i], strerror(errno)); + continue; + } + + log_mount(_metadata, sm); + free(sm); + } +} + +FIXTURE(open_tree_ns) +{ + int fd; + uint64_t current_ns_id; +}; + +FIXTURE_VARIANT(open_tree_ns) +{ + const char *path; + unsigned int flags; + bool expect_success; + bool expect_different_ns; + int min_mounts; +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, basic_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + /* + * The empty rootfs is hidden from listmount()/mountinfo, + * so we only see the bind mount on top of it. 
+ */ + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_root) +{ + .path = "/", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, subdir_proc) +{ + .path = "/proc", + .flags = OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_tmp) +{ + .path = "/tmp", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, recursive_run) +{ + .path = "/run", + .flags = OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = true, + .expect_different_ns = true, + .min_mounts = 1, +}; + +FIXTURE_VARIANT_ADD(open_tree_ns, invalid_recursive_alone) +{ + .path = "/", + .flags = AT_RECURSIVE | OPEN_TREE_CLOEXEC, + .expect_success = false, + .expect_different_ns = false, + .min_mounts = 0, +}; + +FIXTURE_SETUP(open_tree_ns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); + + /* Get current mount namespace ID for comparison */ + ret = get_mnt_ns_id_from_path("/proc/self/ns/mnt", &self->current_ns_id); + if (ret < 0) + SKIP(return, "Failed to get current mount namespace ID"); +} + +FIXTURE_TEARDOWN(open_tree_ns) +{ + if (self->fd >= 0) + 
close(self->fd); +} + +TEST_F(open_tree_ns, create_namespace) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int ret; + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + + if (!variant->expect_success) { + ASSERT_LT(self->fd, 0); + ASSERT_EQ(errno, EINVAL); + return; + } + + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Verify we can get the namespace ID */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + /* Verify it's a different namespace */ + if (variant->expect_different_ns) + ASSERT_NE(new_ns_id, self->current_ns_id); + + /* List mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("%m - listmount failed"); + } + + /* Verify minimum expected mounts */ + ASSERT_GE(nr_mounts, variant->min_mounts); + TH_LOG("Namespace contains %zd mounts", nr_mounts); +} + +TEST_F(open_tree_ns, setns_into_namespace) +{ + uint64_t new_ns_id; + pid_t pid; + int status; + int ret; + + /* Only test with basic flags */ + if (!(variant->flags & OPEN_TREE_NAMESPACE)) + SKIP(return, "setns test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, variant->path, variant->flags); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + /* Get namespace ID and dump all mounts */ + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + dump_mounts(_metadata, new_ns_id); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* Child: try to enter the namespace */ + if (setns(self->fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + ASSERT_EQ(WEXITSTATUS(status), 0); +} + +TEST_F(open_tree_ns, verify_mount_properties) +{ + struct statmount sm; + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t 
nr_mounts; + int ret; + + /* Only test with basic flags on root */ + if (variant->flags != (OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC) || + strcmp(variant->path, "/") != 0) + SKIP(return, "mount properties test only for basic / case"); + + self->fd = sys_open_tree(AT_FDCWD, "/", OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (self->fd < 0 && errno == EINVAL) + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + + ASSERT_GE(self->fd, 0); + + ret = get_mnt_ns_id(self->fd, &new_ns_id); + ASSERT_EQ(ret, 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 1); + + /* Get info about the root mount (the bind mount, rootfs is hidden) */ + ret = statmount(list[0], new_ns_id, 0, STATMOUNT_MNT_BASIC, &sm, sizeof(sm), 0); + ASSERT_EQ(ret, 0); + + ASSERT_NE(sm.mnt_id, sm.mnt_parent_id); + + TH_LOG("Root mount id: %llu, parent: %llu", + (unsigned long long)sm.mnt_id, + (unsigned long long)sm.mnt_parent_id); +} + +FIXTURE(open_tree_ns_caps) +{ + bool has_caps; +}; + +FIXTURE_SETUP(open_tree_ns_caps) +{ + int ret; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + self->has_caps = (geteuid() == 0); +} + +FIXTURE_TEARDOWN(open_tree_ns_caps) +{ +} + +TEST_F(open_tree_ns_caps, requires_cap_sys_admin) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int fd; + + /* Child: drop privileges using utils.h helper */ + if (enter_userns() != 0) + _exit(2); + + /* Drop all caps using utils.h helper */ + if (caps_down() == 0) + _exit(3); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd >= 0) { + close(fd); + /* Should have failed without caps */ + _exit(1); + } + + if (errno == EPERM) + _exit(0); + + /* EINVAL means OPEN_TREE_NAMESPACE not supported */ + if (errno == EINVAL) + _exit(4); + + /* Unexpected error */ + _exit(5); + } + + 
ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Expected: EPERM without caps */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("OPEN_TREE_NAMESPACE succeeded without caps"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 3: + SKIP(return, "caps_down failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_userns) +{ + int fd; +}; + +FIXTURE_SETUP(open_tree_ns_userns) +{ + int ret; + + self->fd = -1; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Check if statmount/listmount are supported */ + ret = statmount(0, 0, 0, 0, NULL, 0, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "statmount() syscall not supported"); +} + +FIXTURE_TEARDOWN(open_tree_ns_userns) +{ + if (self->fd >= 0) + close(self->fd); +} + +TEST_F(open_tree_ns_userns, create_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace (the mount namespace is left unchanged) */ + if (enter_userns() != 0) + _exit(2); + + /* Now we have CAP_SYS_ADMIN in the user namespace */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); /* OPEN_TREE_NAMESPACE not supported */ + _exit(1); + } + + /* Verify we can get the namespace ID */ + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Verify we can list mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Should have at least 1 mount */ + if 
(nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, setns_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + int fd; + pid_t inner_pid; + int inner_status; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Fork again to test setns into the new namespace */ + inner_pid = fork(); + if (inner_pid < 0) + _exit(8); + + if (inner_pid == 0) { + /* Inner child: enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(1); + _exit(0); + } + + if (waitpid(inner_pid, &inner_status, 0) != inner_pid) + _exit(9); + + if (!WIFEXITED(inner_status) || WEXITSTATUS(inner_status) != 0) + _exit(10); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree or setns failed in userns"); + break; + case 2: + SKIP(return, "setup_userns 
failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("Inner fork failed"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("Inner waitpid failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, recursive_in_userns) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + /* Test recursive flag in userns */ + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + if (nr_mounts < 0) + _exit(6); + + /* Recursive should copy submounts too */ + if (nr_mounts < 1) + _exit(7); + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + /* Success */ + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE|AT_RECURSIVE) failed in userns"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("listmount failed in new namespace"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("New namespace has no mounts"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + 
WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_fails_einval) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + /* Create new user namespace */ + if (enter_userns() != 0) + _exit(2); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) == 0) { + free(sm); + _exit(7); + } + + if (errno != EINVAL) { + /* Wrong error */ + free(sm); + _exit(8); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount succeeded but should have failed with EINVAL"); + break; + case 8: + ASSERT_FALSE(true) TH_LOG("umount failed with wrong error 
(expected EINVAL)"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +TEST_F(open_tree_ns_userns, umount_succeeds) +{ + pid_t pid; + int status; + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + + if (unshare(CLONE_NEWNS)) + _exit(1); + + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) != 0) + _exit(1); + + fd = sys_open_tree(AT_FDCWD, "/", + OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + if (fd < 0) { + if (errno == EINVAL) + _exit(4); + _exit(1); + } + + if (get_mnt_ns_id(fd, &new_ns_id) != 0) + _exit(5); + + /* Get all mounts in the new namespace */ + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, LISTMOUNT_REVERSE); + if (nr_mounts < 0) + _exit(9); + + if (nr_mounts < 1) + _exit(10); + + /* Enter the new namespace */ + if (setns(fd, CLONE_NEWNS) < 0) + _exit(6); + + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, + STATMOUNT_MNT_POINT); + if (!sm) + _exit(11); + + mnt_point = sm->str + sm->mnt_point; + + TH_LOG("Trying to umount %s", mnt_point); + if (umount2(mnt_point, MNT_DETACH) != 0) { + free(sm); + _exit(7); + } + + free(sm); + } + + close(fd); + _exit(0); + } + + ASSERT_EQ(waitpid(pid, &status, 0), pid); + ASSERT_TRUE(WIFEXITED(status)); + + switch (WEXITSTATUS(status)) { + case 0: + break; + case 1: + ASSERT_FALSE(true) TH_LOG("open_tree(OPEN_TREE_NAMESPACE) failed"); + break; + case 2: + SKIP(return, "setup_userns failed"); + break; + case 4: + SKIP(return, "OPEN_TREE_NAMESPACE not supported"); + break; + case 5: + ASSERT_FALSE(true) TH_LOG("Failed 
to get mount namespace ID"); + break; + case 6: + ASSERT_FALSE(true) TH_LOG("setns into new namespace failed"); + break; + case 7: + ASSERT_FALSE(true) TH_LOG("umount failed but should have succeeded"); + break; + case 9: + ASSERT_FALSE(true) TH_LOG("listmount failed"); + break; + case 10: + ASSERT_FALSE(true) TH_LOG("No mounts in new namespace"); + break; + case 11: + ASSERT_FALSE(true) TH_LOG("statmount_alloc failed"); + break; + default: + ASSERT_FALSE(true) TH_LOG("Unexpected error in child (exit %d)", + WEXITSTATUS(status)); + break; + } +} + +FIXTURE(open_tree_ns_unbindable) +{ + char tmpdir[PATH_MAX]; + bool mounted; +}; + +FIXTURE_SETUP(open_tree_ns_unbindable) +{ + int ret; + + self->mounted = false; + + /* Check if open_tree syscall is supported */ + ret = sys_open_tree(-1, NULL, 0); + if (ret == -1 && errno == ENOSYS) + SKIP(return, "open_tree() syscall not supported"); + + /* Create a temporary directory for the test mount */ + snprintf(self->tmpdir, sizeof(self->tmpdir), + "/tmp/open_tree_ns_test.XXXXXX"); + ASSERT_NE(mkdtemp(self->tmpdir), NULL); + + /* Mount tmpfs there */ + ret = mount("tmpfs", self->tmpdir, "tmpfs", 0, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to mount tmpfs"); + } + self->mounted = true; + + ret = mount(NULL, self->tmpdir, NULL, MS_UNBINDABLE, NULL); + if (ret < 0) { + rmdir(self->tmpdir); + SKIP(return, "Failed to make tmpfs unbindable"); + } +} + +FIXTURE_TEARDOWN(open_tree_ns_unbindable) +{ + if (self->mounted) + umount2(self->tmpdir, MNT_DETACH); + rmdir(self->tmpdir); +} + +TEST_F(open_tree_ns_unbindable, fails_on_unbindable) +{ + int fd; + + fd = sys_open_tree(AT_FDCWD, self->tmpdir, + OPEN_TREE_NAMESPACE | OPEN_TREE_CLOEXEC); + ASSERT_LT(fd, 0); +} + +TEST_F(open_tree_ns_unbindable, recursive_skips_on_unbindable) +{ + uint64_t new_ns_id; + uint64_t list[256]; + ssize_t nr_mounts; + int fd; + ssize_t i; + bool found_unbindable = false; + + fd = sys_open_tree(AT_FDCWD, "/", + 
OPEN_TREE_NAMESPACE | AT_RECURSIVE | OPEN_TREE_CLOEXEC); + ASSERT_GT(fd, 0); + + ASSERT_EQ(get_mnt_ns_id(fd, &new_ns_id), 0); + + nr_mounts = listmount(LSMT_ROOT, new_ns_id, 0, list, 256, 0); + ASSERT_GE(nr_mounts, 0) { + TH_LOG("listmount failed: %m"); + } + + /* + * Iterate through all mounts in the new namespace and verify + * the unbindable tmpfs mount was silently dropped. + */ + for (i = 0; i < nr_mounts; i++) { + struct statmount *sm; + const char *mnt_point; + + sm = statmount_alloc(list[i], new_ns_id, STATMOUNT_MNT_POINT); + ASSERT_NE(sm, NULL) { + TH_LOG("statmount_alloc failed for mnt_id %llu", + (unsigned long long)list[i]); + } + + mnt_point = sm->str + sm->mnt_point; + + if (strcmp(mnt_point, self->tmpdir) == 0) { + TH_LOG("Found unbindable mount at %s (should have been dropped)", + mnt_point); + found_unbindable = true; + } + + free(sm); + } + + ASSERT_FALSE(found_unbindable) { + TH_LOG("Unbindable mount at %s was not dropped", self->tmpdir); + } + + close(fd); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h index 99e5ad082fb1..e1cba4bfd8d9 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount.h +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -43,19 +43,24 @@ #endif #endif -static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, - struct statmount *buf, size_t bufsize, +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint32_t fd, + uint64_t mask, struct statmount *buf, size_t bufsize, unsigned int flags) { struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, .param = mask, }; - if (mnt_ns_id) { + if (flags & STATMOUNT_BY_FD) { req.size = MNT_ID_REQ_SIZE_VER1; - req.mnt_ns_id = mnt_ns_id; + req.mnt_fd = fd; + } else { + req.mnt_id = mnt_id; + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } } return 
syscall(__NR_statmount, &req, buf, bufsize, flags); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index 6e53430423d2..a04bcaace126 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -33,15 +33,24 @@ static const char *const known_fs[] = { "sysv", "tmpfs", "tracefs", "ubifs", "udf", "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) +static struct statmount *statmount_alloc(uint64_t mnt_id, int fd, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; - struct statmount *buf = NULL, *tmp = alloca(bufsize); + struct statmount *buf = NULL, *tmp = NULL; int tofree = 0; int ret; + if (flags & STATMOUNT_BY_FD && fd < 0) + return NULL; + + tmp = alloca(bufsize); + for (;;) { - ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); + if (flags & STATMOUNT_BY_FD) + ret = statmount(0, 0, (uint32_t) fd, mask, tmp, bufsize, flags); + else + ret = statmount(mnt_id, 0, 0, mask, tmp, bufsize, flags); + if (ret != -1) break; if (tofree) @@ -237,7 +246,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -263,7 +272,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -323,7 +332,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, 0, mask, 
&sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -375,7 +384,7 @@ static void test_statmount_mnt_point(void) { struct statmount *sm; - sm = statmount_alloc(root_id, STATMOUNT_MNT_POINT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_POINT, 0); if (!sm) { ksft_test_result_fail("statmount mount point: %s\n", strerror(errno)); @@ -405,7 +414,7 @@ static void test_statmount_mnt_root(void) assert(last_dir); last_dir++; - sm = statmount_alloc(root_id, STATMOUNT_MNT_ROOT, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_ROOT, 0); if (!sm) { ksft_test_result_fail("statmount mount root: %s\n", strerror(errno)); @@ -438,7 +447,7 @@ static void test_statmount_fs_type(void) const char *fs_type; const char *const *s; - sm = statmount_alloc(root_id, STATMOUNT_FS_TYPE, 0); + sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0); if (!sm) { ksft_test_result_fail("statmount fs type: %s\n", strerror(errno)); @@ -467,7 +476,7 @@ static void test_statmount_mnt_opts(void) char *line = NULL; size_t len = 0; - sm = statmount_alloc(root_id, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, + sm = statmount_alloc(root_id, 0, STATMOUNT_MNT_BASIC | STATMOUNT_MNT_OPTS, 0); if (!sm) { ksft_test_result_fail("statmount mnt opts: %s\n", @@ -557,7 +566,7 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) uint32_t start, i; int ret; - sm = statmount_alloc(root_id, mask, 0); + sm = statmount_alloc(root_id, 0, mask, 0); if (!sm) { ksft_test_result_fail("statmount %s: %s\n", name, strerror(errno)); @@ -586,14 +595,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, 0, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto 
out; } errno = 0; - ret = statmount(root_id, 0, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -658,6 +667,226 @@ static void test_listmount_tree(void) ksft_test_result_pass("listmount tree\n"); } +static void test_statmount_by_fd(void) +{ + struct statmount *sm = NULL; + char tmpdir[] = "/statmount.fd.XXXXXX"; + const char root[] = "/test"; + char subdir[PATH_MAX], tmproot[PATH_MAX]; + int fd; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + snprintf(tmproot, PATH_MAX, "%s/%s", tmpdir, "chroot"); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto err_tmpdir; + } + + if (mount(subdir, subdir, NULL, MS_BIND, 0)) { + ksft_perror("mount"); + goto err_subdir; + } + + if (mkdir(tmproot, 0755)) { + ksft_perror("mkdir"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_tmproot; + } + + if (chroot(tmproot)) { + ksft_perror("chroot"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_chroot; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_chroot; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in statmount\n"); + goto err_chroot; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto err_chroot; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount 
returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_chroot; + } + + if (chroot(".")) { + ksft_perror("chroot"); + goto out; + } + + free(sm); + sm = statmount_alloc(0, fd, STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd failed: %s\n", strerror(errno)); + goto err_fd; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_POINT)) { + ksft_test_result_fail("STATMOUNT_MNT_POINT not set in statmount\n"); + goto out; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in statmount\n"); + goto out; + } + + if (strcmp(subdir, sm->str + sm->mnt_point) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_point," + "statmount mnt_point: %s != %s\n", sm->str + sm->mnt_point, subdir); + goto out; + } + + if (strcmp(root, sm->str + sm->mnt_root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", sm->str + sm->mnt_root, root); + goto out; + } + + ksft_test_result_pass("statmount by fd\n"); + goto out; +err_chroot: + chroot("."); +out: + free(sm); +err_fd: + close(fd); +err_tmproot: + rmdir(tmproot); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + +static void test_statmount_by_fd_unmounted(void) +{ + const char root[] = "/test.unmounted"; + char tmpdir[] = "/statmount.fd.XXXXXX"; + char subdir[PATH_MAX]; + int fd; + struct statmount *sm = NULL; + + if (!mkdtemp(tmpdir)) { + ksft_perror("mkdtemp"); + return; + } + + if (mount("statmount.test", tmpdir, "tmpfs", 0, NULL)) { + ksft_perror("mount"); + rmdir(tmpdir); + return; + } + + snprintf(subdir, PATH_MAX, "%s%s", tmpdir, root); + + if (mkdir(subdir, 0755)) { + ksft_perror("mkdir"); + goto 
err_tmpdir; + } + + if (mount(subdir, subdir, 0, MS_BIND, NULL)) { + ksft_perror("mount"); + goto err_subdir; + } + + fd = open(subdir, O_PATH); + if (fd < 0) { + ksft_perror("open"); + goto err_subdir; + } + + if (umount2(tmpdir, MNT_DETACH)) { + ksft_perror("umount2"); + goto err_fd; + } + + sm = statmount_alloc(0, fd, STATMOUNT_MNT_POINT | STATMOUNT_MNT_ROOT, STATMOUNT_BY_FD); + if (!sm) { + ksft_test_result_fail("statmount by fd unmounted: %s\n", + strerror(errno)); + goto err_sm; + } + + if (sm->size < sizeof(*sm)) { + ksft_test_result_fail("unexpected size: %u < %u\n", + sm->size, (uint32_t) sizeof(*sm)); + goto err_sm; + } + + if (sm->mask & STATMOUNT_MNT_POINT) { + ksft_test_result_fail("STATMOUNT_MNT_POINT unexpectedly set in mask\n"); + goto err_sm; + } + + if (!(sm->mask & STATMOUNT_MNT_ROOT)) { + ksft_test_result_fail("STATMOUNT_MNT_ROOT not set in mask\n"); + goto err_sm; + } + + if (strcmp(sm->str + sm->mnt_root, root) != 0) { + ksft_test_result_fail("statmount returned incorrect mnt_root," + "statmount mnt_root: %s != %s\n", + sm->str + sm->mnt_root, root); + goto err_sm; + } + + ksft_test_result_pass("statmount by fd on unmounted mount\n"); +err_sm: + free(sm); +err_fd: + close(fd); +err_subdir: + umount2(subdir, MNT_DETACH); + rmdir(subdir); +err_tmpdir: + umount2(tmpdir, MNT_DETACH); + rmdir(tmpdir); +} + #define str_off(memb) (offsetof(struct statmount, memb) / sizeof(uint32_t)) int main(void) @@ -669,14 +898,14 @@ int main(void) ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); setup_namespace(); - ksft_set_plan(15); + ksft_set_plan(17); test_listmount_empty_root(); test_statmount_zero_mask(); test_statmount_mnt_basic(); @@ -693,6 +922,8 @@ int main(void) test_statmount_string(all_mask, str_off(fs_type), "fs type & all"); test_listmount_tree(); + test_statmount_by_fd_unmounted(); + 
test_statmount_by_fd(); if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c index d56d4103182f..063d9de46431 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -102,7 +102,7 @@ static int _test_statmount_mnt_ns_id(void) if (!root_id) return NSID_ERROR; - ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret == -1) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); return NSID_ERROR; @@ -128,6 +128,98 @@ static int _test_statmount_mnt_ns_id(void) return NSID_PASS; } +static int _test_statmount_mnt_ns_id_by_fd(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + int ret, fd, mounted = 1, status = NSID_ERROR; + char mnt[] = "/statmount.fd.XXXXXX"; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + if (!mkdtemp(mnt)) { + ksft_print_msg("statmount by fd mnt ns id mkdtemp: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (mount(mnt, mnt, NULL, MS_BIND, 0)) { + ksft_print_msg("statmount by fd mnt ns id mount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto err; + } + + fd = open(mnt, O_PATH); + if (fd < 0) { + ksft_print_msg("statmount by fd mnt ns id open: %s\n", strerror(errno)); + goto err; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id 
unavailable\n"); + status = NSID_SKIP; + goto out; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + status = NSID_FAIL; + goto out; + } + + mounted = 0; + if (umount2(mnt, MNT_DETACH)) { + ksft_print_msg("statmount by fd mnt ns id umount2: %s\n", strerror(errno)); + goto out; + } + + ret = statmount(0, 0, fd, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), STATMOUNT_BY_FD); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id statmount: %s\n", strerror(errno)); + status = NSID_ERROR; + goto out; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + status = NSID_FAIL; + goto out; + } + + if (sm.mask == STATMOUNT_MNT_NS_ID) { + ksft_print_msg("unexpected STATMOUNT_MNT_NS_ID in mask\n"); + status = NSID_FAIL; + goto out; + } + + status = NSID_PASS; +out: + close(fd); + if (mounted) + umount2(mnt, MNT_DETACH); +err: + rmdir(mnt); + return status; +} + + static void test_statmount_mnt_ns_id(void) { pid_t pid; @@ -148,6 +240,9 @@ static void test_statmount_mnt_ns_id(void) if (ret != NSID_PASS) exit(ret); ret = _test_statmount_mnt_ns_id(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id_by_fd(); exit(ret); } @@ -179,7 +274,7 @@ static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) for (int i = 0; i < nr_mounts; i++) { struct statmount sm; - ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + ret = statmount(list[i], mnt_ns_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); if (ret < 0) { ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); @@ -275,7 +370,7 @@ int main(void) int ret; ksft_print_header(); - ret = statmount(0, 0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); diff --git 
a/tools/testing/selftests/filesystems/utils.c b/tools/testing/selftests/filesystems/utils.c index c9dd5412b37b..d6f26f849053 100644 --- a/tools/testing/selftests/filesystems/utils.c +++ b/tools/testing/selftests/filesystems/utils.c @@ -515,6 +515,32 @@ int setup_userns(void) return 0; } +int enter_userns(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWUSER); + if (ret) + return ret; + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret) + return ret; + + return 0; +} + /* caps_down - lower all effective caps */ int caps_down(void) { diff --git a/tools/testing/selftests/filesystems/utils.h b/tools/testing/selftests/filesystems/utils.h index 70f7ccc607f4..0bccfed666a9 100644 --- a/tools/testing/selftests/filesystems/utils.h +++ b/tools/testing/selftests/filesystems/utils.h @@ -28,6 +28,7 @@ extern int cap_down(cap_value_t down); extern bool switch_ids(uid_t uid, gid_t gid); extern int setup_userns(void); +extern int enter_userns(void); static inline bool switch_userns(int fd, uid_t uid, gid_t gid, bool drop_caps) {