mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:24:45 +01:00
Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing the existing namespace APIs.
Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:
1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
running process but are kept alive by file descriptors, bind mounts,
or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. Unstructured - provides no ordering or ownership information
5. No filtering per namespace type: Must always iterate and check all
namespaces.
The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.
/*
* @req: Pointer to struct ns_id_req specifying search parameters
* @ns_ids: User buffer to receive namespace IDs
* @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
* @flags: Reserved for future use (must be 0)
*/
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
size_t nr_ns_ids, unsigned int flags);
Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code
/*
 * struct ns_id_req - search parameters passed to listns()
 *
 * @size: Structure size; callers set it to sizeof(struct ns_id_req)
 * @spare: Reserved, must be 0
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together
 * @spare2: Reserved, must be 0
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 */
struct ns_id_req {
__u32 size; /* sizeof(struct ns_id_req) */
__u32 spare; /* Reserved, must be 0 */
__u64 ns_id; /* Last seen namespace ID (for pagination) */
__u32 ns_type; /* Filter by namespace type(s) */
__u32 spare2; /* Reserved, must be 0 */
__u64 user_ns_id; /* Filter by owning user namespace */
};
Example 1: List all namespaces
/*
 * Print the ID of every namespace listns() reports, fetching them in
 * batches of NR_IDS and resuming each call after the last ID seen.
 */
void list_all_namespaces(void)
{
	enum { NR_IDS = 100 };
	uint64_t ids[NR_IDS];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,		/* Start from beginning */
		.ns_type = 0,		/* All types */
		.user_ns_id = 0,	/* All user namespaces */
	};
	ssize_t nr;

	printf("All namespaces in the system:\n");

	for (;;) {
		nr = listns(&req, ids, NR_IDS, 0);
		if (nr < 0) {
			perror("listns");
			return;
		}

		for (ssize_t idx = 0; idx < nr; idx++)
			printf(" Namespace ID: %llu\n", (unsigned long long)ids[idx]);

		/* A batch that did not fill the buffer is the last one. */
		if (nr != NR_IDS)
			return;

		/* Buffer was full, more may exist: continue after the last ID. */
		req.ns_id = ids[nr - 1];
	}
}
Example 2: List network namespaces only
/*
 * Fetch a single batch of up to NR_IDS network namespace IDs and print
 * how many were returned followed by each ID.
 */
void list_network_namespaces(void)
{
	enum { NR_IDS = 100 };
	uint64_t ids[NR_IDS];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS,	/* Only network namespaces */
		.user_ns_id = 0,
	};
	ssize_t nr = listns(&req, ids, NR_IDS, 0);

	if (nr < 0) {
		perror("listns");
		return;
	}

	printf("Network namespaces: %zd found\n", nr);

	ssize_t idx = 0;
	while (idx < nr) {
		printf(" netns ID: %llu\n", (unsigned long long)ids[idx]);
		idx++;
	}
}
Example 3: List namespaces owned by current user namespace
/*
 * Fetch a single batch of up to NR_IDS namespaces of any type, filtered
 * with LISTNS_CURRENT_USER, and print the count followed by each ID.
 */
void list_owned_namespaces(void)
{
	enum { NR_IDS = 100 };
	uint64_t ids[NR_IDS];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,				/* All types */
		.user_ns_id = LISTNS_CURRENT_USER,	/* Current userns */
	};
	ssize_t nr = listns(&req, ids, NR_IDS, 0);

	if (nr < 0) {
		perror("listns");
		return;
	}

	printf("Namespaces owned by my user namespace: %zd\n", nr);

	for (const uint64_t *id = ids; id < ids + nr; id++)
		printf(" ns ID: %llu\n", (unsigned long long)*id);
}
Example 4: List multiple namespace types
/*
 * Fetch a single batch of up to 100 namespaces that are either network
 * or mount namespaces and print how many were returned. The IDs
 * themselves are not printed.
 */
void list_network_and_mount_namespaces(void)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = NET_NS | MNT_NS,	/* Network and mount */
		.user_ns_id = 0,
	};
	uint64_t ids[100];
	ssize_t ret;

	ret = listns(&req, ids, 100, 0);
	/* Do not report a negative error return as a namespace count. */
	if (ret < 0) {
		perror("listns");
		return;
	}

	printf("Network and mount namespaces: %zd found\n", ret);
}
Example 5: Pagination through large namespace sets
/*
 * Enumerate all namespaces in batches of BATCH IDs, printing the size
 * of every batch and the running total once enumeration stops.
 */
void list_all_with_pagination(void)
{
	enum { BATCH = 50 };
	uint64_t ids[BATCH];
	struct ns_id_req req = {
		.size = sizeof(req),
		.ns_id = 0,
		.ns_type = 0,
		.user_ns_id = 0,
	};
	size_t total = 0;
	ssize_t nr;

	printf("Enumerating all namespaces with pagination:\n");

	do {
		nr = listns(&req, ids, BATCH, 0);
		if (nr < 0) {
			perror("listns");
			break;
		}
		if (nr == 0)
			break;	/* No more namespaces */

		total += nr;
		printf(" Batch: %zd namespaces\n", nr);

		/* Last ID in this batch becomes start of next batch */
		req.ns_id = ids[nr - 1];
	} while (nr >= BATCH);	/* Partial batch = end of results */

	printf("Total: %zu namespaces\n", total);
}
Permission Model
listns() respects namespace isolation and capabilities:
(1) Global listing (user_ns_id = 0):
- Requires CAP_SYS_ADMIN in the namespace's owning user namespace
- OR the namespace must be in the caller's namespace context (e.g.,
a namespace the caller is currently using)
- User namespaces additionally allow listing if the caller has
CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
- Requires CAP_SYS_ADMIN in the specified owner user namespace
- OR the namespace must be in the caller's namespace context
- This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
- Only "active" namespaces are listed
- A namespace is active if it has a non-zero __ns_ref_active count
- This includes namespaces used by running processes, held by open
file descriptors, or kept active by bind mounts
- Inactive namespaces (kept alive only by internal kernel
references) are not visible via listns()
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
296 lines
8.5 KiB
C
296 lines
8.5 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
|
|
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
|
|
|
|
#include <linux/ns_common.h>
|
|
#include <linux/proc_ns.h>
|
|
#include <linux/user_namespace.h>
|
|
#include <linux/vfsdebug.h>
|
|
|
|
#ifdef CONFIG_DEBUG_VFS
|
|
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
|
|
{
|
|
switch (ns->ns_type) {
|
|
#ifdef CONFIG_CGROUPS
|
|
case CLONE_NEWCGROUP:
|
|
VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
|
|
break;
|
|
#endif
|
|
#ifdef CONFIG_IPC_NS
|
|
case CLONE_NEWIPC:
|
|
VFS_WARN_ON_ONCE(ops != &ipcns_operations);
|
|
break;
|
|
#endif
|
|
case CLONE_NEWNS:
|
|
VFS_WARN_ON_ONCE(ops != &mntns_operations);
|
|
break;
|
|
#ifdef CONFIG_NET_NS
|
|
case CLONE_NEWNET:
|
|
VFS_WARN_ON_ONCE(ops != &netns_operations);
|
|
break;
|
|
#endif
|
|
#ifdef CONFIG_PID_NS
|
|
case CLONE_NEWPID:
|
|
VFS_WARN_ON_ONCE(ops != &pidns_operations);
|
|
break;
|
|
#endif
|
|
#ifdef CONFIG_TIME_NS
|
|
case CLONE_NEWTIME:
|
|
VFS_WARN_ON_ONCE(ops != &timens_operations);
|
|
break;
|
|
#endif
|
|
#ifdef CONFIG_USER_NS
|
|
case CLONE_NEWUSER:
|
|
VFS_WARN_ON_ONCE(ops != &userns_operations);
|
|
break;
|
|
#endif
|
|
#ifdef CONFIG_UTS_NS
|
|
case CLONE_NEWUTS:
|
|
VFS_WARN_ON_ONCE(ops != &utsns_operations);
|
|
break;
|
|
#endif
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
 * Initialize the common part of a namespace.
 *
 * @ns:      namespace to initialize
 * @ns_type: namespace type, stored in @ns->ns_type
 * @ops:     operations table, stored in @ns->ops
 * @inum:    preassigned inode number, or 0 to have one allocated via
 *           proc_alloc_inum()
 *
 * Sets the reference count to 1, clears the id and stashed pointer,
 * resets all tree nodes and list heads, assigns the inode number and
 * initializes the active reference count.
 *
 * Return: 0 on success, or the error returned by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	int ret = 0;

	refcount_set(&ns->__ns_ref, 1);
	ns->stashed = NULL;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->ns_type = ns_type;
	RB_CLEAR_NODE(&ns->ns_tree_node);
	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
	INIT_LIST_HEAD(&ns->ns_list_node);
	INIT_LIST_HEAD(&ns->ns_unified_list_node);
	ns->ns_owner_tree = RB_ROOT;
	INIT_LIST_HEAD(&ns->ns_owner);
	INIT_LIST_HEAD(&ns->ns_owner_entry);

#ifdef CONFIG_DEBUG_VFS
	ns_debug(ns, ops);
#endif

	/*
	 * A preassigned inode number only replaces the allocation step.
	 * Do not return early here: the active reference count below
	 * must be initialized on both paths.
	 */
	if (inum)
		ns->inum = inum;
	else
		ret = proc_alloc_inum(&ns->inum);
	if (ret)
		return ret;

	/*
	 * The active reference count starts at 0. It's incremented when
	 * the namespace enters active use (installed in nsproxy) and
	 * decremented when all active uses are gone. Initial namespaces
	 * are always active.
	 */
	if (is_initial_namespace(ns))
		atomic_set(&ns->__ns_ref_active, 1);
	else
		atomic_set(&ns->__ns_ref_active, 0);
	return 0;
}
|
|
|
|
void __ns_common_free(struct ns_common *ns)
|
|
{
|
|
proc_free_inum(ns->inum);
|
|
}
|
|
|
|
/*
 * Return the user namespace owning @ns, as a struct ns_common.
 *
 * Returns NULL when @ns has no operations table, when the ->owner()
 * callback reports no owner, or when the owner is init_user_ns.
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
	struct user_namespace *owner;

	if (unlikely(!ns->ops))
		return NULL;

	VFS_WARN_ON_ONCE(!ns->ops->owner);
	owner = ns->ops->owner(ns);
	/* Only init_user_ns itself is expected to have no owner. */
	VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns));

	/* No owner at all, or init_user_ns which is skipped as always active. */
	if (!owner || owner == &init_user_ns)
		return NULL;

	return to_ns_common(owner);
}
|
|
|
|
void __ns_ref_active_get_owner(struct ns_common *ns)
|
|
{
|
|
ns = ns_owner(ns);
|
|
if (ns)
|
|
WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active));
|
|
}
|
|
|
|
/*
|
|
* The active reference count works by having each namespace that gets
|
|
* created take a single active reference on its owning user namespace.
|
|
* That single reference is only released once the child namespace's
|
|
* active count itself goes down.
|
|
*
|
|
* A regular namespace tree might look as follows:
|
|
* Legend:
|
|
* + : adding active reference
|
|
* - : dropping active reference
|
|
* x : always active (initial namespace)
|
|
*
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* + +
|
|
* user_ns1 (2)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* + + +
|
|
* user_ns2 (3)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* If both net_ns and pid_ns put their last active reference on
|
|
* themselves it will cascade to user_ns1 dropping its own active
|
|
* reference and dropping one active reference on user_ns2:
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* - -
|
|
* user_ns1 (0)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* + - +
|
|
* user_ns2 (2)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* The iteration stops once we reach a namespace that still has active
|
|
* references.
|
|
*/
|
|
void __ns_ref_active_put_owner(struct ns_common *ns)
|
|
{
|
|
for (;;) {
|
|
ns = ns_owner(ns);
|
|
if (!ns)
|
|
return;
|
|
if (!atomic_dec_and_test(&ns->__ns_ref_active))
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The active reference count works by having each namespace that gets
|
|
* created take a single active reference on its owning user namespace.
|
|
* That single reference is only released once the child namespace's
|
|
* active count itself goes down. This makes it possible to efficiently
|
|
* resurrect a namespace tree:
|
|
*
|
|
* A regular namespace tree might look as follows:
|
|
* Legend:
|
|
* + : adding active reference
|
|
* - : dropping active reference
|
|
* x : always active (initial namespace)
|
|
*
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* + +
|
|
* user_ns1 (2)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* + + +
|
|
* user_ns2 (3)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* If both net_ns and pid_ns put their last active reference on
|
|
* themselves it will cascade to user_ns1 dropping its own active
|
|
* reference and dropping one active reference on user_ns2:
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* - -
|
|
* user_ns1 (0)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* + - +
|
|
* user_ns2 (2)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* Assume the whole tree is dead (inactive) but all namespaces still exist:
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* - -
|
|
* user_ns1 (0)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* - - -
|
|
* user_ns2 (0)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
|
|
*
|
|
* net_ns pid_ns
|
|
* \ /
|
|
* + -
|
|
* user_ns1 (0)
|
|
* |
|
|
* ipc_ns | uts_ns
|
|
* \ | /
|
|
* - + -
|
|
* user_ns2 (0)
|
|
* |
|
|
* cgroup_ns | mnt_ns
|
|
* \ | /
|
|
* x x x
|
|
* init_user_ns (1)
|
|
*
|
|
* If net_ns had a zero reference count and we bumped it we also need to
|
|
* take another reference on its owning user namespace. Similarly, if
|
|
* pid_ns had a zero reference count it also needs to take another
|
|
* reference on its owning user namespace. So both net_ns and pid_ns
|
|
* will each have their own reference on the owning user namespace.
|
|
*
|
|
* If the owning user namespace user_ns1 had a zero reference count then
|
|
* it also needs to take another reference on its owning user namespace
|
|
* and so on.
|
|
*/
|
|
void __ns_ref_active_resurrect(struct ns_common *ns)
|
|
{
|
|
/* If we didn't resurrect the namespace we're done. */
|
|
if (atomic_fetch_add(1, &ns->__ns_ref_active))
|
|
return;
|
|
|
|
/*
|
|
* We did resurrect it. Walk the ownership hierarchy upwards
|
|
* until we found an owning user namespace that is active.
|
|
*/
|
|
for (;;) {
|
|
ns = ns_owner(ns);
|
|
if (!ns)
|
|
return;
|
|
|
|
if (atomic_fetch_add(1, &ns->__ns_ref_active))
|
|
return;
|
|
}
|
|
}
|