linux/kernel/nscommon.c
Christian Brauner 76b6f5dfb3
nstree: add listns()
Add a new listns() system call that allows userspace to iterate through
namespaces in the system. This provides a programmatic interface to
discover and inspect namespaces, enhancing existing namespace APIs.

Currently, there is no direct way for userspace to enumerate namespaces
in the system. Applications must resort to scanning /proc/<pid>/ns/
across all processes, which is:

1. Inefficient - requires iterating over all processes
2. Incomplete - misses inactive namespaces that aren't attached to any
   running process but are kept alive by file descriptors, bind mounts,
   or parent namespace references
3. Permission-heavy - requires access to /proc for many processes
4. Unstructured - provides no ordering or ownership information
5. Unfilterable - offers no filtering per namespace type, so callers
   must always iterate and check all namespaces

The list goes on. The listns() system call solves these problems by
providing direct kernel-level enumeration of namespaces. It is similar
to listmount() but obviously tailored to namespaces.

/*
 * @req: Pointer to struct ns_id_req specifying search parameters
 * @ns_ids: User buffer to receive namespace IDs
 * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return)
 * @flags: Reserved for future use (must be 0)
 */
ssize_t listns(const struct ns_id_req *req, u64 *ns_ids,
               size_t nr_ns_ids, unsigned int flags);

Returns:
- On success: Number of namespace IDs written to ns_ids
- On error: Negative error code

/*
 * @size: Structure size
 * @ns_id: Starting point for iteration; use 0 for first call, then
 *         use the last returned ID for subsequent calls to paginate
 * @ns_type: Bitmask of namespace types to include (from enum ns_type):
 *           0: Return all namespace types
 *           MNT_NS: Mount namespaces
 *           NET_NS: Network namespaces
 *           USER_NS: User namespaces
 *           etc. Can be OR'd together
 * @user_ns_id: Filter results to namespaces owned by this user namespace:
 *              0: Return all namespaces (subject to permission checks)
 *              LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace
 *              Other value: Namespaces owned by the specified user namespace ID
 */
struct ns_id_req {
        __u32 size;         /* sizeof(struct ns_id_req) */
        __u32 spare;        /* Reserved, must be 0 */
        __u64 ns_id;        /* Last seen namespace ID (for pagination) */
        __u32 ns_type;      /* Filter by namespace type(s) */
        __u32 spare2;       /* Reserved, must be 0 */
        __u64 user_ns_id;   /* Filter by owning user namespace */
};

Example 1: List all namespaces

void list_all_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,          /* Start from beginning */
        .ns_type = 0,        /* All types */
        .user_ns_id = 0,     /* All user namespaces */
    };
    uint64_t ids[100];
    ssize_t ret;

    printf("All namespaces in the system:\n");
    do {
        ret = listns(&req, ids, 100, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }

        for (ssize_t i = 0; i < ret; i++)
            printf("  Namespace ID: %llu\n", (unsigned long long)ids[i]);

        /* Continue from last seen ID */
        if (ret > 0)
            req.ns_id = ids[ret - 1];
    } while (ret == 100);  /* Buffer was full, more may exist */
}

Example 2: List network namespaces only

void list_network_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS,   /* Only network namespaces */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Network namespaces: %zd found\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  netns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 3: List namespaces owned by current user namespace

void list_owned_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,                      /* All types */
        .user_ns_id = LISTNS_CURRENT_USER, /* Current userns */
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    if (ret < 0) {
        perror("listns");
        return;
    }

    printf("Namespaces owned by my user namespace: %zd\n", ret);
    for (ssize_t i = 0; i < ret; i++)
        printf("  ns ID: %llu\n", (unsigned long long)ids[i]);
}

Example 4: List multiple namespace types

void list_network_and_mount_namespaces(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = NET_NS | MNT_NS,  /* Network and mount */
        .user_ns_id = 0,
    };
    uint64_t ids[100];
    ssize_t ret;

    ret = listns(&req, ids, 100, 0);
    printf("Network and mount namespaces: %zd found\n", ret);
}

Example 5: Pagination through large namespace sets

void list_all_with_pagination(void)
{
    struct ns_id_req req = {
        .size = sizeof(req),
        .ns_id = 0,
        .ns_type = 0,
        .user_ns_id = 0,
    };
    uint64_t ids[50];
    size_t total = 0;
    ssize_t ret;

    printf("Enumerating all namespaces with pagination:\n");

    while (1) {
        ret = listns(&req, ids, 50, 0);
        if (ret < 0) {
            perror("listns");
            break;
        }
        if (ret == 0)
            break;  /* No more namespaces */

        total += ret;
        printf("  Batch: %zd namespaces\n", ret);

        /* Last ID in this batch becomes start of next batch */
        req.ns_id = ids[ret - 1];

        if (ret < 50)
            break;  /* Partial batch = end of results */
    }

    printf("Total: %zu namespaces\n", total);
}

Permission Model

listns() respects namespace isolation and capabilities:

(1) Global listing (user_ns_id = 0):
    - Requires CAP_SYS_ADMIN in the namespace's owning user namespace
    - OR the namespace must be in the caller's namespace context (e.g.,
      a namespace the caller is currently using)
    - User namespaces additionally allow listing if the caller has
      CAP_SYS_ADMIN in that user namespace itself
(2) Owner-filtered listing (user_ns_id != 0):
    - Requires CAP_SYS_ADMIN in the specified owner user namespace
    - OR the namespace must be in the caller's namespace context
    - This allows unprivileged processes to enumerate namespaces they own
(3) Visibility:
    - Only "active" namespaces are listed
    - A namespace is active if it has a non-zero __ns_ref_active count
    - This includes namespaces used by running processes, held by open
      file descriptors, or kept active by bind mounts
    - Inactive namespaces (kept alive only by internal kernel
      references) are not visible via listns()

Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-11-03 17:41:18 +01:00

296 lines
8.5 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */
#include <linux/ns_common.h>
#include <linux/proc_ns.h>
#include <linux/user_namespace.h>
#include <linux/vfsdebug.h>
#ifdef CONFIG_DEBUG_VFS
/*
 * ns_debug - sanity-check that a namespace was given the right operations
 * @ns:  namespace being initialized; only @ns->ns_type is inspected
 * @ops: operations table the caller wants to attach to @ns
 *
 * For each known namespace type, warn once (via VFS_WARN_ON_ONCE) if @ops is
 * not the operations table belonging to that type. Every case except the
 * mount namespace one is compiled in only when the matching CONFIG_* option
 * is enabled. A type without a case falls out of the switch unchecked.
 */
static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops)
{
switch (ns->ns_type) {
#ifdef CONFIG_CGROUPS
case CLONE_NEWCGROUP:
VFS_WARN_ON_ONCE(ops != &cgroupns_operations);
break;
#endif
#ifdef CONFIG_IPC_NS
case CLONE_NEWIPC:
VFS_WARN_ON_ONCE(ops != &ipcns_operations);
break;
#endif
/* The mount namespace case is the only one without a config guard. */
case CLONE_NEWNS:
VFS_WARN_ON_ONCE(ops != &mntns_operations);
break;
#ifdef CONFIG_NET_NS
case CLONE_NEWNET:
VFS_WARN_ON_ONCE(ops != &netns_operations);
break;
#endif
#ifdef CONFIG_PID_NS
case CLONE_NEWPID:
VFS_WARN_ON_ONCE(ops != &pidns_operations);
break;
#endif
#ifdef CONFIG_TIME_NS
case CLONE_NEWTIME:
VFS_WARN_ON_ONCE(ops != &timens_operations);
break;
#endif
#ifdef CONFIG_USER_NS
case CLONE_NEWUSER:
VFS_WARN_ON_ONCE(ops != &userns_operations);
break;
#endif
#ifdef CONFIG_UTS_NS
case CLONE_NEWUTS:
VFS_WARN_ON_ONCE(ops != &utsns_operations);
break;
#endif
}
}
#endif
/**
 * __ns_common_init - initialize the common part of a namespace
 * @ns:      namespace structure to initialize
 * @ns_type: namespace type, stored in @ns->ns_type
 * @ops:     proc namespace operations, stored in @ns->ops
 * @inum:    inode number to use, or 0 to allocate one with proc_alloc_inum()
 *
 * Sets the regular reference count to 1, resets the namespace id, clears all
 * tree nodes and list heads, and assigns the inode number. The active
 * reference count is initialized on every successful path, including when
 * the caller supplies a preassigned @inum, so it never depends on whatever
 * the caller's allocation happened to contain.
 *
 * Return: 0 on success, or the error returned by proc_alloc_inum().
 */
int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum)
{
	int ret = 0;

	refcount_set(&ns->__ns_ref, 1);
	ns->stashed = NULL;
	ns->ops = ops;
	ns->ns_id = 0;
	ns->ns_type = ns_type;
	RB_CLEAR_NODE(&ns->ns_tree_node);
	RB_CLEAR_NODE(&ns->ns_unified_tree_node);
	RB_CLEAR_NODE(&ns->ns_owner_tree_node);
	INIT_LIST_HEAD(&ns->ns_list_node);
	INIT_LIST_HEAD(&ns->ns_unified_list_node);
	ns->ns_owner_tree = RB_ROOT;
	INIT_LIST_HEAD(&ns->ns_owner);
	INIT_LIST_HEAD(&ns->ns_owner_entry);

#ifdef CONFIG_DEBUG_VFS
	ns_debug(ns, ops);
#endif

	/* Use the caller's inode number if given, otherwise allocate one. */
	if (inum)
		ns->inum = inum;
	else
		ret = proc_alloc_inum(&ns->inum);
	if (ret)
		return ret;

	/*
	 * The active reference count starts at 0. It's incremented when the
	 * namespace enters active use (installed in nsproxy) and decremented
	 * when all active uses are gone. Initial namespaces are always
	 * active.
	 *
	 * This must run for preassigned inode numbers too; do not return
	 * early above.
	 */
	if (is_initial_namespace(ns))
		atomic_set(&ns->__ns_ref_active, 1);
	else
		atomic_set(&ns->__ns_ref_active, 0);

	return 0;
}
/**
 * __ns_common_free - release the inode number held by a namespace
 * @ns: namespace whose @ns->inum is handed back via proc_free_inum()
 */
void __ns_common_free(struct ns_common *ns)
{
	unsigned int inum = ns->inum;

	proc_free_inum(inum);
}
/**
 * ns_owner - look up the owning user namespace of a namespace
 * @ns: namespace to query
 *
 * Return: the owning user namespace as a struct ns_common, or NULL when
 * @ns has no operations, reports no owner, or is owned by init_user_ns.
 */
struct ns_common *__must_check ns_owner(struct ns_common *ns)
{
	struct user_namespace *userns;

	if (unlikely(!ns->ops))
		return NULL;

	VFS_WARN_ON_ONCE(!ns->ops->owner);
	userns = ns->ops->owner(ns);

	/* Only init_user_ns itself is expected to have no owner. */
	VFS_WARN_ON_ONCE(!userns && ns != to_ns_common(&init_user_ns));

	/*
	 * No owner at all, or the owner is init_user_ns, which is skipped
	 * because it is always active.
	 */
	if (!userns || userns == &init_user_ns)
		return NULL;

	return to_ns_common(userns);
}
/**
 * __ns_ref_active_get_owner - take an active reference on a namespace's owner
 * @ns: namespace whose owner, as reported by ns_owner(), gets the reference
 *
 * Does nothing when ns_owner() returns NULL. Warns once if the owner's
 * active count is negative after the increment.
 */
void __ns_ref_active_get_owner(struct ns_common *ns)
{
	struct ns_common *owner = ns_owner(ns);

	if (!owner)
		return;

	WARN_ON_ONCE(atomic_add_negative(1, &owner->__ns_ref_active));
}
/*
* The active reference count works by having each namespace that gets
* created take a single active reference on its owning user namespace.
* That single reference is only released once the child namespace's
* active count itself goes down.
*
* A regular namespace tree might look as follows:
* Legend:
* + : adding active reference
* - : dropping active reference
* x : always active (initial namespace)
*
*
* net_ns pid_ns
* \ /
* + +
* user_ns1 (2)
* |
* ipc_ns | uts_ns
* \ | /
* + + +
* user_ns2 (3)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If both net_ns and pid_ns put their last active reference on
* themselves it will cascade to user_ns1 dropping its own active
* reference and dropping one active reference on user_ns2:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* + - +
* user_ns2 (2)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* The iteration stops once we reach a namespace that still has active
* references.
*/
void __ns_ref_active_put_owner(struct ns_common *ns)
{
	struct ns_common *owner = ns_owner(ns);

	/*
	 * Drop one active reference per level of the ownership chain. Keep
	 * climbing only while the drop took that owner's count to zero; stop
	 * at the first owner that still has active references, or when
	 * ns_owner() reports no further owner.
	 */
	while (owner && atomic_dec_and_test(&owner->__ns_ref_active))
		owner = ns_owner(owner);
}
/*
* The active reference count works by having each namespace that gets
* created take a single active reference on its owning user namespace.
* That single reference is only released once the child namespace's
* active count itself goes down. This makes it possible to efficiently
* resurrect a namespace tree:
*
* A regular namespace tree might look as follows:
* Legend:
* + : adding active reference
* - : dropping active reference
* x : always active (initial namespace)
*
*
* net_ns pid_ns
* \ /
* + +
* user_ns1 (2)
* |
* ipc_ns | uts_ns
* \ | /
* + + +
* user_ns2 (3)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If both net_ns and pid_ns put their last active reference on
* themselves it will cascade to user_ns1 dropping its own active
* reference and dropping one active reference on user_ns2:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* + - +
* user_ns2 (2)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* Assume the whole tree is dead but all namespaces are still active:
*
* net_ns pid_ns
* \ /
* - -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* - - -
* user_ns2 (0)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()):
*
* net_ns pid_ns
* \ /
* + -
* user_ns1 (0)
* |
* ipc_ns | uts_ns
* \ | /
* - + -
* user_ns2 (0)
* |
* cgroup_ns | mnt_ns
* \ | /
* x x x
* init_user_ns (1)
*
* If net_ns had a zero reference count and we bumped it we also need to
* take another reference on its owning user namespace. Similarly, if
* pid_ns had a zero reference count it also needs to take another
* reference on its owning user namespace. So both net_ns and pid_ns
* will each have their own reference on the owning user namespace.
*
* If the owning user namespace user_ns1 had a zero reference count then
* it also needs to take another reference on its owning user namespace
* and so on.
*/
void __ns_ref_active_resurrect(struct ns_common *ns)
{
	struct ns_common *cur = ns;

	/*
	 * Bump the active count of @ns, then of each owner up the chain.
	 * A non-zero previous value means that namespace was already active
	 * and already holds its reference on its own owner, so the walk ends
	 * there. A previous value of zero means we just resurrected it and
	 * must take a reference on its owner as well.
	 */
	do {
		if (atomic_fetch_add(1, &cur->__ns_ref_active))
			return;
		cur = ns_owner(cur);
	} while (cur);
}