Merge branch 'close-race-in-freeing-special-fields-and-map-value'

Kumar Kartikeya Dwivedi says:

====================
Close race in freeing special fields and map value

There exists a race across various map types where the freeing of
special fields (tw, timer, wq, kptr, etc.) can be done eagerly when a
logical delete operation is done on a map value, such that the program
which continues to have access to such a map value can recreate the
fields and cause them to leak.

This patch set contains fixes for this case. It is a continuation of Mykyta's
previous attempt in [0], but applies to all fields. A test is included
which reproduces the bug reliably in absence of the fixes.

Local Storage Benchmarks
------------------------
Evaluation Setup: Benchmarked on a dual-socket Intel Xeon Gold 6348 (Ice
Lake) @ 2.60GHz (56 cores / 112 threads), with the CPU governor set to
performance. Bench was pinned to a single NUMA node throughout the test.

Benchmark comes from [1] using the following command:
./bench -p 1 local-storage-create --storage-type <socket,task> --batch-size <16,32,64>

Before the test, 10 runs of all cases ([socket|task] x 3 batch sizes x 7
iterations per batch size) are done to warm up and prime the machine.

Then, 3 runs of all cases are done (with and without the patch, across
reboots).

For each comparison, we have 21 samples, i.e. per batch size (e.g.
socket 16) of a given local storage, we have 3 runs x 7 iterations.

The statistics (mean, median, stddev) and t-test is done for each
scenario (local storage and batch size pair) individually (21 samples
for either case). All values are for local storage creations in thousand
creations / sec (k/s).

	       Baseline (without patch)               With patch                       Delta
     Case      Median        Mean   Std. Dev.   Median        Mean   Std. Dev.   Median       %
---------------------------------------------------------------------------------------------------
socket 16     432.026     431.941    1.047     431.347     431.953    1.635      -0.679    -0.16%
socket 32     432.641     432.818    1.535     432.488     432.302    1.508      -0.153    -0.04%
socket 64     431.504     431.996    1.337     429.145     430.326    2.469      -2.359    -0.55%
  task 16      38.816      39.382    1.456      39.657      39.337    1.831      +0.841    +2.17%
  task 32      38.815      39.644    2.690      38.721      39.122    1.636      -0.094    -0.24%
  task 64      37.562      38.080    1.701      39.554      38.563    1.689      +1.992    +5.30%

The cases for socket are within the range of noise, and improvements in task
local storage are due to high variance (CV ~4%-6% across batch sizes). The only
statistically significant case worth mentioning is socket with batch size 64
with p-value from t-test < 0.05, but the absolute difference is small (~2k/s).

TL;DR there doesn't appear to be any significant regression or improvement.

  [0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com
  [1]: https://lore.kernel.org/bpf/20260205222916.1788211-1-ameryhung@gmail.com

Changelog:
----------
v2 -> v3
v2: https://lore.kernel.org/bpf/20260227052031.3988575-1-memxor@gmail.com

 * Add syzbot Tested-by.
 * Add Amery's Reviewed-by.
 * Fix missing rcu_dereference_check() in __bpf_selem_free_rcu. (BPF CI Bot)
 * Remove migrate_disable() in bpf_selem_free_rcu. (Alexei)

v1 -> v2
v1: https://lore.kernel.org/bpf/20260225185121.2057388-1-memxor@gmail.com

 * Add Paul's Reviewed-by.
 * Fix use-after-free in accessing bpf_mem_alloc embedded in map. (syzbot CI)
 * Add benchmark numbers for local storage.
 * Add extra test case for per-cpu hashmap coverage with up to 16 refcount leaks.
 * Target bpf tree.
====================

Link: https://patch.msgid.link/20260227224806.646888-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Alexei Starovoitov 2026-02-27 15:39:01 -08:00
commit 5263e30fff
15 changed files with 604 additions and 58 deletions

View file

@ -124,7 +124,7 @@ struct bpf_map_ops {
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
int (*map_check_btf)(const struct bpf_map *map,
int (*map_check_btf)(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type);
@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
map->ops->map_seq_show_elem;
}
int map_check_no_btf(const struct bpf_map *map,
int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type);

View file

@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
void bpf_local_storage_map_free(struct bpf_map *map,
struct bpf_local_storage_cache *cache);
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
int bpf_local_storage_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type);

View file

@ -14,6 +14,8 @@ struct bpf_mem_alloc {
struct obj_cgroup *objcg;
bool percpu;
struct work_struct work;
void (*dtor_ctx_free)(void *ctx);
void *dtor_ctx;
};
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
/* The percpu allocation with a specific unit size. */
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma,
void (*dtor)(void *obj, void *ctx),
void (*dtor_ctx_free)(void *ctx),
void *ctx);
/* Check the allocation size for kmalloc equivalent allocator */
int bpf_mem_alloc_check_size(bool percpu, size_t size);

View file

@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
return -EOPNOTSUPP;
}
static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *value_type)
{
return 0;

View file

@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
static int array_map_check_btf(const struct bpf_map *map,
static int array_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
}
static int bloom_map_check_btf(const struct bpf_map *map,
static int bloom_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
static int insn_array_check_btf(const struct bpf_map *map,
static int insn_array_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
/* If RCU Tasks Trace grace period implies RCU grace period, do
* kfree(), else do kfree_rcu().
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
if (rcu_trace_implies_rcu_gp())
kfree(local_storage);
else
kfree_rcu(local_storage, rcu);
kfree(local_storage);
}
/* Handle use_kmalloc_nolock == false */
@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
if (rcu_trace_implies_rcu_gp())
bpf_local_storage_free_rcu(rcu);
else
call_rcu(rcu, bpf_local_storage_free_rcu);
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
bpf_local_storage_free_rcu(rcu);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
@ -164,16 +163,29 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
/*
 * rcu callback for use_kmalloc_nolock == false: release any remaining
 * special fields in the map value before freeing the element itself.
 */
static void __bpf_selem_free_rcu(struct rcu_head *rcu)
{
	struct bpf_local_storage_elem *selem;
	struct bpf_local_storage_map *smap;

	selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
	/* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
	smap = rcu_dereference_check(SDATA(selem)->smap, 1);
	if (smap)
		bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
	kfree(selem);
}
/* rcu tasks trace callback for use_kmalloc_nolock == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
if (rcu_trace_implies_rcu_gp())
kfree(selem);
else
kfree_rcu(selem, rcu);
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
__bpf_selem_free_rcu(rcu);
}
/* Handle use_kmalloc_nolock == false */
@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
if (vanilla_rcu)
kfree_rcu(selem, rcu);
call_rcu(&selem->rcu, __bpf_selem_free_rcu);
else
call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}
@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
if (smap) {
migrate_disable();
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
migrate_enable();
}
kfree_nolock(selem);
}
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
if (rcu_trace_implies_rcu_gp())
bpf_selem_free_rcu(rcu);
else
call_rcu(rcu, bpf_selem_free_rcu);
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
bpf_selem_free_rcu(rcu);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
struct bpf_local_storage_map *smap;
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
if (!selem->use_kmalloc_nolock) {
/*
* No uptr will be unpin even when reuse_now == false since uptr
* is only supported in task local storage, where
* smap->use_kmalloc_nolock == true.
*/
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
int bpf_local_storage_map_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@ -958,10 +962,9 @@ restart:
*/
synchronize_rcu();
if (smap->use_kmalloc_nolock) {
rcu_barrier_tasks_trace();
rcu_barrier();
}
/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
rcu_barrier_tasks_trace();
rcu_barrier();
kvfree(smap->buckets);
bpf_map_area_free(smap);
}

View file

@ -125,6 +125,11 @@ struct htab_elem {
char key[] __aligned(8);
};
/*
 * Destructor context handed to bpf_mem_alloc: a duplicated copy of the
 * htab's BTF record plus the key size needed to locate the map value
 * inside an element.
 */
struct htab_btf_record {
	struct btf_record *record;	/* duplicated from map->record, owned here */
	u32 key_size;			/* offset from elem key to map value */
};
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
return 0;
}
/* Element destructor for non-percpu htabs: free special fields in the value. */
static void htab_mem_dtor(void *obj, void *ctx)
{
	struct htab_btf_record *hrec = ctx;
	struct htab_elem *elem = obj;
	void *map_value;

	if (IS_ERR_OR_NULL(hrec->record))
		return;
	map_value = htab_elem_value(elem, hrec->key_size);
	bpf_obj_free_fields(hrec->record, map_value);
}
/*
 * Element destructor for percpu htabs: the element stores a percpu pointer,
 * so free special fields in the value copy of every possible CPU.
 */
static void htab_pcpu_mem_dtor(void *obj, void *ctx)
{
	void __percpu *pptr = *(void __percpu **)obj;
	struct htab_btf_record *hrec = ctx;
	int cpu;

	if (IS_ERR_OR_NULL(hrec->record))
		return;
	for_each_possible_cpu(cpu)
		bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
}
/* Release the destructor context once the allocator is done with it. */
static void htab_dtor_ctx_free(void *ctx)
{
	struct htab_btf_record *hrec = ctx;

	btf_record_free(hrec->record);
	kfree(ctx);
}
/*
 * Install @dtor as the bpf_mem_alloc element destructor for @htab.
 *
 * The map's BTF record is duplicated so the destructor context can outlive
 * the map's own record during teardown. Returns 0 on success or a negative
 * errno (allocation / record duplication failure).
 */
static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
{
	u32 key_size = htab->map.key_size;
	struct bpf_mem_alloc *ma;
	struct htab_btf_record *hrec;
	int err;

	/* No need for dtors. */
	if (IS_ERR_OR_NULL(htab->map.record))
		return 0;

	hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
	if (!hrec)
		return -ENOMEM;
	hrec->key_size = key_size;
	hrec->record = btf_record_dup(htab->map.record);
	if (IS_ERR(hrec->record)) {
		err = PTR_ERR(hrec->record);
		kfree(hrec);
		return err;
	}
	/* percpu htabs allocate values from pcpu_ma, others from ma. */
	ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
	bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
	return 0;
}
/*
 * map_check_btf callback for hashtabs. Also used as the hook point to
 * install the element destructor for non-preallocated maps, since this is
 * the earliest point where map->record is valid.
 */
static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
			      const struct btf_type *key_type, const struct btf_type *value_type)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* Preallocated maps don't use bpf_mem_alloc, no dtor needed. */
	if (htab_is_prealloc(htab))
		return 0;
	/*
	 * We must set the dtor using this callback, as map's BTF record is not
	 * populated in htab_map_alloc(), so it will always appear as NULL.
	 */
	if (htab_is_percpu(htab))
		return htab_set_dtor(htab, htab_pcpu_mem_dtor);
	else
		return htab_set_dtor(htab, htab_mem_dtor);
}
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
.map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
.map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
.map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
.map_check_btf = htab_map_check_btf,
.map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],

View file

@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
}
static int cgroup_storage_check_btf(const struct bpf_map *map,
static int cgroup_storage_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -751,7 +751,7 @@ free_stack:
return err;
}
static int trie_check_btf(const struct bpf_map *map,
static int trie_check_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -102,6 +102,8 @@ struct bpf_mem_cache {
int percpu_size;
bool draining;
struct bpf_mem_cache *tgt;
void (*dtor)(void *obj, void *ctx);
void *dtor_ctx;
/* list of objects to be freed after RCU GP */
struct llist_head free_by_rcu;
@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
kfree(obj);
}
static int free_all(struct llist_node *llnode, bool percpu)
static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
{
struct llist_node *pos, *t;
int cnt = 0;
llist_for_each_safe(pos, t, llnode) {
if (c->dtor)
c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
free_one(pos, percpu);
cnt++;
}
@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head)
{
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}
@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
if (unlikely(READ_ONCE(c->draining))) {
llnode = llist_del_all(&c->free_by_rcu_ttrace);
free_all(llnode, !!c->percpu_size);
free_all(c, llnode, !!c->percpu_size);
}
return;
}
@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
dec_active(c, &flags);
if (unlikely(READ_ONCE(c->draining))) {
free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
atomic_set(&c->call_rcu_in_progress, 0);
} else {
call_rcu_hurry(&c->rcu, __free_by_rcu);
@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
* on these lists, so it is safe to use __llist_del_all().
*/
free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
free_all(__llist_del_all(&c->free_llist), percpu);
free_all(__llist_del_all(&c->free_llist_extra), percpu);
free_all(__llist_del_all(&c->free_by_rcu), percpu);
free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
free_all(llist_del_all(&c->waiting_for_gp), percpu);
free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
free_all(c, __llist_del_all(&c->free_llist), percpu);
free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
}
static void check_mem_cache(struct bpf_mem_cache *c)
@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
/* We can free dtor ctx only once all callbacks are done using it. */
if (ma->dtor_ctx_free)
ma->dtor_ctx_free(ma->dtor_ctx);
check_leaked_objs(ma);
free_percpu(ma->cache);
free_percpu(ma->caches);
@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
return 0;
}
/*
 * Register a per-object destructor @dtor (invoked with @ctx from free_all()
 * when objects are returned to the system) on every per-CPU cache of @ma.
 * @dtor_ctx_free releases @ctx when the allocator itself is destroyed.
 */
void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
			    void (*dtor_ctx_free)(void *ctx), void *ctx)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	ma->dtor_ctx_free = dtor_ctx_free;
	ma->dtor_ctx = ctx;
	/* Fixed-size allocator: one cache per CPU. */
	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			c->dtor = dtor;
			c->dtor_ctx = ctx;
		}
	}
	/* Size-classed allocator: NUM_CACHES caches per CPU. */
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				c->dtor = dtor;
				c->dtor_ctx = ctx;
			}
		}
	}
}

View file

@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
}
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
int map_check_no_btf(const struct bpf_map *map,
int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)

View file

@ -0,0 +1,218 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
#include <test_progs.h>
#include <network_helpers.h>
#include "map_kptr_race.skel.h"
/* Return the kernel-assigned map ID for @map_fd, or -1 on failure. */
static int get_map_id(int map_fd)
{
	struct bpf_map_info info = {};
	__u32 len = sizeof(info);

	if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info"))
		return -1;
	return info.id;
}
/*
 * Run the count_ref syscall prog and return the kfunc refcount it
 * observed, or -1 on error.
 */
static int read_refs(struct map_kptr_race *skel)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts);
	int ret;

	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts);
	if (!ASSERT_OK(ret, "count_ref run"))
		return -1;
	if (!ASSERT_OK(opts.retval, "count_ref retval"))
		return -1;
	return skel->bss->num_of_refs;
}
/*
 * Hashtab leak scenario: the test_htab_leak BPF prog plants a kptr into an
 * already-deleted map value. A second "watcher" skeleton traps
 * bpf_map_put()/htab_map_free() on the target map and, once the map has
 * been freed, verifies the kfunc refcount dropped back to its baseline (2),
 * i.e. the replanted kptr did not leak.
 */
static void test_htab_leak(void)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = &pkt_v4,
		.data_size_in = sizeof(pkt_v4),
		.repeat = 1,
	);
	struct map_kptr_race *skel, *watcher;
	int ret, map_id;

	skel = map_kptr_race__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_and_load"))
		return;
	/* Drive the update/delete/re-xchg sequence in the BPF prog. */
	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts);
	if (!ASSERT_OK(ret, "test_htab_leak run"))
		goto out_skel;
	if (!ASSERT_OK(opts.retval, "test_htab_leak retval"))
		goto out_skel;
	map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map));
	if (!ASSERT_GE(map_id, 0, "map_id"))
		goto out_skel;
	/* Second skeleton instance observes the first map's teardown. */
	watcher = map_kptr_race__open_and_load();
	if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
		goto out_skel;
	watcher->bss->target_map_id = map_id;
	watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
	if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
		goto out_watcher;
	watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
	if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
		goto out_watcher;
	/* Drop the last user reference, then wait for the actual free. */
	map_kptr_race__destroy(skel);
	skel = NULL;
	kern_sync_rcu();
	while (!READ_ONCE(watcher->bss->map_freed))
		sched_yield();
	ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
	/* Anything above the baseline of 2 would indicate a leaked ref. */
	ASSERT_EQ(read_refs(watcher), 2, "htab refcount");
out_watcher:
	map_kptr_race__destroy(watcher);
out_skel:
	map_kptr_race__destroy(skel);
}
/*
 * Per-CPU hashtab leak scenario: like test_htab_leak, but the BPF prog
 * plants kptrs into up to 16 per-CPU copies of a deleted value; the
 * watcher verifies all of them are released when the map is freed.
 */
static void test_percpu_htab_leak(void)
{
	LIBBPF_OPTS(bpf_test_run_opts, opts,
		.data_in = &pkt_v4,
		.data_size_in = sizeof(pkt_v4),
		.repeat = 1,
	);
	struct map_kptr_race *skel, *watcher;
	int ret, map_id;

	skel = map_kptr_race__open();
	if (!ASSERT_OK_PTR(skel, "open"))
		return;
	/* Cap the prog's per-CPU loop bound; its array holds at most 16 slots. */
	skel->rodata->nr_cpus = libbpf_num_possible_cpus();
	if (skel->rodata->nr_cpus > 16)
		skel->rodata->nr_cpus = 16;
	ret = map_kptr_race__load(skel);
	if (!ASSERT_OK(ret, "load"))
		goto out_skel;
	ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), &opts);
	if (!ASSERT_OK(ret, "test_percpu_htab_leak run"))
		goto out_skel;
	if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval"))
		goto out_skel;
	map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map));
	if (!ASSERT_GE(map_id, 0, "map_id"))
		goto out_skel;
	/* Second skeleton instance observes the first map's teardown. */
	watcher = map_kptr_race__open_and_load();
	if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
		goto out_skel;
	watcher->bss->target_map_id = map_id;
	watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
	if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
		goto out_watcher;
	watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
	if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
		goto out_watcher;
	/* Drop the last user reference, then wait for the actual free. */
	map_kptr_race__destroy(skel);
	skel = NULL;
	kern_sync_rcu();
	while (!READ_ONCE(watcher->bss->map_freed))
		sched_yield();
	ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
	/* Anything above the baseline of 2 would indicate a leaked ref. */
	ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount");
out_watcher:
	map_kptr_race__destroy(watcher);
out_skel:
	map_kptr_race__destroy(skel);
}
/*
 * Socket local storage leak scenario: a tracepoint prog plants a kptr into
 * a deleted sk_storage value during a TCP connect; the watcher verifies it
 * is released when the map is freed.
 */
static void test_sk_ls_leak(void)
{
	struct map_kptr_race *skel, *watcher;
	int listen_fd = -1, client_fd = -1, map_id;

	skel = map_kptr_race__open_and_load();
	if (!ASSERT_OK_PTR(skel, "open_and_load"))
		return;
	if (!ASSERT_OK(map_kptr_race__attach(skel), "attach"))
		goto out_skel;
	/* A TCP connect fires inet_sock_set_state, driving the BPF prog. */
	listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
	if (!ASSERT_GE(listen_fd, 0, "start_server"))
		goto out_skel;
	client_fd = connect_to_fd(listen_fd, 0);
	if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
		goto out_skel;
	if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done"))
		goto out_skel;
	/* Close the sockets so the storage element can be torn down. */
	close(client_fd);
	client_fd = -1;
	close(listen_fd);
	listen_fd = -1;
	map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map));
	if (!ASSERT_GE(map_id, 0, "map_id"))
		goto out_skel;
	/* Second skeleton instance observes the first map's teardown. */
	watcher = map_kptr_race__open_and_load();
	if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
		goto out_skel;
	watcher->bss->target_map_id = map_id;
	watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
	if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
		goto out_watcher;
	watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free);
	if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit"))
		goto out_watcher;
	/* Drop the last user reference, then wait for the actual free. */
	map_kptr_race__destroy(skel);
	skel = NULL;
	kern_sync_rcu();
	while (!READ_ONCE(watcher->bss->map_freed))
		sched_yield();
	ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
	/* Anything above the baseline of 2 would indicate a leaked ref. */
	ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount");
out_watcher:
	map_kptr_race__destroy(watcher);
out_skel:
	if (client_fd >= 0)
		close(client_fd);
	if (listen_fd >= 0)
		close(listen_fd);
	map_kptr_race__destroy(skel);
}
/* Entry point: run each leak scenario as a serial subtest. */
void serial_test_map_kptr_race(void)
{
	if (test__start_subtest("htab_leak"))
		test_htab_leak();
	if (test__start_subtest("percpu_htab_leak"))
		test_percpu_htab_leak();
	if (test__start_subtest("sk_ls_leak"))
		test_sk_ls_leak();
}

View file

@ -0,0 +1,197 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "../test_kmods/bpf_testmod_kfunc.h"
struct map_value {
struct prog_test_ref_kfunc __kptr *ref_ptr;
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct map_value);
__uint(max_entries, 1);
} race_hash_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct map_value);
__uint(max_entries, 1);
} race_percpu_hash_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct map_value);
} race_sk_ls_map SEC(".maps");
int num_of_refs;
int sk_ls_leak_done;
int target_map_id;
int map_freed;
const volatile int nr_cpus;
/*
 * Reproduce the race: stash a kptr in a hashtab value, delete the element
 * (special fields may be freed eagerly), then xchg a fresh kptr into the
 * still-accessible stale value. Without the kernel fix the second kptr
 * leaks on map free. Non-zero return values identify the failing step.
 */
SEC("tc")
int test_htab_leak(struct __sk_buff *skb)
{
	struct prog_test_ref_kfunc *p, *old;
	struct map_value val = {};
	struct map_value *v;
	int key = 0;

	if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY))
		return 1;
	v = bpf_map_lookup_elem(&race_hash_map, &key);
	if (!v)
		return 2;
	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
	if (!p)
		return 3;
	old = bpf_kptr_xchg(&v->ref_ptr, p);
	if (old)
		bpf_kfunc_call_test_release(old);
	/* Logical delete: v remains accessible to this prog. */
	bpf_map_delete_elem(&race_hash_map, &key);
	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
	if (!p)
		return 4;
	/* Plant a kptr into the already-deleted value. */
	old = bpf_kptr_xchg(&v->ref_ptr, p);
	if (old)
		bpf_kfunc_call_test_release(old);
	return 0;
}
/*
 * Acquire a kfunc reference and xchg it into @v->ref_ptr, releasing any
 * previous occupant. Returns 0 on success, 1 if acquire failed.
 */
static int fill_percpu_kptr(struct map_value *v)
{
	struct prog_test_ref_kfunc *p, *old;

	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
	if (!p)
		return 1;
	old = bpf_kptr_xchg(&v->ref_ptr, p);
	if (old)
		bpf_kfunc_call_test_release(old);
	return 0;
}
/*
 * Per-CPU variant of the race: look up every per-CPU copy of the value
 * (nr_cpus is capped at 16 by userspace), delete the element, then plant
 * a kptr into each stale per-CPU copy.
 */
SEC("tc")
int test_percpu_htab_leak(struct __sk_buff *skb)
{
	struct map_value *v, *arr[16] = {};
	struct map_value val = {};
	int key = 0;
	int err = 0;

	if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY))
		return 1;
	for (int i = 0; i < nr_cpus; i++) {
		v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i);
		if (!v)
			return 2;
		arr[i] = v;
	}
	/* Logical delete; the per-CPU copies remain accessible. */
	bpf_map_delete_elem(&race_percpu_hash_map, &key);
	for (int i = 0; i < nr_cpus; i++) {
		v = arr[i];
		err = fill_percpu_kptr(v);
		if (err)
			return 3;
	}
	return 0;
}
/*
 * Socket local storage variant: on the first TCP transition to SYN_SENT,
 * create a storage value, plant a kptr, delete the storage, then plant
 * another kptr into the stale value pointer.
 */
SEC("tp_btf/inet_sock_set_state")
int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate)
{
	struct prog_test_ref_kfunc *p, *old;
	struct map_value *v;

	if (newstate != BPF_TCP_SYN_SENT)
		return 0;
	/* Only run the scenario once. */
	if (sk_ls_leak_done)
		return 0;
	v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL,
			       BPF_SK_STORAGE_GET_F_CREATE);
	if (!v)
		return 0;
	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
	if (!p)
		return 0;
	old = bpf_kptr_xchg(&v->ref_ptr, p);
	if (old)
		bpf_kfunc_call_test_release(old);
	/* Logical delete; v remains accessible to this prog. */
	bpf_sk_storage_delete(&race_sk_ls_map, sk);
	p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
	if (!p)
		return 0;
	/* Plant a kptr into the already-deleted value. */
	old = bpf_kptr_xchg(&v->ref_ptr, p);
	if (old)
		bpf_kfunc_call_test_release(old);
	sk_ls_leak_done = 1;
	return 0;
}
/* Kernel address of the target map, latched at bpf_map_put() time. */
long target_map_ptr;

/* Match the target map by ID and remember its pointer for the free probes. */
SEC("fentry/bpf_map_put")
int BPF_PROG(map_put, struct bpf_map *map)
{
	if (target_map_id && map->id == (u32)target_map_id)
		target_map_ptr = (long)map;
	return 0;
}
/* Signal userspace once the target hashtab map has been fully freed. */
SEC("fexit/htab_map_free")
int BPF_PROG(htab_map_free, struct bpf_map *map)
{
	if (target_map_ptr && (long)map == target_map_ptr)
		map_freed = 1;
	return 0;
}
/* Signal userspace once the target sk_storage map has been fully freed. */
SEC("fexit/bpf_sk_storage_map_free")
int BPF_PROG(sk_map_free, struct bpf_map *map)
{
	if (target_map_ptr && (long)map == target_map_ptr)
		map_freed = 1;
	return 0;
}
/*
 * Acquire a temporary reference and record the kfunc object's current
 * refcount in num_of_refs so userspace can compare it against the
 * expected baseline. Returns 1 if the acquire failed.
 */
SEC("syscall")
int count_ref(void *ctx)
{
	struct prog_test_ref_kfunc *p;
	unsigned long arg = 0;

	p = bpf_kfunc_call_test_acquire(&arg);
	if (!p)
		return 1;
	num_of_refs = p->cnt.refs.counter;
	bpf_kfunc_call_test_release(p);
	return 0;
}
char _license[] SEC("license") = "GPL";