mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:04:51 +01:00
Merge branch 'close-race-in-freeing-special-fields-and-map-value'
Kumar Kartikeya Dwivedi says:
====================
Close race in freeing special fields and map value
There exists a race across various map types where special fields
(task work (tw), timer, workqueue (wq), kptr, etc.) can be freed eagerly
when a logical delete operation is done on a map value. A program that
still has access to that map value can then recreate the fields, which
causes them to leak.
The set contains fixes for this case. It is a continuation of Mykyta's
previous attempt in [0], but applies to all fields. A test is included
which reproduces the bug reliably in the absence of the fixes.
Local Storage Benchmarks
------------------------
Evaluation Setup: Benchmarked on a dual-socket Intel Xeon Gold 6348 (Ice
Lake) @ 2.60GHz (56 cores / 112 threads), with the CPU governor set to
performance. Bench was pinned to a single NUMA node throughout the test.
Benchmark comes from [1] using the following command:
./bench -p 1 local-storage-create --storage-type <socket,task> --batch-size <16,32,64>
Before the test, 10 runs of all cases ([socket|task] x 3 batch sizes x 7
iterations per batch size) are done to warm up and prime the machine.
Then, 3 runs of all cases are done (with and without the patch, across
reboots).
For each comparison, we have 21 samples, i.e. per batch size (e.g.
socket 16) of a given local storage, we have 3 runs x 7 iterations.
The statistics (mean, median, stddev) and the t-test are computed for each
scenario (local storage and batch size pair) individually (21 samples
with the patch and 21 without). All values are for local storage creations
in thousand creations / sec (k/s).
           Baseline (without patch)         With patch                       Delta
Case       Median     Mean       Std. Dev.  Median     Mean       Std. Dev.  Median     %
---------------------------------------------------------------------------------------------------
socket 16  432.026    431.941    1.047      431.347    431.953    1.635      -0.679     -0.16%
socket 32  432.641    432.818    1.535      432.488    432.302    1.508      -0.153     -0.04%
socket 64  431.504    431.996    1.337      429.145    430.326    2.469      -2.359     -0.55%
task 16    38.816     39.382     1.456      39.657     39.337     1.831      +0.841     +2.17%
task 32    38.815     39.644     2.690      38.721     39.122     1.636      -0.094     -0.24%
task 64    37.562     38.080     1.701      39.554     38.563     1.689      +1.992     +5.30%
The socket cases are within the range of noise, and the improvements in task
local storage are due to high variance (CV ~4%-6% across batch sizes). The only
statistically significant case worth mentioning is socket with batch size 64
(t-test p-value < 0.05), but the absolute difference is small (~2k/s).
TL;DR there doesn't appear to be any significant regression or improvement.
[0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com
[1]: https://lore.kernel.org/bpf/20260205222916.1788211-1-ameryhung@gmail.com
Changelog:
----------
v2 -> v3
v2: https://lore.kernel.org/bpf/20260227052031.3988575-1-memxor@gmail.com
* Add syzbot Tested-by.
* Add Amery's Reviewed-by.
* Fix missing rcu_dereference_check() in __bpf_selem_free_rcu. (BPF CI Bot)
* Remove migrate_disable() in bpf_selem_free_rcu. (Alexei)
v1 -> v2
v1: https://lore.kernel.org/bpf/20260225185121.2057388-1-memxor@gmail.com
* Add Paul's Reviewed-by.
* Fix use-after-free in accessing bpf_mem_alloc embedded in map. (syzbot CI)
* Add benchmark numbers for local storage.
* Add extra test case for per-cpu hashmap coverage with up to 16 refcount leaks.
* Target bpf tree.
====================
Link: https://patch.msgid.link/20260227224806.646888-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
commit
5263e30fff
15 changed files with 604 additions and 58 deletions
|
|
@ -124,7 +124,7 @@ struct bpf_map_ops {
|
|||
u32 (*map_fd_sys_lookup_elem)(void *ptr);
|
||||
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
|
||||
struct seq_file *m);
|
||||
int (*map_check_btf)(const struct bpf_map *map,
|
||||
int (*map_check_btf)(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type);
|
||||
|
|
@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
|
|||
map->ops->map_seq_show_elem;
|
||||
}
|
||||
|
||||
int map_check_no_btf(const struct bpf_map *map,
|
||||
int map_check_no_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type);
|
||||
|
|
|
|||
|
|
@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage);
|
|||
void bpf_local_storage_map_free(struct bpf_map *map,
|
||||
struct bpf_local_storage_cache *cache);
|
||||
|
||||
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
|
||||
int bpf_local_storage_map_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type);
|
||||
|
|
|
|||
|
|
@ -14,6 +14,8 @@ struct bpf_mem_alloc {
|
|||
struct obj_cgroup *objcg;
|
||||
bool percpu;
|
||||
struct work_struct work;
|
||||
void (*dtor_ctx_free)(void *ctx);
|
||||
void *dtor_ctx;
|
||||
};
|
||||
|
||||
/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects.
|
||||
|
|
@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg
|
|||
/* The percpu allocation with a specific unit size. */
|
||||
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
|
||||
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
|
||||
void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma,
|
||||
void (*dtor)(void *obj, void *ctx),
|
||||
void (*dtor_ctx_free)(void *ctx),
|
||||
void *ctx);
|
||||
|
||||
/* Check the allocation size for kmalloc equivalent allocator */
|
||||
int bpf_mem_alloc_check_size(bool percpu, size_t size);
|
||||
|
|
|
|||
|
|
@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key,
|
|||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf,
|
||||
static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf,
|
||||
const struct btf_type *key_type, const struct btf_type *value_type)
|
||||
{
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
|
|||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int array_map_check_btf(const struct bpf_map *map,
|
||||
static int array_map_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
|
|
@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key,
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int bloom_map_check_btf(const struct bpf_map *map,
|
||||
static int bloom_map_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
|
|
@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int insn_array_check_btf(const struct bpf_map *map,
|
||||
static int insn_array_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
|
|
@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
|
|||
{
|
||||
struct bpf_local_storage *local_storage;
|
||||
|
||||
/* If RCU Tasks Trace grace period implies RCU grace period, do
|
||||
* kfree(), else do kfree_rcu().
|
||||
/*
|
||||
* RCU Tasks Trace grace period implies RCU grace period, do
|
||||
* kfree() directly.
|
||||
*/
|
||||
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
|
||||
if (rcu_trace_implies_rcu_gp())
|
||||
kfree(local_storage);
|
||||
else
|
||||
kfree_rcu(local_storage, rcu);
|
||||
kfree(local_storage);
|
||||
}
|
||||
|
||||
/* Handle use_kmalloc_nolock == false */
|
||||
|
|
@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
|
|||
|
||||
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
if (rcu_trace_implies_rcu_gp())
|
||||
bpf_local_storage_free_rcu(rcu);
|
||||
else
|
||||
call_rcu(rcu, bpf_local_storage_free_rcu);
|
||||
/*
|
||||
* RCU Tasks Trace grace period implies RCU grace period, do
|
||||
* kfree() directly.
|
||||
*/
|
||||
bpf_local_storage_free_rcu(rcu);
|
||||
}
|
||||
|
||||
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
|
||||
|
|
@ -164,16 +163,29 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
|
|||
bpf_local_storage_free_trace_rcu);
|
||||
}
|
||||
|
||||
/* rcu callback for use_kmalloc_nolock == false */
|
||||
static void __bpf_selem_free_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
struct bpf_local_storage_elem *selem;
|
||||
struct bpf_local_storage_map *smap;
|
||||
|
||||
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
|
||||
/* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
|
||||
|
||||
if (smap)
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
kfree(selem);
|
||||
}
|
||||
|
||||
/* rcu tasks trace callback for use_kmalloc_nolock == false */
|
||||
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
struct bpf_local_storage_elem *selem;
|
||||
|
||||
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
|
||||
if (rcu_trace_implies_rcu_gp())
|
||||
kfree(selem);
|
||||
else
|
||||
kfree_rcu(selem, rcu);
|
||||
/*
|
||||
* RCU Tasks Trace grace period implies RCU grace period, do
|
||||
* kfree() directly.
|
||||
*/
|
||||
__bpf_selem_free_rcu(rcu);
|
||||
}
|
||||
|
||||
/* Handle use_kmalloc_nolock == false */
|
||||
|
|
@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
|
|||
bool vanilla_rcu)
|
||||
{
|
||||
if (vanilla_rcu)
|
||||
kfree_rcu(selem, rcu);
|
||||
call_rcu(&selem->rcu, __bpf_selem_free_rcu);
|
||||
else
|
||||
call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
|
||||
}
|
||||
|
|
@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
|
|||
/* The bpf_local_storage_map_free will wait for rcu_barrier */
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
|
||||
|
||||
if (smap) {
|
||||
migrate_disable();
|
||||
if (smap)
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
migrate_enable();
|
||||
}
|
||||
kfree_nolock(selem);
|
||||
}
|
||||
|
||||
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
|
||||
{
|
||||
if (rcu_trace_implies_rcu_gp())
|
||||
bpf_selem_free_rcu(rcu);
|
||||
else
|
||||
call_rcu(rcu, bpf_selem_free_rcu);
|
||||
/*
|
||||
* RCU Tasks Trace grace period implies RCU grace period, do
|
||||
* kfree() directly.
|
||||
*/
|
||||
bpf_selem_free_rcu(rcu);
|
||||
}
|
||||
|
||||
void bpf_selem_free(struct bpf_local_storage_elem *selem,
|
||||
bool reuse_now)
|
||||
{
|
||||
struct bpf_local_storage_map *smap;
|
||||
|
||||
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
|
||||
|
||||
if (!selem->use_kmalloc_nolock) {
|
||||
/*
|
||||
* No uptr will be unpin even when reuse_now == false since uptr
|
||||
* is only supported in task local storage, where
|
||||
* smap->use_kmalloc_nolock == true.
|
||||
*/
|
||||
if (smap)
|
||||
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
|
||||
__bpf_selem_free(selem, reuse_now);
|
||||
return;
|
||||
}
|
||||
|
|
@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
|
||||
int bpf_local_storage_map_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
@ -958,10 +962,9 @@ restart:
|
|||
*/
|
||||
synchronize_rcu();
|
||||
|
||||
if (smap->use_kmalloc_nolock) {
|
||||
rcu_barrier_tasks_trace();
|
||||
rcu_barrier();
|
||||
}
|
||||
/* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */
|
||||
rcu_barrier_tasks_trace();
|
||||
rcu_barrier();
|
||||
kvfree(smap->buckets);
|
||||
bpf_map_area_free(smap);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,6 +125,11 @@ struct htab_elem {
|
|||
char key[] __aligned(8);
|
||||
};
|
||||
|
||||
struct htab_btf_record {
|
||||
struct btf_record *record;
|
||||
u32 key_size;
|
||||
};
|
||||
|
||||
static inline bool htab_is_prealloc(const struct bpf_htab *htab)
|
||||
{
|
||||
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
|
||||
|
|
@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void htab_mem_dtor(void *obj, void *ctx)
|
||||
{
|
||||
struct htab_btf_record *hrec = ctx;
|
||||
struct htab_elem *elem = obj;
|
||||
void *map_value;
|
||||
|
||||
if (IS_ERR_OR_NULL(hrec->record))
|
||||
return;
|
||||
|
||||
map_value = htab_elem_value(elem, hrec->key_size);
|
||||
bpf_obj_free_fields(hrec->record, map_value);
|
||||
}
|
||||
|
||||
static void htab_pcpu_mem_dtor(void *obj, void *ctx)
|
||||
{
|
||||
void __percpu *pptr = *(void __percpu **)obj;
|
||||
struct htab_btf_record *hrec = ctx;
|
||||
int cpu;
|
||||
|
||||
if (IS_ERR_OR_NULL(hrec->record))
|
||||
return;
|
||||
|
||||
for_each_possible_cpu(cpu)
|
||||
bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu));
|
||||
}
|
||||
|
||||
static void htab_dtor_ctx_free(void *ctx)
|
||||
{
|
||||
struct htab_btf_record *hrec = ctx;
|
||||
|
||||
btf_record_free(hrec->record);
|
||||
kfree(ctx);
|
||||
}
|
||||
|
||||
static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
|
||||
{
|
||||
u32 key_size = htab->map.key_size;
|
||||
struct bpf_mem_alloc *ma;
|
||||
struct htab_btf_record *hrec;
|
||||
int err;
|
||||
|
||||
/* No need for dtors. */
|
||||
if (IS_ERR_OR_NULL(htab->map.record))
|
||||
return 0;
|
||||
|
||||
hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
|
||||
if (!hrec)
|
||||
return -ENOMEM;
|
||||
hrec->key_size = key_size;
|
||||
hrec->record = btf_record_dup(htab->map.record);
|
||||
if (IS_ERR(hrec->record)) {
|
||||
err = PTR_ERR(hrec->record);
|
||||
kfree(hrec);
|
||||
return err;
|
||||
}
|
||||
ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
|
||||
bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
|
||||
const struct btf_type *key_type, const struct btf_type *value_type)
|
||||
{
|
||||
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
|
||||
|
||||
if (htab_is_prealloc(htab))
|
||||
return 0;
|
||||
/*
|
||||
* We must set the dtor using this callback, as map's BTF record is not
|
||||
* populated in htab_map_alloc(), so it will always appear as NULL.
|
||||
*/
|
||||
if (htab_is_percpu(htab))
|
||||
return htab_set_dtor(htab, htab_pcpu_mem_dtor);
|
||||
else
|
||||
return htab_set_dtor(htab, htab_mem_dtor);
|
||||
}
|
||||
|
||||
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
|
||||
{
|
||||
bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
|
||||
|
|
@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = {
|
|||
.map_seq_show_elem = htab_map_seq_show_elem,
|
||||
.map_set_for_each_callback_args = map_set_for_each_callback_args,
|
||||
.map_for_each_callback = bpf_for_each_hash_elem,
|
||||
.map_check_btf = htab_map_check_btf,
|
||||
.map_mem_usage = htab_map_mem_usage,
|
||||
BATCH_OPS(htab),
|
||||
.map_btf_id = &htab_map_btf_ids[0],
|
||||
|
|
@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
|
|||
.map_seq_show_elem = htab_map_seq_show_elem,
|
||||
.map_set_for_each_callback_args = map_set_for_each_callback_args,
|
||||
.map_for_each_callback = bpf_for_each_hash_elem,
|
||||
.map_check_btf = htab_map_check_btf,
|
||||
.map_mem_usage = htab_map_mem_usage,
|
||||
BATCH_OPS(htab_lru),
|
||||
.map_btf_id = &htab_map_btf_ids[0],
|
||||
|
|
@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
|
|||
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
|
||||
.map_set_for_each_callback_args = map_set_for_each_callback_args,
|
||||
.map_for_each_callback = bpf_for_each_hash_elem,
|
||||
.map_check_btf = htab_map_check_btf,
|
||||
.map_mem_usage = htab_map_mem_usage,
|
||||
BATCH_OPS(htab_percpu),
|
||||
.map_btf_id = &htab_map_btf_ids[0],
|
||||
|
|
@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
|
|||
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
|
||||
.map_set_for_each_callback_args = map_set_for_each_callback_args,
|
||||
.map_for_each_callback = bpf_for_each_hash_elem,
|
||||
.map_check_btf = htab_map_check_btf,
|
||||
.map_mem_usage = htab_map_mem_usage,
|
||||
BATCH_OPS(htab_lru_percpu),
|
||||
.map_btf_id = &htab_map_btf_ids[0],
|
||||
|
|
|
|||
|
|
@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int cgroup_storage_check_btf(const struct bpf_map *map,
|
||||
static int cgroup_storage_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
|
|
@ -751,7 +751,7 @@ free_stack:
|
|||
return err;
|
||||
}
|
||||
|
||||
static int trie_check_btf(const struct bpf_map *map,
|
||||
static int trie_check_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
|
|
@ -102,6 +102,8 @@ struct bpf_mem_cache {
|
|||
int percpu_size;
|
||||
bool draining;
|
||||
struct bpf_mem_cache *tgt;
|
||||
void (*dtor)(void *obj, void *ctx);
|
||||
void *dtor_ctx;
|
||||
|
||||
/* list of objects to be freed after RCU GP */
|
||||
struct llist_head free_by_rcu;
|
||||
|
|
@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu)
|
|||
kfree(obj);
|
||||
}
|
||||
|
||||
static int free_all(struct llist_node *llnode, bool percpu)
|
||||
static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
|
||||
{
|
||||
struct llist_node *pos, *t;
|
||||
int cnt = 0;
|
||||
|
||||
llist_for_each_safe(pos, t, llnode) {
|
||||
if (c->dtor)
|
||||
c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
|
||||
free_one(pos, percpu);
|
||||
cnt++;
|
||||
}
|
||||
|
|
@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head)
|
|||
{
|
||||
struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);
|
||||
|
||||
free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
|
||||
free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
|
||||
atomic_set(&c->call_rcu_ttrace_in_progress, 0);
|
||||
}
|
||||
|
||||
|
|
@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
|
|||
if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
|
||||
if (unlikely(READ_ONCE(c->draining))) {
|
||||
llnode = llist_del_all(&c->free_by_rcu_ttrace);
|
||||
free_all(llnode, !!c->percpu_size);
|
||||
free_all(c, llnode, !!c->percpu_size);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
|
@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c)
|
|||
dec_active(c, &flags);
|
||||
|
||||
if (unlikely(READ_ONCE(c->draining))) {
|
||||
free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
|
||||
free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
|
||||
atomic_set(&c->call_rcu_in_progress, 0);
|
||||
} else {
|
||||
call_rcu_hurry(&c->rcu, __free_by_rcu);
|
||||
|
|
@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c)
|
|||
* Except for waiting_for_gp_ttrace list, there are no concurrent operations
|
||||
* on these lists, so it is safe to use __llist_del_all().
|
||||
*/
|
||||
free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
|
||||
free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
|
||||
free_all(__llist_del_all(&c->free_llist), percpu);
|
||||
free_all(__llist_del_all(&c->free_llist_extra), percpu);
|
||||
free_all(__llist_del_all(&c->free_by_rcu), percpu);
|
||||
free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
|
||||
free_all(llist_del_all(&c->waiting_for_gp), percpu);
|
||||
free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
|
||||
free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
|
||||
free_all(c, __llist_del_all(&c->free_llist), percpu);
|
||||
free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
|
||||
free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
|
||||
free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
|
||||
free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
|
||||
}
|
||||
|
||||
static void check_mem_cache(struct bpf_mem_cache *c)
|
||||
|
|
@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma)
|
|||
|
||||
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
|
||||
{
|
||||
/* We can free dtor ctx only once all callbacks are done using it. */
|
||||
if (ma->dtor_ctx_free)
|
||||
ma->dtor_ctx_free(ma->dtor_ctx);
|
||||
check_leaked_objs(ma);
|
||||
free_percpu(ma->cache);
|
||||
free_percpu(ma->caches);
|
||||
|
|
@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
|
||||
void (*dtor_ctx_free)(void *ctx), void *ctx)
|
||||
{
|
||||
struct bpf_mem_caches *cc;
|
||||
struct bpf_mem_cache *c;
|
||||
int cpu, i;
|
||||
|
||||
ma->dtor_ctx_free = dtor_ctx_free;
|
||||
ma->dtor_ctx = ctx;
|
||||
|
||||
if (ma->cache) {
|
||||
for_each_possible_cpu(cpu) {
|
||||
c = per_cpu_ptr(ma->cache, cpu);
|
||||
c->dtor = dtor;
|
||||
c->dtor_ctx = ctx;
|
||||
}
|
||||
}
|
||||
if (ma->caches) {
|
||||
for_each_possible_cpu(cpu) {
|
||||
cc = per_cpu_ptr(ma->caches, cpu);
|
||||
for (i = 0; i < NUM_CACHES; i++) {
|
||||
c = &cc->cache[i];
|
||||
c->dtor = dtor;
|
||||
c->dtor_ctx = ctx;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
|
||||
|
||||
int map_check_no_btf(const struct bpf_map *map,
|
||||
int map_check_no_btf(struct bpf_map *map,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *key_type,
|
||||
const struct btf_type *value_type)
|
||||
|
|
|
|||
218
tools/testing/selftests/bpf/prog_tests/map_kptr_race.c
Normal file
218
tools/testing/selftests/bpf/prog_tests/map_kptr_race.c
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
|
||||
#include <test_progs.h>
|
||||
#include <network_helpers.h>
|
||||
|
||||
#include "map_kptr_race.skel.h"
|
||||
|
||||
static int get_map_id(int map_fd)
|
||||
{
|
||||
struct bpf_map_info info = {};
|
||||
__u32 len = sizeof(info);
|
||||
|
||||
if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info"))
|
||||
return -1;
|
||||
return info.id;
|
||||
}
|
||||
|
||||
static int read_refs(struct map_kptr_race *skel)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts);
|
||||
int ret;
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts);
|
||||
if (!ASSERT_OK(ret, "count_ref run"))
|
||||
return -1;
|
||||
if (!ASSERT_OK(opts.retval, "count_ref retval"))
|
||||
return -1;
|
||||
return skel->bss->num_of_refs;
|
||||
}
|
||||
|
||||
static void test_htab_leak(void)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts,
|
||||
.data_in = &pkt_v4,
|
||||
.data_size_in = sizeof(pkt_v4),
|
||||
.repeat = 1,
|
||||
);
|
||||
struct map_kptr_race *skel, *watcher;
|
||||
int ret, map_id;
|
||||
|
||||
skel = map_kptr_race__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_and_load"))
|
||||
return;
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts);
|
||||
if (!ASSERT_OK(ret, "test_htab_leak run"))
|
||||
goto out_skel;
|
||||
if (!ASSERT_OK(opts.retval, "test_htab_leak retval"))
|
||||
goto out_skel;
|
||||
|
||||
map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map));
|
||||
if (!ASSERT_GE(map_id, 0, "map_id"))
|
||||
goto out_skel;
|
||||
|
||||
watcher = map_kptr_race__open_and_load();
|
||||
if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
|
||||
goto out_skel;
|
||||
|
||||
watcher->bss->target_map_id = map_id;
|
||||
watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
|
||||
if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
|
||||
goto out_watcher;
|
||||
watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
|
||||
if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
|
||||
goto out_watcher;
|
||||
|
||||
map_kptr_race__destroy(skel);
|
||||
skel = NULL;
|
||||
|
||||
kern_sync_rcu();
|
||||
|
||||
while (!READ_ONCE(watcher->bss->map_freed))
|
||||
sched_yield();
|
||||
|
||||
ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
|
||||
ASSERT_EQ(read_refs(watcher), 2, "htab refcount");
|
||||
|
||||
out_watcher:
|
||||
map_kptr_race__destroy(watcher);
|
||||
out_skel:
|
||||
map_kptr_race__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_percpu_htab_leak(void)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_test_run_opts, opts,
|
||||
.data_in = &pkt_v4,
|
||||
.data_size_in = sizeof(pkt_v4),
|
||||
.repeat = 1,
|
||||
);
|
||||
struct map_kptr_race *skel, *watcher;
|
||||
int ret, map_id;
|
||||
|
||||
skel = map_kptr_race__open();
|
||||
if (!ASSERT_OK_PTR(skel, "open"))
|
||||
return;
|
||||
|
||||
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
|
||||
if (skel->rodata->nr_cpus > 16)
|
||||
skel->rodata->nr_cpus = 16;
|
||||
|
||||
ret = map_kptr_race__load(skel);
|
||||
if (!ASSERT_OK(ret, "load"))
|
||||
goto out_skel;
|
||||
|
||||
ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), &opts);
|
||||
if (!ASSERT_OK(ret, "test_percpu_htab_leak run"))
|
||||
goto out_skel;
|
||||
if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval"))
|
||||
goto out_skel;
|
||||
|
||||
map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map));
|
||||
if (!ASSERT_GE(map_id, 0, "map_id"))
|
||||
goto out_skel;
|
||||
|
||||
watcher = map_kptr_race__open_and_load();
|
||||
if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
|
||||
goto out_skel;
|
||||
|
||||
watcher->bss->target_map_id = map_id;
|
||||
watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
|
||||
if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
|
||||
goto out_watcher;
|
||||
watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
|
||||
if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
|
||||
goto out_watcher;
|
||||
|
||||
map_kptr_race__destroy(skel);
|
||||
skel = NULL;
|
||||
|
||||
kern_sync_rcu();
|
||||
|
||||
while (!READ_ONCE(watcher->bss->map_freed))
|
||||
sched_yield();
|
||||
|
||||
ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
|
||||
ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount");
|
||||
|
||||
out_watcher:
|
||||
map_kptr_race__destroy(watcher);
|
||||
out_skel:
|
||||
map_kptr_race__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_sk_ls_leak(void)
|
||||
{
|
||||
struct map_kptr_race *skel, *watcher;
|
||||
int listen_fd = -1, client_fd = -1, map_id;
|
||||
|
||||
skel = map_kptr_race__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "open_and_load"))
|
||||
return;
|
||||
|
||||
if (!ASSERT_OK(map_kptr_race__attach(skel), "attach"))
|
||||
goto out_skel;
|
||||
|
||||
listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
|
||||
if (!ASSERT_GE(listen_fd, 0, "start_server"))
|
||||
goto out_skel;
|
||||
|
||||
client_fd = connect_to_fd(listen_fd, 0);
|
||||
if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
|
||||
goto out_skel;
|
||||
|
||||
if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done"))
|
||||
goto out_skel;
|
||||
|
||||
close(client_fd);
|
||||
client_fd = -1;
|
||||
close(listen_fd);
|
||||
listen_fd = -1;
|
||||
|
||||
map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map));
|
||||
if (!ASSERT_GE(map_id, 0, "map_id"))
|
||||
goto out_skel;
|
||||
|
||||
watcher = map_kptr_race__open_and_load();
|
||||
if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
|
||||
goto out_skel;
|
||||
|
||||
watcher->bss->target_map_id = map_id;
|
||||
watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
|
||||
if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
|
||||
goto out_watcher;
|
||||
watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free);
|
||||
if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit"))
|
||||
goto out_watcher;
|
||||
|
||||
map_kptr_race__destroy(skel);
|
||||
skel = NULL;
|
||||
|
||||
kern_sync_rcu();
|
||||
|
||||
while (!READ_ONCE(watcher->bss->map_freed))
|
||||
sched_yield();
|
||||
|
||||
ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
|
||||
ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount");
|
||||
|
||||
out_watcher:
|
||||
map_kptr_race__destroy(watcher);
|
||||
out_skel:
|
||||
if (client_fd >= 0)
|
||||
close(client_fd);
|
||||
if (listen_fd >= 0)
|
||||
close(listen_fd);
|
||||
map_kptr_race__destroy(skel);
|
||||
}
|
||||
|
||||
void serial_test_map_kptr_race(void)
|
||||
{
|
||||
if (test__start_subtest("htab_leak"))
|
||||
test_htab_leak();
|
||||
if (test__start_subtest("percpu_htab_leak"))
|
||||
test_percpu_htab_leak();
|
||||
if (test__start_subtest("sk_ls_leak"))
|
||||
test_sk_ls_leak();
|
||||
}
|
||||
197
tools/testing/selftests/bpf/progs/map_kptr_race.c
Normal file
197
tools/testing/selftests/bpf/progs/map_kptr_race.c
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
|
||||
#include <vmlinux.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include "../test_kmods/bpf_testmod_kfunc.h"
|
||||
|
||||
struct map_value {
|
||||
struct prog_test_ref_kfunc __kptr *ref_ptr;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct map_value);
|
||||
__uint(max_entries, 1);
|
||||
} race_hash_map SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct map_value);
|
||||
__uint(max_entries, 1);
|
||||
} race_percpu_hash_map SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_SK_STORAGE);
|
||||
__uint(map_flags, BPF_F_NO_PREALLOC);
|
||||
__type(key, int);
|
||||
__type(value, struct map_value);
|
||||
} race_sk_ls_map SEC(".maps");
|
||||
|
||||
int num_of_refs;
|
||||
int sk_ls_leak_done;
|
||||
int target_map_id;
|
||||
int map_freed;
|
||||
const volatile int nr_cpus;
|
||||
|
||||
SEC("tc")
|
||||
int test_htab_leak(struct __sk_buff *skb)
|
||||
{
|
||||
struct prog_test_ref_kfunc *p, *old;
|
||||
struct map_value val = {};
|
||||
struct map_value *v;
|
||||
int key = 0;
|
||||
|
||||
if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY))
|
||||
return 1;
|
||||
|
||||
v = bpf_map_lookup_elem(&race_hash_map, &key);
|
||||
if (!v)
|
||||
return 2;
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
|
||||
if (!p)
|
||||
return 3;
|
||||
old = bpf_kptr_xchg(&v->ref_ptr, p);
|
||||
if (old)
|
||||
bpf_kfunc_call_test_release(old);
|
||||
|
||||
bpf_map_delete_elem(&race_hash_map, &key);
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
|
||||
if (!p)
|
||||
return 4;
|
||||
old = bpf_kptr_xchg(&v->ref_ptr, p);
|
||||
if (old)
|
||||
bpf_kfunc_call_test_release(old);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int fill_percpu_kptr(struct map_value *v)
|
||||
{
|
||||
struct prog_test_ref_kfunc *p, *old;
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
|
||||
if (!p)
|
||||
return 1;
|
||||
old = bpf_kptr_xchg(&v->ref_ptr, p);
|
||||
if (old)
|
||||
bpf_kfunc_call_test_release(old);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tc")
|
||||
int test_percpu_htab_leak(struct __sk_buff *skb)
|
||||
{
|
||||
struct map_value *v, *arr[16] = {};
|
||||
struct map_value val = {};
|
||||
int key = 0;
|
||||
int err = 0;
|
||||
|
||||
if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY))
|
||||
return 1;
|
||||
|
||||
for (int i = 0; i < nr_cpus; i++) {
|
||||
v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i);
|
||||
if (!v)
|
||||
return 2;
|
||||
arr[i] = v;
|
||||
}
|
||||
|
||||
bpf_map_delete_elem(&race_percpu_hash_map, &key);
|
||||
|
||||
for (int i = 0; i < nr_cpus; i++) {
|
||||
v = arr[i];
|
||||
err = fill_percpu_kptr(v);
|
||||
if (err)
|
||||
return 3;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tp_btf/inet_sock_set_state")
|
||||
int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate)
|
||||
{
|
||||
struct prog_test_ref_kfunc *p, *old;
|
||||
struct map_value *v;
|
||||
|
||||
if (newstate != BPF_TCP_SYN_SENT)
|
||||
return 0;
|
||||
|
||||
if (sk_ls_leak_done)
|
||||
return 0;
|
||||
|
||||
v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL,
|
||||
BPF_SK_STORAGE_GET_F_CREATE);
|
||||
if (!v)
|
||||
return 0;
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
|
||||
if (!p)
|
||||
return 0;
|
||||
old = bpf_kptr_xchg(&v->ref_ptr, p);
|
||||
if (old)
|
||||
bpf_kfunc_call_test_release(old);
|
||||
|
||||
bpf_sk_storage_delete(&race_sk_ls_map, sk);
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
|
||||
if (!p)
|
||||
return 0;
|
||||
old = bpf_kptr_xchg(&v->ref_ptr, p);
|
||||
if (old)
|
||||
bpf_kfunc_call_test_release(old);
|
||||
|
||||
sk_ls_leak_done = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
long target_map_ptr;
|
||||
|
||||
SEC("fentry/bpf_map_put")
|
||||
int BPF_PROG(map_put, struct bpf_map *map)
|
||||
{
|
||||
if (target_map_id && map->id == (u32)target_map_id)
|
||||
target_map_ptr = (long)map;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("fexit/htab_map_free")
|
||||
int BPF_PROG(htab_map_free, struct bpf_map *map)
|
||||
{
|
||||
if (target_map_ptr && (long)map == target_map_ptr)
|
||||
map_freed = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("fexit/bpf_sk_storage_map_free")
|
||||
int BPF_PROG(sk_map_free, struct bpf_map *map)
|
||||
{
|
||||
if (target_map_ptr && (long)map == target_map_ptr)
|
||||
map_freed = 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("syscall")
|
||||
int count_ref(void *ctx)
|
||||
{
|
||||
struct prog_test_ref_kfunc *p;
|
||||
unsigned long arg = 0;
|
||||
|
||||
p = bpf_kfunc_call_test_acquire(&arg);
|
||||
if (!p)
|
||||
return 1;
|
||||
|
||||
num_of_refs = p->cnt.refs.counter;
|
||||
|
||||
bpf_kfunc_call_test_release(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
Loading…
Add table
Add a link
Reference in a new issue