diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b78b53198a2e..05b34a6355b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -124,7 +124,7 @@ struct bpf_map_ops { u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); - int (*map_check_btf)(const struct bpf_map *map, + int (*map_check_btf)(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); @@ -656,7 +656,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) map->ops->map_seq_show_elem; } -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 85efa9772530..8157e8da61d4 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -176,7 +176,7 @@ u32 bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache); -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index e45162ef59bb..4ce0d27f8ea2 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,8 @@ struct bpf_mem_alloc { struct obj_cgroup *objcg; bool percpu; struct work_struct work; + void (*dtor_ctx_free)(void *ctx); + void *dtor_ctx; }; /* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. @@ -32,6 +34,10 @@ int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg /* The percpu allocation with a specific unit size. */ int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, + void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), + void *ctx); /* Check the allocation size for kmalloc equivalent allocator */ int bpf_mem_alloc_check_size(bool percpu, size_t size); diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 144f30e740e8..f355cf1c1a16 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -303,7 +303,7 @@ static long arena_map_update_elem(struct bpf_map *map, void *key, return -EOPNOTSUPP; } -static int arena_map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int arena_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) { return 0; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26763df6134a..33de68c95ad8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -548,7 +548,7 @@ static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, rcu_read_unlock(); } -static int array_map_check_btf(const struct bpf_map *map, +static int array_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 35e1ddca74d2..b73336c976b7 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -180,7 +180,7 @@ static long bloom_map_update_elem(struct bpf_map *map, void *key, return -EINVAL; } -static int bloom_map_check_btf(const struct bpf_map *map, +static int bloom_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_insn_array.c b/kernel/bpf/bpf_insn_array.c index c0286f25ca3c..a2f84afe6f7c 100644 --- a/kernel/bpf/bpf_insn_array.c +++ b/kernel/bpf/bpf_insn_array.c @@ -98,7 +98,7 @@ static long insn_array_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int insn_array_check_btf(const struct bpf_map *map, +static int insn_array_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b28f07d3a0db..9c96a4477f81 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -107,14 +107,12 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; - /* If RCU Tasks Trace grace period implies RCU grace period, do - * kfree(), else do kfree_rcu(). + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(local_storage); - else - kfree_rcu(local_storage, rcu); + kfree(local_storage); } /* Handle use_kmalloc_nolock == false */ @@ -138,10 +136,11 @@ static void bpf_local_storage_free_rcu(struct rcu_head *rcu) static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_local_storage_free_rcu(rcu); - else - call_rcu(rcu, bpf_local_storage_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_local_storage_free_rcu(rcu); } static void bpf_local_storage_free(struct bpf_local_storage *local_storage, @@ -164,16 +163,29 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage, bpf_local_storage_free_trace_rcu); } +/* rcu callback for use_kmalloc_nolock == false */ +static void __bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */ + smap = rcu_dereference_check(SDATA(selem)->smap, 1); + + if (smap) + bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); + kfree(selem); +} + /* rcu tasks trace callback for use_kmalloc_nolock == false */ static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - struct bpf_local_storage_elem *selem; - - selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - if (rcu_trace_implies_rcu_gp()) - kfree(selem); - else - kfree_rcu(selem, rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + __bpf_selem_free_rcu(rcu); } /* Handle use_kmalloc_nolock == false */ @@ -181,7 +193,7 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem, bool vanilla_rcu) { if (vanilla_rcu) - kfree_rcu(selem, rcu); + call_rcu(&selem->rcu, __bpf_selem_free_rcu); else call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu); } @@ -195,37 +207,29 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) /* The bpf_local_storage_map_free will wait for rcu_barrier */ smap = rcu_dereference_check(SDATA(selem)->smap, 1); - if (smap) { - migrate_disable(); + if (smap) bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); - migrate_enable(); - } kfree_nolock(selem); } static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) { - if (rcu_trace_implies_rcu_gp()) - bpf_selem_free_rcu(rcu); - else - call_rcu(rcu, bpf_selem_free_rcu); + /* + * RCU Tasks Trace grace period implies RCU grace period, do + * kfree() directly. + */ + bpf_selem_free_rcu(rcu); } void bpf_selem_free(struct bpf_local_storage_elem *selem, bool reuse_now) { - struct bpf_local_storage_map *smap; - - smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); - if (!selem->use_kmalloc_nolock) { /* * No uptr will be unpin even when reuse_now == false since uptr * is only supported in task local storage, where * smap->use_kmalloc_nolock == true. */ - if (smap) - bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); __bpf_selem_free(selem, reuse_now); return; } @@ -797,7 +801,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -int bpf_local_storage_map_check_btf(const struct bpf_map *map, +int bpf_local_storage_map_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) @@ -958,10 +962,9 @@ restart: */ synchronize_rcu(); - if (smap->use_kmalloc_nolock) { - rcu_barrier_tasks_trace(); - rcu_barrier(); - } + /* smap remains in use regardless of kmalloc_nolock, so wait unconditionally. */ + rcu_barrier_tasks_trace(); + rcu_barrier(); kvfree(smap->buckets); bpf_map_area_free(smap); } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3b9d297a53be..bc6bc8bb871d 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -125,6 +125,11 @@ struct htab_elem { char key[] __aligned(8); }; +struct htab_btf_record { + struct btf_record *record; + u32 key_size; +}; + static inline bool htab_is_prealloc(const struct bpf_htab *htab) { return !(htab->map.map_flags & BPF_F_NO_PREALLOC); @@ -457,6 +462,83 @@ static int htab_map_alloc_check(union bpf_attr *attr) return 0; } +static void htab_mem_dtor(void *obj, void *ctx) +{ + struct htab_btf_record *hrec = ctx; + struct htab_elem *elem = obj; + void *map_value; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + map_value = htab_elem_value(elem, hrec->key_size); + bpf_obj_free_fields(hrec->record, map_value); +} + +static void htab_pcpu_mem_dtor(void *obj, void *ctx) +{ + void __percpu *pptr = *(void __percpu **)obj; + struct htab_btf_record *hrec = ctx; + int cpu; + + if (IS_ERR_OR_NULL(hrec->record)) + return; + + for_each_possible_cpu(cpu) + bpf_obj_free_fields(hrec->record, per_cpu_ptr(pptr, cpu)); +} + +static void htab_dtor_ctx_free(void *ctx) +{ + struct htab_btf_record *hrec = ctx; + + btf_record_free(hrec->record); + kfree(ctx); +} + +static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *)) +{ + u32 key_size = htab->map.key_size; + struct bpf_mem_alloc *ma; + struct htab_btf_record *hrec; + int err; + + /* No need for dtors. */ + if (IS_ERR_OR_NULL(htab->map.record)) + return 0; + + hrec = kzalloc(sizeof(*hrec), GFP_KERNEL); + if (!hrec) + return -ENOMEM; + hrec->key_size = key_size; + hrec->record = btf_record_dup(htab->map.record); + if (IS_ERR(hrec->record)) { + err = PTR_ERR(hrec->record); + kfree(hrec); + return err; + } + ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma; + bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec); + return 0; +} + +static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf, + const struct btf_type *key_type, const struct btf_type *value_type) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + if (htab_is_prealloc(htab)) + return 0; + /* + * We must set the dtor using this callback, as map's BTF record is not + * populated in htab_map_alloc(), so it will always appear as NULL. + */ + if (htab_is_percpu(htab)) + return htab_set_dtor(htab, htab_pcpu_mem_dtor); + else + return htab_set_dtor(htab, htab_mem_dtor); +} + static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -2281,6 +2363,7 @@ const struct bpf_map_ops htab_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab), .map_btf_id = &htab_map_btf_ids[0], @@ -2303,6 +2386,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_seq_show_elem = htab_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru), .map_btf_id = &htab_map_btf_ids[0], @@ -2482,6 +2566,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_percpu), .map_btf_id = &htab_map_btf_ids[0], @@ -2502,6 +2587,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_seq_show_elem = htab_percpu_map_seq_show_elem, .map_set_for_each_callback_args = map_set_for_each_callback_args, .map_for_each_callback = bpf_for_each_hash_elem, + .map_check_btf = htab_map_check_btf, .map_mem_usage = htab_map_mem_usage, BATCH_OPS(htab_lru_percpu), .map_btf_id = &htab_map_btf_ids[0], diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 1ccbf28b2ad9..8fca0c64f7b1 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -364,7 +364,7 @@ static long cgroup_storage_delete_elem(struct bpf_map *map, void *key) return -EINVAL; } -static int cgroup_storage_check_btf(const struct bpf_map *map, +static int cgroup_storage_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1adeb4d3b8cf..0f57608b385d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -751,7 +751,7 @@ free_stack: return err; } -static int trie_check_btf(const struct bpf_map *map, +static int trie_check_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index bd45dda9dc35..682a9f34214b 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -102,6 +102,8 @@ struct bpf_mem_cache { int percpu_size; bool draining; struct bpf_mem_cache *tgt; + void (*dtor)(void *obj, void *ctx); + void *dtor_ctx; /* list of objects to be freed after RCU GP */ struct llist_head free_by_rcu; @@ -260,12 +262,14 @@ static void free_one(void *obj, bool percpu) kfree(obj); } -static int free_all(struct llist_node *llnode, bool percpu) +static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu) { struct llist_node *pos, *t; int cnt = 0; llist_for_each_safe(pos, t, llnode) { + if (c->dtor) + c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx); free_one(pos, percpu); cnt++; } @@ -276,7 +280,7 @@ static void __free_rcu(struct rcu_head *head) { struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); atomic_set(&c->call_rcu_ttrace_in_progress, 0); } @@ -308,7 +312,7 @@ static void do_call_rcu_ttrace(struct bpf_mem_cache *c) if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { if (unlikely(READ_ONCE(c->draining))) { llnode = llist_del_all(&c->free_by_rcu_ttrace); - free_all(llnode, !!c->percpu_size); + free_all(c, llnode, !!c->percpu_size); } return; } @@ -417,7 +421,7 @@ static void check_free_by_rcu(struct bpf_mem_cache *c) dec_active(c, &flags); if (unlikely(READ_ONCE(c->draining))) { - free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); + free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size); atomic_set(&c->call_rcu_in_progress, 0); } else { call_rcu_hurry(&c->rcu, __free_by_rcu); @@ -635,13 +639,13 @@ static void drain_mem_cache(struct bpf_mem_cache *c) * Except for waiting_for_gp_ttrace list, there are no concurrent operations * on these lists, so it is safe to use __llist_del_all(). */ - free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); - free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); - free_all(__llist_del_all(&c->free_llist), percpu); - free_all(__llist_del_all(&c->free_llist_extra), percpu); - free_all(__llist_del_all(&c->free_by_rcu), percpu); - free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); - free_all(llist_del_all(&c->waiting_for_gp), percpu); + free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu); + free_all(c, __llist_del_all(&c->free_llist), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra), percpu); + free_all(c, __llist_del_all(&c->free_by_rcu), percpu); + free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu); + free_all(c, llist_del_all(&c->waiting_for_gp), percpu); } static void check_mem_cache(struct bpf_mem_cache *c) @@ -680,6 +684,9 @@ static void check_leaked_objs(struct bpf_mem_alloc *ma) static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) { + /* We can free dtor ctx only once all callbacks are done using it. */ + if (ma->dtor_ctx_free) + ma->dtor_ctx_free(ma->dtor_ctx); check_leaked_objs(ma); free_percpu(ma->cache); free_percpu(ma->caches); @@ -1014,3 +1021,32 @@ int bpf_mem_alloc_check_size(bool percpu, size_t size) return 0; } + +void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx), + void (*dtor_ctx_free)(void *ctx), void *ctx) +{ + struct bpf_mem_caches *cc; + struct bpf_mem_cache *c; + int cpu, i; + + ma->dtor_ctx_free = dtor_ctx_free; + ma->dtor_ctx = ctx; + + if (ma->cache) { + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(ma->cache, cpu); + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + if (ma->caches) { + for_each_possible_cpu(cpu) { + cc = per_cpu_ptr(ma->caches, cpu); + for (i = 0; i < NUM_CACHES; i++) { + c = &cc->cache[i]; + c->dtor = dtor; + c->dtor_ctx = ctx; + } + } + } +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0378e83b4099..274039e36465 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1234,7 +1234,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size) } EXPORT_SYMBOL_GPL(bpf_obj_name_cpy); -int map_check_no_btf(const struct bpf_map *map, +int map_check_no_btf(struct bpf_map *map, const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type) diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c new file mode 100644 index 000000000000..506ed55e8528 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include + +#include "map_kptr_race.skel.h" + +static int get_map_id(int map_fd) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + + if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info")) + return -1; + return info.id; +} + +static int read_refs(struct map_kptr_race *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + int ret; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts); + if (!ASSERT_OK(ret, "count_ref run")) + return -1; + if (!ASSERT_OK(opts.retval, "count_ref retval")) + return -1; + return skel->bss->num_of_refs; +} + +static void test_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts); + if (!ASSERT_OK(ret, "test_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_percpu_htab_leak(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1, + ); + struct map_kptr_race *skel, *watcher; + int ret, map_id; + + skel = map_kptr_race__open(); + if (!ASSERT_OK_PTR(skel, "open")) + return; + + skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + if (skel->rodata->nr_cpus > 16) + skel->rodata->nr_cpus = 16; + + ret = map_kptr_race__load(skel); + if (!ASSERT_OK(ret, "load")) + goto out_skel; + + ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), &opts); + if (!ASSERT_OK(ret, "test_percpu_htab_leak run")) + goto out_skel; + if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval")) + goto out_skel; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free); + if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + map_kptr_race__destroy(skel); +} + +static void test_sk_ls_leak(void) +{ + struct map_kptr_race *skel, *watcher; + int listen_fd = -1, client_fd = -1, map_id; + + skel = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(skel, "open_and_load")) + return; + + if (!ASSERT_OK(map_kptr_race__attach(skel), "attach")) + goto out_skel; + + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + if (!ASSERT_GE(listen_fd, 0, "start_server")) + goto out_skel; + + client_fd = connect_to_fd(listen_fd, 0); + if (!ASSERT_GE(client_fd, 0, "connect_to_fd")) + goto out_skel; + + if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done")) + goto out_skel; + + close(client_fd); + client_fd = -1; + close(listen_fd); + listen_fd = -1; + + map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map)); + if (!ASSERT_GE(map_id, 0, "map_id")) + goto out_skel; + + watcher = map_kptr_race__open_and_load(); + if (!ASSERT_OK_PTR(watcher, "watcher open_and_load")) + goto out_skel; + + watcher->bss->target_map_id = map_id; + watcher->links.map_put = bpf_program__attach(watcher->progs.map_put); + if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry")) + goto out_watcher; + watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free); + if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit")) + goto out_watcher; + + map_kptr_race__destroy(skel); + skel = NULL; + + kern_sync_rcu(); + + while (!READ_ONCE(watcher->bss->map_freed)) + sched_yield(); + + ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed"); + ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount"); + +out_watcher: + map_kptr_race__destroy(watcher); +out_skel: + if (client_fd >= 0) + close(client_fd); + if (listen_fd >= 0) + close(listen_fd); + map_kptr_race__destroy(skel); +} + +void serial_test_map_kptr_race(void) +{ + if (test__start_subtest("htab_leak")) + test_htab_leak(); + if (test__start_subtest("percpu_htab_leak")) + test_percpu_htab_leak(); + if (test__start_subtest("sk_ls_leak")) + test_sk_ls_leak(); +} diff --git a/tools/testing/selftests/bpf/progs/map_kptr_race.c b/tools/testing/selftests/bpf/progs/map_kptr_race.c new file mode 100644 index 000000000000..f6f136cd8f60 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/map_kptr_race.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "../test_kmods/bpf_testmod_kfunc.h" + +struct map_value { + struct prog_test_ref_kfunc __kptr *ref_ptr; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); + __uint(max_entries, 1); +} race_percpu_hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct map_value); +} race_sk_ls_map SEC(".maps"); + +int num_of_refs; +int sk_ls_leak_done; +int target_map_id; +int map_freed; +const volatile int nr_cpus; + +SEC("tc") +int test_htab_leak(struct __sk_buff *skb) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value val = {}; + struct map_value *v; + int key = 0; + + if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY)) + return 1; + + v = bpf_map_lookup_elem(&race_hash_map, &key); + if (!v) + return 2; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 3; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_map_delete_elem(&race_hash_map, &key); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 4; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + return 0; +} + +static int fill_percpu_kptr(struct map_value *v) +{ + struct prog_test_ref_kfunc *p, *old; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 1; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + return 0; +} + +SEC("tc") +int test_percpu_htab_leak(struct __sk_buff *skb) +{ + struct map_value *v, *arr[16] = {}; + struct map_value val = {}; + int key = 0; + int err = 0; + + if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY)) + return 1; + + for (int i = 0; i < nr_cpus; i++) { + v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i); + if (!v) + return 2; + arr[i] = v; + } + + bpf_map_delete_elem(&race_percpu_hash_map, &key); + + for (int i = 0; i < nr_cpus; i++) { + v = arr[i]; + err = fill_percpu_kptr(v); + if (err) + return 3; + } + + return 0; +} + +SEC("tp_btf/inet_sock_set_state") +int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate) +{ + struct prog_test_ref_kfunc *p, *old; + struct map_value *v; + + if (newstate != BPF_TCP_SYN_SENT) + return 0; + + if (sk_ls_leak_done) + return 0; + + v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!v) + return 0; + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + bpf_sk_storage_delete(&race_sk_ls_map, sk); + + p = bpf_kfunc_call_test_acquire(&(unsigned long){0}); + if (!p) + return 0; + old = bpf_kptr_xchg(&v->ref_ptr, p); + if (old) + bpf_kfunc_call_test_release(old); + + sk_ls_leak_done = 1; + return 0; +} + +long target_map_ptr; + +SEC("fentry/bpf_map_put") +int BPF_PROG(map_put, struct bpf_map *map) +{ + if (target_map_id && map->id == (u32)target_map_id) + target_map_ptr = (long)map; + return 0; +} + +SEC("fexit/htab_map_free") +int BPF_PROG(htab_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 1; + return 0; +} + +SEC("fexit/bpf_sk_storage_map_free") +int BPF_PROG(sk_map_free, struct bpf_map *map) +{ + if (target_map_ptr && (long)map == target_map_ptr) + map_freed = 1; + return 0; +} + +SEC("syscall") +int count_ref(void *ctx) +{ + struct prog_test_ref_kfunc *p; + unsigned long arg = 0; + + p = bpf_kfunc_call_test_acquire(&arg); + if (!p) + return 1; + + num_of_refs = p->cnt.refs.counter; + + bpf_kfunc_call_test_release(p); + return 0; +} + +char _license[] SEC("license") = "GPL";