From e1c4350327b39c9cad27b6c5779b3754384f26c8 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 26 Aug 2025 14:23:14 +0800 Subject: [PATCH 01/40] mm/slub: Fix cmp_loc_by_count() to return 0 when counts are equal The comparison function cmp_loc_by_count() used for sorting stack trace locations in debugfs currently returns -1 if a->count > b->count and 1 otherwise. This breaks the antisymmetry property required by sort(), because when two counts are equal, both cmp(a, b) and cmp(b, a) return 1. This can lead to undefined or incorrect ordering results. Fix it by updating the comparison logic to explicitly handle the case when counts are equal, and use cmp_int() to ensure the comparison function adheres to the required mathematical properties of antisymmetry. Fixes: 553c0369b3e1 ("mm/slub: sort debugfs output by frequency of stack traces") Reviewed-by: Joshua Hahn Signed-off-by: Kuan-Wei Chiu Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d257141896c9..264fc76455d7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7731,10 +7731,7 @@ static int cmp_loc_by_count(const void *a, const void *b, const void *data) struct location *loc1 = (struct location *)a; struct location *loc2 = (struct location *)b; - if (loc1->count > loc2->count) - return -1; - else - return 1; + return cmp_int(loc2->count, loc1->count); } static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) From ba7a896427cbade13d30f3c7e18c15e8be243c18 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 26 Aug 2025 14:23:15 +0800 Subject: [PATCH 02/40] mm/slub: Replace sort_r() with sort() for debugfs stack trace sorting The comparison function used to sort stack trace locations in debugfs never relied on the third argument. Therefore, sort_r() is unnecessary. Switch to sort() with a two-argument comparison function to keep the code simple and aligned with the intended usage. Signed-off-by: Kuan-Wei Chiu Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 264fc76455d7..9074ce914e9e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7726,7 +7726,7 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } -static int cmp_loc_by_count(const void *a, const void *b, const void *data) +static int cmp_loc_by_count(const void *a, const void *b) { struct location *loc1 = (struct location *)a; struct location *loc2 = (struct location *)b; @@ -7793,8 +7793,8 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) } /* Sort locations by count */ - sort_r(t->loc, t->count, sizeof(struct location), - cmp_loc_by_count, NULL, NULL); + sort(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL); bitmap_free(obj_map); return 0; From 41534d499e50e23571d6b9960498777d93f817ce Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Tue, 9 Sep 2025 09:33:07 +0800 Subject: [PATCH 03/40] mm/slub: Refactor note_cmpxchg_failure for better readability Use IS_ENABLED() and standard if-else to make the code clearer. Signed-off-by: Ye Liu Reviewed-by: Harry Yoo Reviewed-by: Christoph Lameter (Ampere) Reviewed-by: Anshuman Khandual Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 9074ce914e9e..3062f56bf498 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3069,18 +3069,17 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPTION - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + if (IS_ENABLED(CONFIG_PREEMPTION) && + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); - else -#endif - if (tid_to_event(tid) != tid_to_event(actual_tid)) + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); - else + } else { pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); + } #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } From f4930de03dcf4a3f2515546f5b3c04bac016685d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 15 Sep 2025 15:55:08 +0200 Subject: [PATCH 04/40] slab: Remove dead code in free_consistency_checks() We already know that slab is a valid slab as that's checked by the caller. In the future, we won't be able to get to a slab pointer from a non-slab page. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3062f56bf498..56143bfd1ae3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1684,10 +1684,7 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 0; if (unlikely(s != slab->slab_cache)) { - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", - object); - } else if (!slab->slab_cache) { + if (!slab->slab_cache) { slab_err(NULL, slab, "No slab cache for object 0x%p", object); } else { From 86169b00f89633e255048feb5dde438dd3522fd4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Sep 2025 15:55:09 +0200 Subject: [PATCH 05/40] slab: wrap debug slab validation in validate_slab_ptr() This will make it clear where we currently cast struct slab to folio only to check the slab type, and allow to change the implementation later with memdesc conversion. For now use a struct page based implementation instead of struct folio to be compatible with further upcoming changes. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 56143bfd1ae3..75e4388d507d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -821,6 +821,15 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) return *(unsigned int *)p; } +/* + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. + */ +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -1453,7 +1462,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!folio_test_slab(slab_folio(slab))) { + if (!validate_slab_ptr(slab)) { slab_err(s, slab, "Not a valid slab page"); return 0; } @@ -1653,7 +1662,7 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, return true; bad: - if (folio_test_slab(slab_folio(slab))) { + if (validate_slab_ptr(slab)) { /* * If this is a slab page then lets do the best we can * to avoid issues in the future. Marking all objects @@ -2818,7 +2827,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s, slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - if (folio_test_slab(slab_folio(slab))) + if (validate_slab_ptr(slab)) remove_partial(n, slab); return NULL; } From 6f6fcd463410445101fae04335f617f40e6eca8a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Sep 2025 15:55:10 +0200 Subject: [PATCH 06/40] slab: move validate_slab_ptr() from check_slab() to its callers We will want to do the validation earlier in some callers or remove it completely, so extract it from check_slab() first. No functional change. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 75e4388d507d..6fb24010c170 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1458,15 +1458,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, return ret; } +/* + * Checks if the slab state looks sane. Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!validate_slab_ptr(slab)) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - maxobj = order_objects(slab_order(slab), s->size); if (slab->objects > maxobj) { slab_err(s, slab, "objects %u > max %u", @@ -1633,6 +1633,11 @@ void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) static inline int alloc_consistency_checks(struct kmem_cache *s, struct slab *slab, void *object) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return 0; + } + if (!check_slab(s, slab)) return 0; @@ -3485,6 +3490,11 @@ static inline bool free_debug_processing(struct kmem_cache *s, int cnt = 0; if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + goto out; + } + if (!check_slab(s, slab)) goto out; } @@ -6519,6 +6529,11 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, void *p; void *addr = slab_address(slab); + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return; + } + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) return; From 40522db59b5bd36dd63d215bfb7fa83d64bce05a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Sep 2025 15:55:11 +0200 Subject: [PATCH 07/40] slab: move validate_slab_ptr() from alloc_consistency_checks() to its caller In alloc_debug_processing() we can call validate_slab_ptr() upfront and then don't need to recheck when alloc_consistency_checks() fails for other reasons. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6fb24010c170..12ad42f3d2e0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1633,11 +1633,6 @@ void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) static inline int alloc_consistency_checks(struct kmem_cache *s, struct slab *slab, void *object) { - if (!validate_slab_ptr(slab)) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - if (!check_slab(s, slab)) return 0; @@ -1656,6 +1651,11 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return false; + } + if (!alloc_consistency_checks(s, slab, object)) goto bad; } @@ -1667,17 +1667,15 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, return true; bad: - if (validate_slab_ptr(slab)) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - slab->inuse = slab->objects; - slab->freelist = NULL; - slab->frozen = 1; /* mark consistency-failed slab as frozen */ - } + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. + */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ + return false; } From a21fe7b010e32c51c62a86dcba02f9404ed77cac Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Sep 2025 15:55:12 +0200 Subject: [PATCH 08/40] slab: validate slab before using it in alloc_single_from_partial() We touch slab->freelist and slab->inuse before checking the slab pointer is actually sane. Do that validation first, which will be safer. We can thus also remove the check from alloc_debug_processing(). This adds a new "s->flags & SLAB_CONSISTENCY_CHECKS" test but alloc_single_from_partial() is only called for caches with debugging enabled so it's acceptable. In alloc_single_from_new_slab() we just created the struct slab and call alloc_debug_processing() to mainly set up redzones, tracking etc, while not really expecting the consistency checks to fail. Thus don't validate it there. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 12ad42f3d2e0..b3b65429e2d7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -821,6 +821,8 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) return *(unsigned int *)p; } +#ifdef CONFIG_SLUB_DEBUG + /* * For debugging context when we want to check if the struct slab pointer * appears to be valid. @@ -830,7 +832,6 @@ static inline bool validate_slab_ptr(struct slab *slab) return PageSlab(slab_page(slab)); } -#ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -1651,11 +1652,6 @@ static noinline bool alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!validate_slab_ptr(slab)) { - slab_err(s, slab, "Not a valid slab page"); - return false; - } - if (!alloc_consistency_checks(s, slab, object)) goto bad; } @@ -2825,13 +2821,21 @@ static void *alloc_single_from_partial(struct kmem_cache *s, lockdep_assert_held(&n->list_lock); +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return NULL; + } + } +#endif + object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - if (validate_slab_ptr(slab)) - remove_partial(n, slab); + remove_partial(n, slab); return NULL; } From 3864e4d5a526870e011e6aadc05645bc93ca3dd6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 15 Sep 2025 15:55:13 +0200 Subject: [PATCH 09/40] slab: don't validate slab pointer in free_debug_processing() The struct slab pointer has been obtained from the object being freed on all the paths that lead to this function. In all cases this already includes the test for slab type of the struct page which struct slab is overlaying. Thus we would not reach this function if it was not a valid slab pointer in the first place. One less obvious case is that kmem_cache_free() trusts virt_to_slab() blindly so it may be NULL if the slab type check is false. But with SLAB_CONSISTENCY_CHECKS, cache_from_obj() called also from kmem_cache_free() catches this and returns NULL, which terminates freeing immediately. Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b3b65429e2d7..674c5036a18a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3492,11 +3492,6 @@ static inline bool free_debug_processing(struct kmem_cache *s, int cnt = 0; if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!validate_slab_ptr(slab)) { - slab_err(s, slab, "Not a valid slab page"); - goto out; - } - if (!check_slab(s, slab)) goto out; } From 4038016397da5c1cebb10e7c85a36d06123724a8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 13:09:17 -0700 Subject: [PATCH 10/40] slab: prevent warnings when slab obj_exts vector allocation fails When object extension vector allocation fails, we set slab->obj_exts to OBJEXTS_ALLOC_FAIL to indicate the failure. Later, once the vector is successfully allocated, we will use this flag to mark codetag references stored in that vector as empty to avoid codetag warnings. slab_obj_exts() used to retrieve the slab->obj_exts vector pointer checks slab->obj_exts for being either NULL or a pointer with MEMCG_DATA_OBJEXTS bit set. However it does not handle the case when slab->obj_exts equals OBJEXTS_ALLOC_FAIL. Add the missing condition to avoid extra warning. Fixes: 09c46563ff6d ("codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext allocations") Reported-by: Shakeel Butt Closes: https://lore.kernel.org/all/jftidhymri2af5u3xtcqry3cfu6aqzte3uzlznhlaylgrdztsi@5vpjnzpsemf5/ Signed-off-by: Suren Baghdasaryan Cc: stable@vger.kernel.org # v6.10+ Acked-by: Shakeel Butt Signed-off-by: Vlastimil Babka --- mm/slab.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..0dd3f33c0963 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -526,8 +526,12 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) unsigned long obj_exts = READ_ONCE(slab->obj_exts); #ifdef CONFIG_MEMCG - VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), - slab_page(slab)); + /* + * obj_exts should be either NULL, a valid pointer with + * MEMCG_DATA_OBJEXTS bit set or be equal to OBJEXTS_ALLOC_FAIL. + */ + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS) && + obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); From f7381b9116407ba2a429977c80ff8df953ea9354 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 13:09:18 -0700 Subject: [PATCH 11/40] slab: mark slab->obj_exts allocation failures unconditionally alloc_slab_obj_exts() should mark failed obj_exts vector allocations independent on whether the vector is being allocated for a new or an existing slab. Current implementation skips doing this for existing slabs. Fix this by marking failed allocations unconditionally. Fixes: 09c46563ff6d ("codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext allocations") Reported-by: Shakeel Butt Closes: https://lore.kernel.org/all/avhakjldsgczmq356gkwmvfilyvf7o6temvcmtt5lqd4fhp5rk@47gp2ropyixg/ Signed-off-by: Suren Baghdasaryan Cc: stable@vger.kernel.org # v6.10+ Acked-by: Shakeel Butt Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 674c5036a18a..b74d65aa32c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2039,8 +2039,7 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, slab_nid(slab)); if (!vec) { /* Mark vectors which failed to allocate */ - if (new_slab) - mark_failed_objexts_alloc(slab); + mark_failed_objexts_alloc(slab); return -ENOMEM; } From 212b0f07cf021575ec25e0b2336df77c7a4d2e68 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 3 Sep 2025 14:59:43 +0200 Subject: [PATCH 12/40] locking/local_lock: Expose dep_map in local_trylock_t. lockdep_is_held() macro assumes that "struct lockdep_map dep_map;" is a top level field of any lock that participates in LOCKDEP. Make it so for local_trylock_t. Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/local_lock_internal.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index d80b5306a2c0..949de37700db 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -17,7 +17,10 @@ typedef struct { /* local_trylock() and local_trylock_irqsave() only work with local_trylock_t */ typedef struct { - local_lock_t llock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + struct task_struct *owner; +#endif u8 acquired; } local_trylock_t; @@ -31,7 +34,7 @@ typedef struct { .owner = NULL, # define LOCAL_TRYLOCK_DEBUG_INIT(lockname) \ - .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) }, + LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { @@ -81,7 +84,7 @@ do { \ local_lock_debug_init(lock); \ } while (0) -#define __local_trylock_init(lock) __local_lock_init(lock.llock) +#define __local_trylock_init(lock) __local_lock_init((local_lock_t *)lock) #define __spinlock_nested_bh_init(lock) \ do { \ From 9d4e6ab865c48c70e684b176d3ee1574d092626f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:44 +0200 Subject: [PATCH 13/40] slab: simplify init_kmem_cache_nodes() error handling We don't need to call free_kmem_cache_nodes() immediately when failing to allocate a kmem_cache_node, because when we return 0, do_kmem_cache_create() calls __kmem_cache_release() which also performs free_kmem_cache_nodes(). Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..9f671ec76131 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5669,10 +5669,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - free_kmem_cache_nodes(s); + if (!n) return 0; - } init_kmem_cache_node(n); s->node[node] = n; From 2d517aa09bbc4203f10cdee7e1d42f3bbdc1b1cd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:45 +0200 Subject: [PATCH 14/40] slab: add opt-in caching layer of percpu sheaves Specifying a non-zero value for a new struct kmem_cache_args field sheaf_capacity will setup a caching layer of percpu arrays called sheaves of given capacity for the created cache. Allocations from the cache will allocate via the percpu sheaves (main or spare) as long as they have no NUMA node preference. Frees will also put the object back into one of the sheaves. When both percpu sheaves are found empty during an allocation, an empty sheaf may be replaced with a full one from the per-node barn. If none are available and the allocation is allowed to block, an empty sheaf is refilled from slab(s) by an internal bulk alloc operation. When both percpu sheaves are full during freeing, the barn can replace a full one with an empty one, unless over a full sheaves limit. In that case a sheaf is flushed to slab(s) by an internal bulk free operation. Flushing sheaves and barns is also wired to the existing cpu flushing and cache shrinking operations. The sheaves do not distinguish NUMA locality of the cached objects. If an allocation is requested with kmem_cache_alloc_node() (or a mempolicy with strict_numa mode enabled) with a specific node (not NUMA_NO_NODE), the sheaves are bypassed. The bulk operations exposed to slab users also try to utilize the sheaves as long as the necessary (full or empty) sheaves are available on the cpu or in the barn. Once depleted, they will fallback to bulk alloc/free to slabs directly to avoid double copying. The sheaf_capacity value is exported in sysfs for observability. Sysfs CONFIG_SLUB_STATS counters alloc_cpu_sheaf and free_cpu_sheaf count objects allocated or freed using the sheaves (and thus not counting towards the other alloc/free path counters). Counters sheaf_refill and sheaf_flush count objects filled or flushed from or to slab pages, and can be used to assess how effective the caching is. The refill and flush operations will also count towards the usual alloc_fastpath/slowpath, free_fastpath/slowpath and other counters for the backing slabs. For barn operations, barn_get and barn_put count how many full sheaves were get from or put to the barn, the _fail variants count how many such requests could not be satisfied mainly because the barn was either empty or full. While the barn also holds empty sheaves to make some operations easier, these are not as critical to mandate own counters. Finally, there are sheaf_alloc/sheaf_free counters. Access to the percpu sheaves is protected by local_trylock() when potential callers include irq context, and local_lock() otherwise (such as when we already know the gfp flags allow blocking). The trylock failures should be rare and we can easily fallback. Each per-NUMA-node barn has a spin_lock. When slub_debug is enabled for a cache with sheaf_capacity also specified, the latter is ignored so that allocations and frees reach the slow path where debugging hooks are processed. Similarly, we ignore it with CONFIG_SLUB_TINY which prefers low memory usage to performance. [boot failure: https://lore.kernel.org/all/583eacf5-c971-451a-9f76-fed0e341b815@linux.ibm.com/ ] Reported-and-tested-by: Venkat Rao Bagalkote Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 ++ mm/slab.h | 2 + mm/slab_common.c | 5 +- mm/slub.c | 1140 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 1130 insertions(+), 48 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..49acbcdc6696 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -335,6 +335,37 @@ struct kmem_cache_args { * %NULL means no constructor. */ void (*ctor)(void *); + /** + * @sheaf_capacity: Enable sheaves of given capacity for the cache. + * + * With a non-zero value, allocations from the cache go through caching + * arrays called sheaves. Each cpu has a main sheaf that's always + * present, and a spare sheaf that may be not present. When both become + * empty, there's an attempt to replace an empty sheaf with a full sheaf + * from the per-node barn. + * + * When no full sheaf is available, and gfp flags allow blocking, a + * sheaf is allocated and filled from slab(s) using bulk allocation. + * Otherwise the allocation falls back to the normal operation + * allocating a single object from a slab. + * + * Analogically when freeing and both percpu sheaves are full, the barn + * may replace it with an empty sheaf, unless it's over capacity. In + * that case a sheaf is bulk freed to slab pages. + * + * The sheaves do not enforce NUMA placement of objects, so allocations + * via kmem_cache_alloc_node() with a node specified other than + * NUMA_NO_NODE will bypass them. + * + * Bulk allocation and free operations also try to use the cpu sheaves + * and barn, but fallback to using slab pages directly. + * + * When slub_debug is enabled for the cache, the sheaf_capacity argument + * is ignored. + * + * %0 means no sheaves will be created. + */ + unsigned int sheaf_capacity; }; struct kmem_cache *__kmem_cache_create_args(const char *name, diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..206987ce44a4 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,6 +235,7 @@ struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; #endif + struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; unsigned long min_partial; @@ -248,6 +249,7 @@ struct kmem_cache { /* Number of per cpu partial slabs to keep around */ unsigned int cpu_partial_slabs; #endif + unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ diff --git a/mm/slab_common.c b/mm/slab_common.c index bfe7c40eeee1..e2b197e47866 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -163,6 +163,9 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif + if (s->cpu_sheaves) + return 1; + /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -321,7 +324,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize) + if (!args->usersize && !args->sheaf_capacity) s = __kmem_cache_alias(name, object_size, args->align, flags, args->ctor); if (s) diff --git a/mm/slub.c b/mm/slub.c index 9f671ec76131..cba188b7e04d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -363,8 +363,10 @@ static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -389,6 +391,14 @@ enum stat_item { CPU_PARTIAL_FREE, /* Refill cpu partial on free */ CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ NR_SLUB_STAT_ITEMS }; @@ -435,6 +445,32 @@ void stat_add(const struct kmem_cache *s, enum stat_item si, int v) #endif } +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + }; + unsigned int size; + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ +}; + /* * The slab lists for all objects. */ @@ -447,6 +483,7 @@ struct kmem_cache_node { atomic_long_t total_objects; struct list_head full; #endif + struct node_barn *barn; }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) @@ -454,6 +491,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) return s->node[node]; } +/* Get the barn of the current cpu's memory node */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + return get_node(s, numa_mem_id())->barn; +} + /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) @@ -470,12 +513,19 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) */ static nodemask_t slab_nodes; -#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). */ static struct workqueue_struct *flushwq; -#endif + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); /******************************************************************** * Core slab cache functions @@ -2473,6 +2523,360 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = kzalloc(struct_size(sheaf, objects, + s->sheaf_capacity), gfp); + + if (unlikely(!sheaf)) + return NULL; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_empty) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full >= MAX_FULL_SHEAVES) { + empty = ERR_PTR(-E2BIG); + } else if (!barn->nr_empty) { + empty = ERR_PTR(-ENOMEM); + } else { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + struct list_head empty_list; + struct list_head full_list; + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + INIT_LIST_HEAD(&empty_list); + INIT_LIST_HEAD(&full_list); + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ @@ -3344,27 +3748,9 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; - -/* - * Flush cpu slab. - * - * Called from CPU work handler with migration disabled. - */ -static void flush_cpu_slab(struct work_struct *w) +static inline void flush_this_cpu_slab(struct kmem_cache *s) { - struct kmem_cache *s; - struct kmem_cache_cpu *c; - struct slub_flush_work *sfw; - - sfw = container_of(w, struct slub_flush_work, work); - - s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); if (c->slab) flush_slab(s, c); @@ -3379,8 +3765,43 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s) return c->slab || slub_percpu_partial(c); } -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); +#else /* CONFIG_SLUB_TINY */ +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline bool has_cpu_slab(int cpu, struct kmem_cache *s) { return false; } +static inline void flush_this_cpu_slab(struct kmem_cache *s) { } +#endif /* CONFIG_SLUB_TINY */ + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->main->size); +} + +/* + * Flush cpu slab. + * + * Called from CPU work handler with migration disabled. + */ +static void flush_cpu_slab(struct work_struct *w) +{ + struct kmem_cache *s; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); + + s = sfw->s; + + if (s->cpu_sheaves) + pcs_flush_all(s); + + flush_this_cpu_slab(s); +} static void flush_all_cpus_locked(struct kmem_cache *s) { @@ -3392,7 +3813,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } @@ -3428,19 +3849,15 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } -#else /* CONFIG_SLUB_TINY */ -static inline void flush_all_cpus_locked(struct kmem_cache *s) { } -static inline void flush_all(struct kmem_cache *s) { } -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } -static inline int slub_cpu_dead(unsigned int cpu) { return 0; } -#endif /* CONFIG_SLUB_TINY */ - /* * Check if the objects in a per cpu structure fit numa * locality expectations. @@ -4190,6 +4607,212 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return memcg_slab_post_alloc_hook(s, lru, flags, size, p); } +/* + * Replace the empty main sheaf with a (at least partially) full sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) +{ + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. + */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp) +{ + struct slub_percpu_sheaves *pcs; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa)) { + if (current->mempolicy) + return NULL; + } +#endif + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[--pcs->main->size]; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + full = barn_replace_empty_sheaf(get_barn(s), pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. + */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + /* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call @@ -4214,7 +4837,11 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + if (s->cpu_sheaves && node == NUMA_NO_NODE) + object = alloc_from_pcs(s, gfpflags); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); @@ -4591,6 +5218,295 @@ slab_empty: discard_slab(s, slab); } +/* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty) +{ + struct node_barn *barn; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + barn = get_barn(s); + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. + */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. + */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + if (!size) + return; + continue; + } + + i++; + } + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -4677,7 +5593,10 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, memcg_slab_free_hook(s, slab, &object, 1); alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (!s->cpu_sheaves || !free_to_pcs(s, object)) do_slab_free(s, slab, object, object, 1, addr); } @@ -5273,6 +6192,15 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!size) return; + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + do { struct detached_freelist df; @@ -5391,7 +6319,7 @@ error: int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - int i; + unsigned int i = 0; if (!size) return 0; @@ -5400,9 +6328,20 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!s)) return 0; - i = __kmem_cache_alloc_bulk(s, flags, size, p); - if (unlikely(i == 0)) - return 0; + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } /* * memcg and kmem_cache debug support and memory initialization. @@ -5412,11 +6351,11 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size))) { return 0; } - return i; + + return size; } EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -5550,7 +6489,7 @@ static inline int calculate_order(unsigned int size) } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -5560,6 +6499,9 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } #ifndef CONFIG_SLUB_TINY @@ -5590,6 +6532,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) } #endif /* CONFIG_SLUB_TINY */ +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -5625,7 +6587,7 @@ static void early_kmem_cache_node_alloc(int node) slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -5641,6 +6603,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -5649,6 +6618,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); #ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); #endif @@ -5661,18 +6632,29 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - - if (!n) + if (!n) { + kfree(barn); return 0; + } + + init_kmem_cache_node(n, barn); - init_kmem_cache_node(n); s->node[node] = n; } return 1; @@ -5929,6 +6911,8 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); if (n->nr_partial || node_nr_slabs(n)) return 1; @@ -6132,6 +7116,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -6211,12 +7198,24 @@ static int slab_mem_going_online_callback(int nid) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. */ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -6224,10 +7223,13 @@ static int slab_mem_going_online_callback(int nid) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -6440,6 +7442,17 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } + #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif @@ -6456,6 +7469,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; + } + err = 0; /* Mutex is not taken during early boot */ @@ -6908,6 +7927,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -7255,8 +8280,10 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -7281,6 +8308,14 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -7311,6 +8346,7 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, &objects_partial_attr.attr, @@ -7342,8 +8378,10 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -7368,6 +8406,14 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, From ec66e0d599520ab414db745544e25d80d0ac5054 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:46 +0200 Subject: [PATCH 15/40] slab: add sheaf support for batching kfree_rcu() operations Extend the sheaf infrastructure for more efficient kfree_rcu() handling. For caches with sheaves, on each cpu maintain a rcu_free sheaf in addition to main and spare sheaves. kfree_rcu() operations will try to put objects on this sheaf. Once full, the sheaf is detached and submitted to call_rcu() with a handler that will try to put it in the barn, or flush to slab pages using bulk free, when the barn is full. Then a new empty sheaf must be obtained to put more objects there. It's possible that no free sheaves are available to use for a new rcu_free sheaf, and the allocation in kfree_rcu() context can only use GFP_NOWAIT and thus may fail. In that case, fall back to the existing kfree_rcu() implementation. Expected advantages: - batching the kfree_rcu() operations, that could eventually replace the existing batching - sheaves can be reused for allocations via barn instead of being flushed to slabs, which is more efficient - this includes cases where only some cpus are allowed to process rcu callbacks (CONFIG_RCU_NOCB_CPU) Possible disadvantage: - objects might be waiting for more than their grace period (it is determined by the last object freed into the sheaf), increasing memory usage - but the existing batching does that too. Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny implementation favors smaller memory footprint over performance. Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT as the contexts where kfree_rcu() is called might not be compatible with taking a barn spinlock or a GFP_NOWAIT allocation of a new sheaf taking a spinlock - the current kfree_rcu() implementation avoids doing that. Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches that have them. This is not a cheap operation, but the barrier usage is rare - currently kmem_cache_destroy() or on module unload. Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to count how many kfree_rcu() used the rcu_free sheaf successfully and how many had to fall back to the existing implementation. Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/slab.h | 3 + mm/slab_common.c | 26 +++++ mm/slub.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 295 insertions(+), 2 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 206987ce44a4..e82e51c44bd0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -435,6 +435,9 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); } +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); +void flush_all_rcu_sheaves(void); + #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \ diff --git a/mm/slab_common.c b/mm/slab_common.c index e2b197e47866..005a4319c06a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1608,6 +1608,27 @@ static void kfree_rcu_work(struct work_struct *work) kvfree_rcu_list(head); } +static bool kfree_rcu_sheaf(void *obj) +{ + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + + if (is_vmalloc_addr(obj)) + return false; + + folio = virt_to_folio(obj); + if (unlikely(!folio_test_slab(folio))) + return false; + + slab = folio_slab(folio); + s = slab->slab_cache; + if (s->cpu_sheaves) + return __kfree_rcu_sheaf(s, obj); + + return false; +} + static bool need_offload_krc(struct kfree_rcu_cpu *krcp) { @@ -1952,6 +1973,9 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) if (!head) might_sleep(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr)) + return; + // Queue the object but don't yet schedule the batch. if (debug_rcu_head_queue(ptr)) { // Probable double kfree_rcu(), just leak. @@ -2026,6 +2050,8 @@ void kvfree_rcu_barrier(void) bool queued; int i, cpu; + flush_all_rcu_sheaves(); + /* * Firstly we detach objects and queue them over an RCU-batch * for all CPUs. Finally queued works are flushed for each CPU. diff --git a/mm/slub.c b/mm/slub.c index cba188b7e04d..fec0cdc7ef37 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -367,6 +367,8 @@ enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ FREE_FROZEN, /* Freeing to frozen slab */ @@ -461,6 +463,7 @@ struct slab_sheaf { struct rcu_head rcu_head; struct list_head barn_list; }; + struct kmem_cache *cache; unsigned int size; void *objects[]; }; @@ -469,6 +472,7 @@ struct slub_percpu_sheaves { local_trylock_t lock; struct slab_sheaf *main; /* never NULL when unlocked */ struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ }; /* @@ -2531,6 +2535,8 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) if (unlikely(!sheaf)) return NULL; + sheaf->cache = s; + stat(s, SHEAF_ALLOC); return sheaf; @@ -2655,6 +2661,43 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) sheaf->size = 0; } +static void __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + i++; + } +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); +} + /* * Caller needs to make sure migration is disabled in order to fully flush * single cpu's sheaves @@ -2667,7 +2710,7 @@ static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) static void pcs_flush_all(struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; - struct slab_sheaf *spare; + struct slab_sheaf *spare, *rcu_free; local_lock(&s->cpu_sheaves->lock); pcs = this_cpu_ptr(s->cpu_sheaves); @@ -2675,6 +2718,9 @@ static void pcs_flush_all(struct kmem_cache *s) spare = pcs->spare; pcs->spare = NULL; + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + local_unlock(&s->cpu_sheaves->lock); if (spare) { @@ -2682,6 +2728,9 @@ static void pcs_flush_all(struct kmem_cache *s) free_empty_sheaf(s, spare); } + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + sheaf_flush_main(s); } @@ -2698,6 +2747,11 @@ static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) free_empty_sheaf(s, pcs->spare); pcs->spare = NULL; } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } } static void pcs_destroy(struct kmem_cache *s) @@ -2723,6 +2777,7 @@ static void pcs_destroy(struct kmem_cache *s) */ WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); if (!WARN_ON(pcs->main->size)) { free_empty_sheaf(s, pcs->main); @@ -3780,7 +3835,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) pcs = per_cpu_ptr(s->cpu_sheaves, cpu); - return (pcs->spare || pcs->main->size); + return (pcs->spare || pcs->rcu_free || pcs->main->size); } /* @@ -3840,6 +3895,74 @@ static void flush_all(struct kmem_cache *s) cpus_read_unlock(); } +static void flush_rcu_sheaf(struct work_struct *w) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_free; + struct slub_flush_work *sfw; + struct kmem_cache *s; + + sfw = container_of(w, struct slub_flush_work, work); + s = sfw->s; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); +} + + +/* needed for kvfree_rcu_barrier() */ +void flush_all_rcu_sheaves(void) +{ + struct slub_flush_work *sfw; + struct kmem_cache *s; + unsigned int cpu; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ + + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); + } + + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + + rcu_barrier(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. @@ -5413,6 +5536,138 @@ bool free_to_pcs(struct kmem_cache *s, void *object) return true; } +static void rcu_free_sheaf(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct node_barn *barn; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + + s = sheaf->cache; + + /* + * This may remove some objects due to slab_free_hook() returning false, + * so that the sheaf might no longer be completely full. But it's easier + * to handle it as full (unless it became completely empty), as the code + * handles it fine. The only downside is that sheaf will serve fewer + * allocations when reused. It only happens due to debugging, which is a + * performance hit anyway. + */ + __rcu_free_sheaf_prepare(s, sheaf); + + barn = get_node(s, numa_mem_id())->barn; + + /* due to slab_free_hook() */ + if (unlikely(sheaf->size == 0)) + goto empty; + + /* + * Checking nr_full/nr_empty outside lock avoids contention in case the + * barn is at the respective limit. Due to the race we might go over the + * limit but that should be rare and harmless. + */ + + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { + stat(s, BARN_PUT); + barn_put_full_sheaf(barn, sheaf); + return; + } + + stat(s, BARN_PUT_FAIL); + sheaf_flush_unused(s, sheaf); + +empty: + if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { + barn_put_empty_sheaf(barn, sheaf); + return; + } + + free_empty_sheaf(s, sheaf); +} + +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_sheaf; + + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fail; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(!pcs->rcu_free)) { + + struct slab_sheaf *empty; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size == 0) { + pcs->rcu_free = pcs->spare; + pcs->spare = NULL; + goto do_free; + } + + barn = get_barn(s); + + empty = barn_get_empty_sheaf(barn); + + if (empty) { + pcs->rcu_free = empty; + goto do_free; + } + + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + + if (!empty) + goto fail; + + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + goto fail; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->rcu_free)) + barn_put_empty_sheaf(barn, empty); + else + pcs->rcu_free = empty; + } + +do_free: + + rcu_sheaf = pcs->rcu_free; + + /* + * Since we flush immediately when size reaches capacity, we never reach + * this with size already at capacity, so no OOB write is possible. + */ + rcu_sheaf->objects[rcu_sheaf->size++] = obj; + + if (likely(rcu_sheaf->size < s->sheaf_capacity)) + rcu_sheaf = NULL; + else + pcs->rcu_free = NULL; + + /* + * we flush before local_unlock to make sure a racing + * flush_all_rcu_sheaves() doesn't miss this sheaf + */ + if (rcu_sheaf) + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_RCU_SHEAF); + return true; + +fail: + stat(s, FREE_RCU_SHEAF_FAIL); + return false; +} + /* * Bulk free objects to the percpu sheaves. * Unlike free_to_pcs() this includes the calls to all necessary hooks @@ -6909,6 +7164,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s) struct kmem_cache_node *n; flush_all_cpus_locked(s); + + /* we might have rcu sheaves in flight */ + if (s->cpu_sheaves) + rcu_barrier(); + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { if (n->barn) @@ -8284,6 +8544,8 @@ STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); STAT_ATTR(FREE_PCS, free_cpu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -8382,6 +8644,8 @@ static struct attribute *slab_attrs[] = { &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, &free_cpu_sheaf_attr.attr, + &free_rcu_sheaf_attr.attr, + &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, From a8541957e7b18921dbf5d887780e8eaeb04916cf Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Mon, 4 Aug 2025 20:56:57 +0800 Subject: [PATCH 16/40] maple_tree: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Link: https://lkml.kernel.org/r/20250804125657.482109-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Reviewed-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b4ee2d29d7a9..38fb68c08291 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1344,11 +1344,11 @@ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) * @mas: The maple state * @count: The number of nodes needed * - * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. + * Note: Uses GFP_NOWAIT for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { - return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); + return mas_node_count_gfp(mas, count, GFP_NOWAIT); } /* From 3c1ea5c5019ff197aca7e886a3a240c38f6c6f0d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:47 +0200 Subject: [PATCH 17/40] slab: sheaf prefilling for guaranteed allocations Add functions for efficient guaranteed allocations e.g. in a critical section that cannot sleep, when the exact number of allocations is not known beforehand, but an upper limit can be calculated. kmem_cache_prefill_sheaf() returns a sheaf containing at least given number of objects. kmem_cache_alloc_from_sheaf() will allocate an object from the sheaf and is guaranteed not to fail until depleted. kmem_cache_return_sheaf() is for giving the sheaf back to the slab allocator after the critical section. This will also attempt to refill it to cache's sheaf capacity for better efficiency of sheaves handling, but it's not stricly necessary to succeed. kmem_cache_refill_sheaf() can be used to refill a previously obtained sheaf to requested size. If the current size is sufficient, it does nothing. If the requested size exceeds cache's sheaf_capacity and the sheaf's current capacity, the sheaf will be replaced with a new one, hence the indirect pointer parameter. kmem_cache_sheaf_size() can be used to query the current size. The implementation supports requesting sizes that exceed cache's sheaf_capacity, but it is not efficient - such "oversize" sheaves are allocated fresh in kmem_cache_prefill_sheaf() and flushed and freed immediately by kmem_cache_return_sheaf(). kmem_cache_refill_sheaf() might be especially ineffective when replacing a sheaf with a new one of a larger capacity. It is therefore better to size cache's sheaf_capacity accordingly to make oversize sheaves exceptional. CONFIG_SLUB_STATS counters are added for sheaf prefill and return operations. A prefill or return is considered _fast when it is able to grab or return a percpu spare sheaf (even if the sheaf needs a refill to satisfy the request, as those should amortize over time), and _slow otherwise (when the barn or even sheaf allocation/freeing has to be involved). sheaf_prefill_oversize is provided to determine how many prefills were oversize (counter for oversize returns is not necessary as all oversize refills result in oversize returns). When slub_debug is enabled for a cache with sheaves, no percpu sheaves exist for it, but the prefill functionality is still provided simply by all prefilled sheaves becoming oversize. If percpu sheaves are not created for a cache due to not passing the sheaf_capacity argument on cache creation, the prefills also work through oversize sheaves, but there's a WARN_ON_ONCE() to indicate the omission. Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 16 +++ mm/slub.c | 263 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 279 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index 49acbcdc6696..680193356ac7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -829,6 +829,22 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void *kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *cachep, gfp_t gfp, + struct slab_sheaf *sheaf) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_from_sheaf(...) \ + alloc_hooks(kmem_cache_alloc_from_sheaf_noprof(__VA_ARGS__)) + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf); + /* * These macros allow declaring a kmem_buckets * parameter alongside size, which * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call diff --git a/mm/slub.c b/mm/slub.c index fec0cdc7ef37..4bd67d5d5ff5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -401,6 +401,11 @@ enum stat_item { BARN_GET_FAIL, /* Failed to get full sheaf from barn */ BARN_PUT, /* Put full sheaf to barn */ BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ NR_SLUB_STAT_ITEMS }; @@ -462,6 +467,8 @@ struct slab_sheaf { union { struct rcu_head rcu_head; struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; }; struct kmem_cache *cache; unsigned int size; @@ -2838,6 +2845,30 @@ static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf spin_unlock_irqrestore(&barn->lock, flags); } +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + /* * If a full sheaf is available, return it and put the supplied empty one to * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't @@ -5036,6 +5067,228 @@ void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int nod } EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); +/* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. + */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + stat(s, SHEAF_PREFILL_SLOW); + sheaf = barn_get_full_or_empty_sheaf(get_barn(s)); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf && sheaf->size < size) { + if (refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + if (sheaf) + sheaf->capacity = s->sheaf_capacity; + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely(sheaf->capacity != s->sheaf_capacity)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? + */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return refill_sheaf(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); + + return ret; +} + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} /* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to @@ -8578,6 +8831,11 @@ STAT_ATTR(BARN_GET, barn_get); STAT_ATTR(BARN_GET_FAIL, barn_get_fail); STAT_ATTR(BARN_PUT, barn_put); STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); #endif /* CONFIG_SLUB_STATS */ #ifdef CONFIG_KFENCE @@ -8678,6 +8936,11 @@ static struct attribute *slab_attrs[] = { &barn_get_fail_attr.attr, &barn_put_attr.attr, &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, From da577f1fcbdd0317b7c5089c19d5d61ec194f0e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 26 Aug 2025 11:28:24 +0100 Subject: [PATCH 18/40] tools/testing/vma: clean up stubs in vma_internal.h We do not need to references arguments just to avoid compiler warnings, the warning in question does not arise here, so remove all of the instances of '(void)xxx' introduced purely to avoid this warning. As reported by WagYuli in the referenced mail, GCC 8.3 and before will have issues compiling this file if parameter names are not provided, so ensure these are always provided. Finally, perform a trivial fix up of kmem_cache_alloc() which technically has parameters in the incorrect order (as reported by Vlastimil Babka off-list). Link: https://lkml.kernel.org/r/20250826102824.22730-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reported-by: WangYuli Closes: https://lore.kernel.org/linux-mm/EFCEBE7E301589DE+20250729084700.208767-1-wangyuli@uniontech.com/ Reported-by: Vlastimil Babka Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: WangYuli Signed-off-by: Andrew Morton Signed-off-by: Vlastimil Babka --- tools/testing/vma/vma_internal.h | 167 +++++++++++-------------------- 1 file changed, 57 insertions(+), 110 deletions(-) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 3639aa8dd2b0..f8cf5b184d5b 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -676,9 +676,7 @@ static inline struct kmem_cache *__kmem_cache_create(const char *name, static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - (void)gfpflags; - - return calloc(s->object_size, 1); + return calloc(1, s->object_size); } static inline void kmem_cache_free(struct kmem_cache *s, void *x) @@ -842,11 +840,11 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -static inline void fput(struct file *) +static inline void fput(struct file *file) { } -static inline void mpol_put(struct mempolicy *) +static inline void mpol_put(struct mempolicy *pol) { } @@ -854,15 +852,15 @@ static inline void lru_add_drain(void) { } -static inline void tlb_gather_mmu(struct mmu_gather *, struct mm_struct *) +static inline void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) { } -static inline void update_hiwater_rss(struct mm_struct *) +static inline void update_hiwater_rss(struct mm_struct *mm) { } -static inline void update_hiwater_vm(struct mm_struct *) +static inline void update_hiwater_vm(struct mm_struct *mm) { } @@ -871,36 +869,23 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, unsigned long end_addr, unsigned long tree_end, bool mm_wr_locked) { - (void)tlb; - (void)mas; - (void)vma; - (void)start_addr; - (void)end_addr; - (void)tree_end; - (void)mm_wr_locked; } static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { - (void)tlb; - (void)mas; - (void)vma; - (void)floor; - (void)ceiling; - (void)mm_wr_locked; } -static inline void mapping_unmap_writable(struct address_space *) +static inline void mapping_unmap_writable(struct address_space *mapping) { } -static inline void flush_dcache_mmap_lock(struct address_space *) +static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } -static inline void tlb_finish_mmu(struct mmu_gather *) +static inline void tlb_finish_mmu(struct mmu_gather *tlb) { } @@ -909,7 +894,7 @@ static inline struct file *get_file(struct file *f) return f; } -static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) +static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } @@ -936,10 +921,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long end, struct vm_area_struct *next) { - (void)vma; - (void)start; - (void)end; - (void)next; } static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} @@ -959,51 +940,48 @@ static inline void vm_acct_memory(long pages) { } -static inline void vma_interval_tree_insert(struct vm_area_struct *, - struct rb_root_cached *) +static inline void vma_interval_tree_insert(struct vm_area_struct *vma, + struct rb_root_cached *rb) { } -static inline void vma_interval_tree_remove(struct vm_area_struct *, - struct rb_root_cached *) +static inline void vma_interval_tree_remove(struct vm_area_struct *vma, + struct rb_root_cached *rb) { } -static inline void flush_dcache_mmap_unlock(struct address_space *) +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } -static inline void anon_vma_interval_tree_insert(struct anon_vma_chain*, - struct rb_root_cached *) +static inline void anon_vma_interval_tree_insert(struct anon_vma_chain *avc, + struct rb_root_cached *rb) { } -static inline void anon_vma_interval_tree_remove(struct anon_vma_chain*, - struct rb_root_cached *) +static inline void anon_vma_interval_tree_remove(struct anon_vma_chain *avc, + struct rb_root_cached *rb) { } -static inline void uprobe_mmap(struct vm_area_struct *) +static inline void uprobe_mmap(struct vm_area_struct *vma) { } static inline void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - (void)vma; - (void)start; - (void)end; } -static inline void i_mmap_lock_write(struct address_space *) +static inline void i_mmap_lock_write(struct address_space *mapping) { } -static inline void anon_vma_lock_write(struct anon_vma *) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { } -static inline void vma_assert_write_locked(struct vm_area_struct *) +static inline void vma_assert_write_locked(struct vm_area_struct *vma) { } @@ -1013,16 +991,16 @@ static inline void unlink_anon_vmas(struct vm_area_struct *vma) vma->anon_vma->was_unlinked = true; } -static inline void anon_vma_unlock_write(struct anon_vma *) +static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { } -static inline void i_mmap_unlock_write(struct address_space *) +static inline void i_mmap_unlock_write(struct address_space *mapping) { } -static inline void anon_vma_merge(struct vm_area_struct *, - struct vm_area_struct *) +static inline void anon_vma_merge(struct vm_area_struct *vma, + struct vm_area_struct *next) { } @@ -1031,27 +1009,22 @@ static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long end, struct list_head *unmaps) { - (void)vma; - (void)start; - (void)end; - (void)unmaps; - return 0; } -static inline void mmap_write_downgrade(struct mm_struct *) +static inline void mmap_write_downgrade(struct mm_struct *mm) { } -static inline void mmap_read_unlock(struct mm_struct *) +static inline void mmap_read_unlock(struct mm_struct *mm) { } -static inline void mmap_write_unlock(struct mm_struct *) +static inline void mmap_write_unlock(struct mm_struct *mm) { } -static inline int mmap_write_lock_killable(struct mm_struct *) +static inline int mmap_write_lock_killable(struct mm_struct *mm) { return 0; } @@ -1060,10 +1033,6 @@ static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end) { - (void)mm; - (void)start; - (void)end; - return true; } @@ -1071,16 +1040,13 @@ static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - (void)mm; - (void)start; - (void)end; } -static inline void mmap_assert_locked(struct mm_struct *) +static inline void mmap_assert_locked(struct mm_struct *mm) { } -static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } @@ -1088,63 +1054,62 @@ static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) static inline void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - (void)vma; - (void)vm_flags; } -static inline bool mapping_can_writeback(struct address_space *) +static inline bool mapping_can_writeback(struct address_space *mapping) { return true; } -static inline bool is_vm_hugetlb_page(struct vm_area_struct *) +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return false; } -static inline bool vma_soft_dirty_enabled(struct vm_area_struct *) +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { return false; } -static inline bool userfaultfd_wp(struct vm_area_struct *) +static inline bool userfaultfd_wp(struct vm_area_struct *vma) { return false; } -static inline void mmap_assert_write_locked(struct mm_struct *) +static inline void mmap_assert_write_locked(struct mm_struct *mm) { } -static inline void mutex_lock(struct mutex *) +static inline void mutex_lock(struct mutex *lock) { } -static inline void mutex_unlock(struct mutex *) +static inline void mutex_unlock(struct mutex *lock) { } -static inline bool mutex_is_locked(struct mutex *) +static inline bool mutex_is_locked(struct mutex *lock) { return true; } -static inline bool signal_pending(void *) +static inline bool signal_pending(void *p) { return false; } -static inline bool is_file_hugepages(struct file *) +static inline bool is_file_hugepages(struct file *file) { return false; } -static inline int security_vm_enough_memory_mm(struct mm_struct *, long) +static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { return 0; } -static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long) +static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, + unsigned long npages) { return true; } @@ -1169,7 +1134,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, vma->__vm_flags &= ~flags; } -static inline int shmem_zero_setup(struct vm_area_struct *) +static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; } @@ -1179,20 +1144,20 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) vma->vm_ops = NULL; } -static inline void ksm_add_vma(struct vm_area_struct *) +static inline void ksm_add_vma(struct vm_area_struct *vma) { } -static inline void perf_event_mmap(struct vm_area_struct *) +static inline void perf_event_mmap(struct vm_area_struct *vma) { } -static inline bool vma_is_dax(struct vm_area_struct *) +static inline bool vma_is_dax(struct vm_area_struct *vma) { return false; } -static inline struct vm_area_struct *get_gate_vma(struct mm_struct *) +static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } @@ -1217,16 +1182,16 @@ static inline void vma_set_page_prot(struct vm_area_struct *vma) WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } -static inline bool arch_validate_flags(vm_flags_t) +static inline bool arch_validate_flags(vm_flags_t flags) { return true; } -static inline void vma_close(struct vm_area_struct *) +static inline void vma_close(struct vm_area_struct *vma) { } -static inline int mmap_file(struct file *, struct vm_area_struct *) +static inline int mmap_file(struct file *file, struct vm_area_struct *vma) { return 0; } @@ -1388,8 +1353,6 @@ static inline int mapping_map_writable(struct address_space *mapping) static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) { - (void)pmc; - return 0; } @@ -1397,51 +1360,36 @@ static inline void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { - (void)tlb; - (void)addr; - (void)end; - (void)floor; - (void)ceiling; } static inline int ksm_execve(struct mm_struct *mm) { - (void)mm; - return 0; } static inline void ksm_exit(struct mm_struct *mm) { - (void)mm; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { - (void)vma; - (void)reset_refcnt; } static inline void vma_numab_state_init(struct vm_area_struct *vma) { - (void)vma; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { - (void)vma; } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { - (void)orig_vma; - (void)new_vma; } static inline void free_anon_vma_name(struct vm_area_struct *vma) { - (void)vma; } /* Declared in vma.h. */ @@ -1495,7 +1443,6 @@ static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc) static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) { - (void)vma; } static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) @@ -1506,13 +1453,13 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file) fput(file); } -static inline bool shmem_file(struct file *) +static inline bool shmem_file(struct file *file) { return false; } -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *, const struct file *, - vm_flags_t vm_flags) +static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, + const struct file *file, vm_flags_t vm_flags) { return vm_flags; } From 08294229210916c3b179186f3efa3b9c62a04678 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:48 +0200 Subject: [PATCH 19/40] slab: determine barn status racily outside of lock The possibility of many barn operations is determined by the current number of full or empty sheaves. Taking the barn->lock just to find out that e.g. there are no empty sheaves results in unnecessary overhead and lock contention. Thus perform these checks outside of the lock with a data_race() annotated variable read and fail quickly without taking the lock. Checks for sheaf availability that racily succeed have to be obviously repeated under the lock for correctness, but we can skip repeating checks if there are too many sheaves on the given list as the limits don't need to be strict. Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 4bd67d5d5ff5..596f375870af 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2801,9 +2801,12 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) struct slab_sheaf *empty = NULL; unsigned long flags; + if (!data_race(barn->nr_empty)) + return NULL; + spin_lock_irqsave(&barn->lock, flags); - if (barn->nr_empty) { + if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, barn_list); list_del(&empty->barn_list); @@ -2850,6 +2853,9 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) struct slab_sheaf *sheaf = NULL; unsigned long flags; + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + spin_lock_irqsave(&barn->lock, flags); if (barn->nr_full) { @@ -2880,9 +2886,12 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) struct slab_sheaf *full = NULL; unsigned long flags; + if (!data_race(barn->nr_full)) + return NULL; + spin_lock_irqsave(&barn->lock, flags); - if (barn->nr_full) { + if (likely(barn->nr_full)) { full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, barn_list); list_del(&full->barn_list); @@ -2906,19 +2915,23 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) struct slab_sheaf *empty; unsigned long flags; + /* we don't repeat this check under barn->lock as it's not critical */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) + return ERR_PTR(-E2BIG); + if (!data_race(barn->nr_empty)) + return ERR_PTR(-ENOMEM); + spin_lock_irqsave(&barn->lock, flags); - if (barn->nr_full >= MAX_FULL_SHEAVES) { - empty = ERR_PTR(-E2BIG); - } else if (!barn->nr_empty) { - empty = ERR_PTR(-ENOMEM); - } else { + if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, barn_list); list_del(&empty->barn_list); list_add(&full->barn_list, &barn->sheaves_full); barn->nr_empty--; barn->nr_full++; + } else { + empty = ERR_PTR(-ENOMEM); } spin_unlock_irqrestore(&barn->lock, flags); From e3852a1213ffc6fbd89c768e7c54a360652648b8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 28 Aug 2025 10:53:45 -0400 Subject: [PATCH 20/40] maple_tree: Drop bulk insert support Bulk insert mode was added to facilitate forking faster, but forking now uses __mt_dup() to duplicate the tree. The addition of sheaves has made the bulk allocations difficult to maintain - since the expected entries would preallocate into the maple state. A big part of the maple state node allocation was the ability to push nodes back onto the state for later use, which was essential to the bulk insert algorithm. Remove mas_expected_entries() and mas_destroy_rebalance() functions as well as the MA_STATE_BULK and MA_STATE_REBALANCE maple state flags since there are no users anymore. Drop the associated testing as well. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 270 +------------------------------ lib/test_maple_tree.c | 137 ---------------- tools/testing/radix-tree/maple.c | 36 ----- 3 files changed, 4 insertions(+), 439 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 38fb68c08291..4f0e30b57b0c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -83,13 +83,9 @@ /* * Maple state flags - * * MA_STATE_BULK - Bulk insert mode - * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ -#define MA_STATE_BULK 1 -#define MA_STATE_REBALANCE 2 -#define MA_STATE_PREALLOC 4 +#define MA_STATE_PREALLOC 1 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) @@ -1031,24 +1027,6 @@ static inline void mas_descend(struct ma_state *mas) mas->node = mas_slot(mas, slots, mas->offset); } -/* - * mte_set_gap() - Set a maple node gap. - * @mn: The encoded maple node - * @gap: The offset of the gap to set - * @val: The gap value - */ -static inline void mte_set_gap(const struct maple_enode *mn, - unsigned char gap, unsigned long val) -{ - switch (mte_node_type(mn)) { - default: - break; - case maple_arange_64: - mte_to_node(mn)->ma64.gap[gap] = val; - break; - } -} - /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state @@ -1878,21 +1856,7 @@ static inline int mab_calc_split(struct ma_state *mas, * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. - */ - if (unlikely((mas->mas_flags & MA_STATE_BULK))) { - *mid_split = 0; - split = b_end - mt_min_slots[bn->type]; - - if (!ma_is_leaf(bn->type)) - return split; - - mas->mas_flags |= MA_STATE_REBALANCE; - if (!bn->slot[split]) - split--; - return split; - } - - /* + * * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set @@ -2039,27 +2003,6 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, } } -/* - * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. - * @mas: The maple state - * @end: The maple node end - * @mt: The maple node type - */ -static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, - enum maple_type mt) -{ - if (!(mas->mas_flags & MA_STATE_BULK)) - return; - - if (mte_is_root(mas->node)) - return; - - if (end > mt_min_slots[mt]) { - mas->mas_flags &= ~MA_STATE_REBALANCE; - return; - } -} - /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. @@ -2109,9 +2052,6 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { - if (piv == ULONG_MAX) - mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); - if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); @@ -3011,126 +2951,6 @@ static inline void mas_rebalance(struct ma_state *mas, return mas_spanning_rebalance(mas, &mast, empty_count); } -/* - * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple - * state. - * @mas: The maple state - * @end: The end of the left-most node. - * - * During a mass-insert event (such as forking), it may be necessary to - * rebalance the left-most node when it is not sufficient. - */ -static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) -{ - enum maple_type mt = mte_node_type(mas->node); - struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; - struct maple_enode *eparent, *old_eparent; - unsigned char offset, tmp, split = mt_slots[mt] / 2; - void __rcu **l_slots, **slots; - unsigned long *l_pivs, *pivs, gap; - bool in_rcu = mt_in_rcu(mas->tree); - unsigned char new_height = mas_mt_height(mas); - - MA_STATE(l_mas, mas->tree, mas->index, mas->last); - - l_mas = *mas; - mas_prev_sibling(&l_mas); - - /* set up node. */ - if (in_rcu) { - newnode = mas_pop_node(mas); - } else { - newnode = &reuse; - } - - node = mas_mn(mas); - newnode->parent = node->parent; - slots = ma_slots(newnode, mt); - pivs = ma_pivots(newnode, mt); - left = mas_mn(&l_mas); - l_slots = ma_slots(left, mt); - l_pivs = ma_pivots(left, mt); - if (!l_slots[split]) - split++; - tmp = mas_data_end(&l_mas) - split; - - memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); - memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); - pivs[tmp] = l_mas.max; - memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); - memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); - - l_mas.max = l_pivs[split]; - mas->min = l_mas.max + 1; - old_eparent = mt_mk_node(mte_parent(l_mas.node), - mas_parent_type(&l_mas, l_mas.node)); - tmp += end; - if (!in_rcu) { - unsigned char max_p = mt_pivots[mt]; - unsigned char max_s = mt_slots[mt]; - - if (tmp < max_p) - memset(pivs + tmp, 0, - sizeof(unsigned long) * (max_p - tmp)); - - if (tmp < mt_slots[mt]) - memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); - - memcpy(node, newnode, sizeof(struct maple_node)); - ma_set_meta(node, mt, 0, tmp - 1); - mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), - l_pivs[split]); - - /* Remove data from l_pivs. */ - tmp = split + 1; - memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); - memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); - ma_set_meta(left, mt, 0, split); - eparent = old_eparent; - - goto done; - } - - /* RCU requires replacing both l_mas, mas, and parent. */ - mas->node = mt_mk_node(newnode, mt); - ma_set_meta(newnode, mt, 0, tmp); - - new_left = mas_pop_node(mas); - new_left->parent = left->parent; - mt = mte_node_type(l_mas.node); - slots = ma_slots(new_left, mt); - pivs = ma_pivots(new_left, mt); - memcpy(slots, l_slots, sizeof(void *) * split); - memcpy(pivs, l_pivs, sizeof(unsigned long) * split); - ma_set_meta(new_left, mt, 0, split); - l_mas.node = mt_mk_node(new_left, mt); - - /* replace parent. */ - offset = mte_parent_slot(mas->node); - mt = mas_parent_type(&l_mas, l_mas.node); - parent = mas_pop_node(mas); - slots = ma_slots(parent, mt); - pivs = ma_pivots(parent, mt); - memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); - rcu_assign_pointer(slots[offset], mas->node); - rcu_assign_pointer(slots[offset - 1], l_mas.node); - pivs[offset - 1] = l_mas.max; - eparent = mt_mk_node(parent, mt); -done: - gap = mas_leaf_max_gap(mas); - mte_set_gap(eparent, mte_parent_slot(mas->node), gap); - gap = mas_leaf_max_gap(&l_mas); - mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); - mas_ascend(mas); - - if (in_rcu) { - mas_replace_node(mas, old_eparent, new_height); - mas_adopt_children(mas, mas->node); - } - - mas_update_gap(mas); -} - /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state @@ -3837,8 +3657,6 @@ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ - else if (unlikely(wr_mas->r_max == ULONG_MAX)) - mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { @@ -4255,7 +4073,7 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { - if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) + if (!mte_is_root(mas->node)) return wr_rebalance; return wr_node_store; } @@ -5562,25 +5380,7 @@ void mas_destroy(struct ma_state *mas) struct maple_alloc *node; unsigned long total; - /* - * When using mas_for_each() to insert an expected number of elements, - * it is possible that the number inserted is less than the expected - * number. To fix an invalid final node, a check is performed here to - * rebalance the previous node with the final node. - */ - if (mas->mas_flags & MA_STATE_REBALANCE) { - unsigned char end; - if (mas_is_err(mas)) - mas_reset(mas); - mas_start(mas); - mtree_range_walk(mas); - end = mas->end + 1; - if (end < mt_min_slot_count(mas->node) - 1) - mas_destroy_rebalance(mas, end); - - mas->mas_flags &= ~MA_STATE_REBALANCE; - } - mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); + mas->mas_flags &= ~MA_STATE_PREALLOC; total = mas_allocated(mas); while (total) { @@ -5600,68 +5400,6 @@ void mas_destroy(struct ma_state *mas) } EXPORT_SYMBOL_GPL(mas_destroy); -/* - * mas_expected_entries() - Set the expected number of entries that will be inserted. - * @mas: The maple state - * @nr_entries: The number of expected entries. - * - * This will attempt to pre-allocate enough nodes to store the expected number - * of entries. The allocations will occur using the bulk allocator interface - * for speed. Please call mas_destroy() on the @mas after inserting the entries - * to ensure any unused nodes are freed. - * - * Return: 0 on success, -ENOMEM if memory could not be allocated. - */ -int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) -{ - int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; - struct maple_enode *enode = mas->node; - int nr_nodes; - int ret; - - /* - * Sometimes it is necessary to duplicate a tree to a new tree, such as - * forking a process and duplicating the VMAs from one tree to a new - * tree. When such a situation arises, it is known that the new tree is - * not going to be used until the entire tree is populated. For - * performance reasons, it is best to use a bulk load with RCU disabled. - * This allows for optimistic splitting that favours the left and reuse - * of nodes during the operation. - */ - - /* Optimize splitting for bulk insert in-order */ - mas->mas_flags |= MA_STATE_BULK; - - /* - * Avoid overflow, assume a gap between each entry and a trailing null. - * If this is wrong, it just means allocation can happen during - * insertion of entries. - */ - nr_nodes = max(nr_entries, nr_entries * 2 + 1); - if (!mt_is_alloc(mas->tree)) - nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; - - /* Leaves; reduce slots to keep space for expansion */ - nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); - /* Internal nodes */ - nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); - /* Add working room for split (2 nodes) + new parents */ - mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); - - /* Detect if allocations run out */ - mas->mas_flags |= MA_STATE_PREALLOC; - - if (!mas_is_err(mas)) - return 0; - - ret = xa_err(mas->node); - mas->node = enode; - mas_destroy(mas); - return ret; - -} -EXPORT_SYMBOL_GPL(mas_expected_entries); - static void mas_may_activate(struct ma_state *mas) { if (!mas->node) { diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index cb3936595b0d..14fbbee32046 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2746,139 +2746,6 @@ static noinline void __init check_fuzzer(struct maple_tree *mt) mtree_test_erase(mt, ULONG_MAX - 10); } -/* duplicate the tree with a specific gap */ -static noinline void __init check_dup_gaps(struct maple_tree *mt, - unsigned long nr_entries, bool zero_start, - unsigned long gap) -{ - unsigned long i = 0; - struct maple_tree newmt; - int ret; - void *tmp; - MA_STATE(mas, mt, 0, 0); - MA_STATE(newmas, &newmt, 0, 0); - struct rw_semaphore newmt_lock; - - init_rwsem(&newmt_lock); - mt_set_external_lock(&newmt, &newmt_lock); - - if (!zero_start) - i = 1; - - mt_zero_nr_tallocated(); - for (; i <= nr_entries; i++) - mtree_store_range(mt, i*10, (i+1)*10 - gap, - xa_mk_value(i), GFP_KERNEL); - - mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN); - mt_set_non_kernel(99999); - down_write(&newmt_lock); - ret = mas_expected_entries(&newmas, nr_entries); - mt_set_non_kernel(0); - MT_BUG_ON(mt, ret != 0); - - rcu_read_lock(); - mas_for_each(&mas, tmp, ULONG_MAX) { - newmas.index = mas.index; - newmas.last = mas.last; - mas_store(&newmas, tmp); - } - rcu_read_unlock(); - mas_destroy(&newmas); - - __mt_destroy(&newmt); - up_write(&newmt_lock); -} - -/* Duplicate many sizes of trees. Mainly to test expected entry values */ -static noinline void __init check_dup(struct maple_tree *mt) -{ - int i; - int big_start = 100010; - - /* Check with a value at zero */ - for (i = 10; i < 1000; i++) { - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - check_dup_gaps(mt, i, true, 5); - mtree_destroy(mt); - rcu_barrier(); - } - - cond_resched(); - mt_cache_shrink(); - /* Check with a value at zero, no gap */ - for (i = 1000; i < 2000; i++) { - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - check_dup_gaps(mt, i, true, 0); - mtree_destroy(mt); - rcu_barrier(); - } - - cond_resched(); - mt_cache_shrink(); - /* Check with a value at zero and unreasonably large */ - for (i = big_start; i < big_start + 10; i++) { - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - check_dup_gaps(mt, i, true, 5); - mtree_destroy(mt); - rcu_barrier(); - } - - cond_resched(); - mt_cache_shrink(); - /* Small to medium size not starting at zero*/ - for (i = 200; i < 1000; i++) { - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - check_dup_gaps(mt, i, false, 5); - mtree_destroy(mt); - rcu_barrier(); - } - - cond_resched(); - mt_cache_shrink(); - /* Unreasonably large not starting at zero*/ - for (i = big_start; i < big_start + 10; i++) { - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - check_dup_gaps(mt, i, false, 5); - mtree_destroy(mt); - rcu_barrier(); - cond_resched(); - mt_cache_shrink(); - } - - /* Check non-allocation tree not starting at zero */ - for (i = 1500; i < 3000; i++) { - mt_init_flags(mt, 0); - check_dup_gaps(mt, i, false, 5); - mtree_destroy(mt); - rcu_barrier(); - cond_resched(); - if (i % 2 == 0) - mt_cache_shrink(); - } - - mt_cache_shrink(); - /* Check non-allocation tree starting at zero */ - for (i = 200; i < 1000; i++) { - mt_init_flags(mt, 0); - check_dup_gaps(mt, i, true, 5); - mtree_destroy(mt); - rcu_barrier(); - cond_resched(); - } - - mt_cache_shrink(); - /* Unreasonably large */ - for (i = big_start + 5; i < big_start + 10; i++) { - mt_init_flags(mt, 0); - check_dup_gaps(mt, i, true, 5); - mtree_destroy(mt); - rcu_barrier(); - mt_cache_shrink(); - cond_resched(); - } -} - static noinline void __init check_bnode_min_spanning(struct maple_tree *mt) { int i = 50; @@ -4077,10 +3944,6 @@ static int __init maple_tree_seed(void) check_fuzzer(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); - check_dup(&tree); - mtree_destroy(&tree); - mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_bnode_min_spanning(&tree); mtree_destroy(&tree); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 172700fb7784..c0543060dae2 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35455,17 +35455,6 @@ static void check_dfs_preorder(struct maple_tree *mt) MT_BUG_ON(mt, count != e); mtree_destroy(mt); - mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); - mas_reset(&mas); - mt_zero_nr_tallocated(); - mt_set_non_kernel(200); - mas_expected_entries(&mas, max); - for (count = 0; count <= max; count++) { - mas.index = mas.last = count; - mas_store(&mas, xa_mk_value(count)); - MT_BUG_ON(mt, mas_is_err(&mas)); - } - mas_destroy(&mas); rcu_barrier(); /* * pr_info(" ->seq test of 0-%lu %luK in %d active (%d total)\n", @@ -36454,27 +36443,6 @@ static inline int check_vma_modification(struct maple_tree *mt) return 0; } -/* - * test to check that bulk stores do not use wr_rebalance as the store - * type. - */ -static inline void check_bulk_rebalance(struct maple_tree *mt) -{ - MA_STATE(mas, mt, ULONG_MAX, ULONG_MAX); - int max = 10; - - build_full_tree(mt, 0, 2); - - /* erase every entry in the tree */ - do { - /* set up bulk store mode */ - mas_expected_entries(&mas, max); - mas_erase(&mas); - MT_BUG_ON(mt, mas.store_type == wr_rebalance); - } while (mas_prev(&mas, 0) != NULL); - - mas_destroy(&mas); -} void farmer_tests(void) { @@ -36487,10 +36455,6 @@ void farmer_tests(void) check_vma_modification(&tree); mtree_destroy(&tree); - mt_init(&tree); - check_bulk_rebalance(&tree); - mtree_destroy(&tree); - tree.ma_root = xa_mk_value(0); mt_dump(&tree, mt_dump_dec); From 989b09b73978a0b3dbee9ef343ab108742b1cc9f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:49 +0200 Subject: [PATCH 21/40] slab: skip percpu sheaves for remote object freeing Since we don't control the NUMA locality of objects in percpu sheaves, allocations with node restrictions bypass them. Allocations without restrictions may however still expect to get local objects with high probability, and the introduction of sheaves can decrease it due to freed object from a remote node ending up in percpu sheaves. The fraction of such remote frees seems low (5% on an 8-node machine) but it can be expected that some cache or workload specific corner cases exist. We can either conclude that this is not a problem due to the low fraction, or we can make remote frees bypass percpu sheaves and go directly to their slabs. This will make the remote frees more expensive, but if it's only a small fraction, most frees will still benefit from the lower overhead of percpu sheaves. This patch thus makes remote object freeing bypass percpu sheaves, including bulk freeing, and kfree_rcu() via the rcu_free sheaf. However it's not intended to be 100% guarantee that percpu sheaves will only contain local objects. The refill from slabs does not provide that guarantee in the first place, and there might be cpu migrations happening when we need to unlock the local_lock. Avoiding all that could be possible but complicated so we can leave it for later investigation whether it would be worth it. It can be expected that the more selective freeing will itself prevent accumulation of remote objects in percpu sheaves so any such violations would have only short-term effects. Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 7 +++++-- mm/slub.c | 41 +++++++++++++++++++++++++++++++++++------ 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 005a4319c06a..b6601e0fe598 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1623,8 +1623,11 @@ static bool kfree_rcu_sheaf(void *obj) slab = folio_slab(folio); s = slab->slab_cache; - if (s->cpu_sheaves) - return __kfree_rcu_sheaf(s, obj); + if (s->cpu_sheaves) { + if (likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); + } return false; } diff --git a/mm/slub.c b/mm/slub.c index 596f375870af..6bcddc513963 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -472,6 +472,7 @@ struct slab_sheaf { }; struct kmem_cache *cache; unsigned int size; + int node; /* only used for rcu_sheaf */ void *objects[]; }; @@ -5822,7 +5823,7 @@ static void rcu_free_sheaf(struct rcu_head *head) */ __rcu_free_sheaf_prepare(s, sheaf); - barn = get_node(s, numa_mem_id())->barn; + barn = get_node(s, sheaf->node)->barn; /* due to slab_free_hook() */ if (unlikely(sheaf->size == 0)) @@ -5912,10 +5913,12 @@ do_free: */ rcu_sheaf->objects[rcu_sheaf->size++] = obj; - if (likely(rcu_sheaf->size < s->sheaf_capacity)) + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { rcu_sheaf = NULL; - else + } else { pcs->rcu_free = NULL; + rcu_sheaf->node = numa_mem_id(); + } /* * we flush before local_unlock to make sure a racing @@ -5946,7 +5949,11 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) bool init = slab_want_init_on_free(s); unsigned int batch, i = 0; struct node_barn *barn; + void *remote_objects[PCS_BATCH_MAX]; + unsigned int remote_nr = 0; + int node = numa_mem_id(); +next_remote_batch: while (i < size) { struct slab *slab = virt_to_slab(p[i]); @@ -5956,7 +5963,15 @@ static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) if (unlikely(!slab_free_hook(s, p[i], init, false))) { p[i] = p[--size]; if (!size) - return; + goto flush_remote; + continue; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { + remote_objects[remote_nr] = p[i]; + p[i] = p[--size]; + if (++remote_nr >= PCS_BATCH_MAX) + goto flush_remote; continue; } @@ -6026,6 +6041,15 @@ no_empty: */ fallback: __kmem_cache_free_bulk(s, size, p); + +flush_remote: + if (remote_nr) { + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + if (i < size) { + remote_nr = 0; + goto next_remote_batch; + } + } } #ifndef CONFIG_SLUB_TINY @@ -6117,8 +6141,13 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (!s->cpu_sheaves || !free_to_pcs(s, object)) - do_slab_free(s, slab, object, object, 1, addr); + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, object))) + return; + } + + do_slab_free(s, slab, object, object, 1, addr); } #ifdef CONFIG_MEMCG From d09a61a3aa7de53fe055743993d79eaeb945cbad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 14:59:52 +0200 Subject: [PATCH 22/40] tools/testing/vma: Implement vm_refcnt reset Add the reset of the ref count in vma_lock_init(). This is needed if the vma memory is not zeroed on allocation. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- tools/testing/vma/vma_internal.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index f8cf5b184d5b..6b6e2b05918c 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1373,6 +1373,8 @@ static inline void ksm_exit(struct mm_struct *mm) static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); } static inline void vma_numab_state_init(struct vm_area_struct *vma) From 4ec1a08d20315d2d2a6f942ea563e5dadd988e98 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:50 +0200 Subject: [PATCH 23/40] slab: allow NUMA restricted allocations to use percpu sheaves Currently allocations asking for a specific node explicitly or via mempolicy in strict_numa node bypass percpu sheaves. Since sheaves contain mostly local objects, we can try allocating from them if the local node happens to be the requested node or allowed by the mempolicy. If we find the object from percpu sheaves is not from the expected node, we skip the sheaves - this should be rare. Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/slub.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6bcddc513963..114ec18d223b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4882,18 +4882,43 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, } static __fastpath_inline -void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp) +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) { struct slub_percpu_sheaves *pcs; + bool node_requested; void *object; #ifdef CONFIG_NUMA - if (static_branch_unlikely(&strict_numa)) { - if (current->mempolicy) - return NULL; + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If the local node + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + + node = mempolicy_slab_node(); + } } #endif + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; + + /* + * We assume the percpu sheaves contain only local objects although it's + * not completely guaranteed, so we verify later. + */ + if (unlikely(node_requested && node != numa_mem_id())) + return NULL; + if (!local_trylock(&s->cpu_sheaves->lock)) return NULL; @@ -4905,7 +4930,21 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp) return NULL; } - object = pcs->main->objects[--pcs->main->size]; + object = pcs->main->objects[pcs->main->size - 1]; + + if (unlikely(node_requested)) { + /* + * Verify that the object was from the node we want. This could + * be false because of cpu migration during an unlocked part of + * the current allocation or previous freeing process. + */ + if (folio_nid(virt_to_folio(object)) != node) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + } + + pcs->main->size--; local_unlock(&s->cpu_sheaves->lock); @@ -5005,8 +5044,8 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves && node == NUMA_NO_NODE) - object = alloc_from_pcs(s, gfpflags); + if (s->cpu_sheaves) + object = alloc_from_pcs(s, gfpflags, node); if (!object) object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); From c4fb7f0a79771dfd18838bfc5015650a9730e9c0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 14:59:53 +0200 Subject: [PATCH 24/40] tools/testing: Add support for changes to slab for sheaves The slab changes for sheaves requires more effort in the testing code. Unite all the kmem_cache work into the tools/include slab header for both the vma and maple tree testing. The vma test code also requires importing more #defines to allow for seamless use of the shared kmem_cache code. This adds the pthread header to the slab header in the tools directory to allow for the pthread_mutex in linux.c. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- tools/include/linux/slab.h | 137 +++++++++++++++++++++++++++++- tools/testing/shared/linux.c | 26 ++---- tools/testing/shared/maple-shim.c | 1 + tools/testing/vma/vma_internal.h | 92 +------------------- 4 files changed, 142 insertions(+), 114 deletions(-) diff --git a/tools/include/linux/slab.h b/tools/include/linux/slab.h index c87051e2b26f..c5c5cc6db566 100644 --- a/tools/include/linux/slab.h +++ b/tools/include/linux/slab.h @@ -4,11 +4,31 @@ #include #include +#include -#define SLAB_PANIC 2 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ #define kzalloc_node(size, flags, node) kmalloc(size, flags) +enum _slab_flag_bits { + _SLAB_KMALLOC, + _SLAB_HWCACHE_ALIGN, + _SLAB_PANIC, + _SLAB_TYPESAFE_BY_RCU, + _SLAB_ACCOUNT, + _SLAB_FLAGS_LAST_BIT +}; + +#define __SLAB_FLAG_BIT(nr) ((unsigned int __force)(1U << (nr))) +#define __SLAB_FLAG_UNUSED ((unsigned int __force)(0U)) + +#define SLAB_HWCACHE_ALIGN __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN) +#define SLAB_PANIC __SLAB_FLAG_BIT(_SLAB_PANIC) +#define SLAB_TYPESAFE_BY_RCU __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU) +#ifdef CONFIG_MEMCG +# define SLAB_ACCOUNT __SLAB_FLAG_BIT(_SLAB_ACCOUNT) +#else +# define SLAB_ACCOUNT __SLAB_FLAG_UNUSED +#endif void *kmalloc(size_t size, gfp_t gfp); void kfree(void *p); @@ -23,6 +43,86 @@ enum slab_state { FULL }; +struct kmem_cache { + pthread_mutex_t lock; + unsigned int size; + unsigned int align; + unsigned int sheaf_capacity; + int nr_objs; + void *objs; + void (*ctor)(void *); + bool non_kernel_enabled; + unsigned int non_kernel; + unsigned long nr_allocated; + unsigned long nr_tallocated; + bool exec_callback; + void (*callback)(void *); + void *private; +}; + +struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ + unsigned int align; + /** + * @sheaf_capacity: The maximum size of the sheaf. + */ + unsigned int sheaf_capacity; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset, when @usersize is non-%0 + */ + unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ + unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, %use_freeptr_offset must be set %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ + unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ + bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ + void (*ctor)(void *); +}; + static inline void *kzalloc(size_t size, gfp_t gfp) { return kmalloc(size, gfp | __GFP_ZERO); @@ -37,9 +137,38 @@ static inline void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) } void kmem_cache_free(struct kmem_cache *cachep, void *objp); -struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, - unsigned int align, unsigned int flags, - void (*ctor)(void *)); + +struct kmem_cache * +__kmem_cache_create_args(const char *name, unsigned int size, + struct kmem_cache_args *args, unsigned int flags); + +/* If NULL is passed for @args, use this variant with default arguments. */ +static inline struct kmem_cache * +__kmem_cache_default_args(const char *name, unsigned int size, + struct kmem_cache_args *args, unsigned int flags) +{ + struct kmem_cache_args kmem_default_args = {}; + + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); +} + +static inline struct kmem_cache * +__kmem_cache_create(const char *name, unsigned int size, unsigned int align, + unsigned int flags, void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} + +#define kmem_cache_create(__name, __object_size, __args, ...) \ + _Generic((__args), \ + struct kmem_cache_args *: __kmem_cache_create_args, \ + void *: __kmem_cache_default_args, \ + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list); int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 0f97fb0d19e1..97b8412ccbb6 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -16,21 +16,6 @@ int nr_allocated; int preempt_count; int test_verbose; -struct kmem_cache { - pthread_mutex_t lock; - unsigned int size; - unsigned int align; - int nr_objs; - void *objs; - void (*ctor)(void *); - unsigned int non_kernel; - unsigned long nr_allocated; - unsigned long nr_tallocated; - bool exec_callback; - void (*callback)(void *); - void *private; -}; - void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) { cachep->callback = callback; @@ -234,23 +219,26 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, } struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - unsigned int flags, void (*ctor)(void *)) +__kmem_cache_create_args(const char *name, unsigned int size, + struct kmem_cache_args *args, + unsigned int flags) { struct kmem_cache *ret = malloc(sizeof(*ret)); pthread_mutex_init(&ret->lock, NULL); ret->size = size; - ret->align = align; + ret->align = args->align; + ret->sheaf_capacity = args->sheaf_capacity; ret->nr_objs = 0; ret->nr_allocated = 0; ret->nr_tallocated = 0; ret->objs = NULL; - ret->ctor = ctor; + ret->ctor = args->ctor; ret->non_kernel = 0; ret->exec_callback = false; ret->callback = NULL; ret->private = NULL; + return ret; } diff --git a/tools/testing/shared/maple-shim.c b/tools/testing/shared/maple-shim.c index 640df76f483e..9d7b74341566 100644 --- a/tools/testing/shared/maple-shim.c +++ b/tools/testing/shared/maple-shim.c @@ -3,5 +3,6 @@ /* Very simple shim around the maple tree. */ #include "maple-shared.h" +#include #include "../../../lib/maple_tree.c" diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 6b6e2b05918c..d5b87fa6a133 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -26,6 +26,7 @@ #include #include #include +#include extern unsigned long stack_guard_gap; #ifdef CONFIG_MMU @@ -509,65 +510,6 @@ struct pagetable_move_control { .len_in = len_, \ } -struct kmem_cache_args { - /** - * @align: The required alignment for the objects. - * - * %0 means no specific alignment is requested. - */ - unsigned int align; - /** - * @useroffset: Usercopy region offset. - * - * %0 is a valid offset, when @usersize is non-%0 - */ - unsigned int useroffset; - /** - * @usersize: Usercopy region size. - * - * %0 means no usercopy region is specified. - */ - unsigned int usersize; - /** - * @freeptr_offset: Custom offset for the free pointer - * in &SLAB_TYPESAFE_BY_RCU caches - * - * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer - * outside of the object. This might cause the object to grow in size. - * Cache creators that have a reason to avoid this can specify a custom - * free pointer offset in their struct where the free pointer will be - * placed. - * - * Note that placing the free pointer inside the object requires the - * caller to ensure that no fields are invalidated that are required to - * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for - * details). - * - * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset - * is specified, %use_freeptr_offset must be set %true. - * - * Note that @ctor currently isn't supported with custom free pointers - * as a @ctor requires an external free pointer. - */ - unsigned int freeptr_offset; - /** - * @use_freeptr_offset: Whether a @freeptr_offset is used. - */ - bool use_freeptr_offset; - /** - * @ctor: A constructor for the objects. - * - * The constructor is invoked for each object in a newly allocated slab - * page. It is the cache user's responsibility to free object in the - * same state as after calling the constructor, or deal appropriately - * with any differences between a freshly constructed and a reallocated - * object. - * - * %NULL means no constructor. - */ - void (*ctor)(void *); -}; - static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); @@ -652,38 +594,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_lock_seq = UINT_MAX; } -struct kmem_cache { - const char *name; - size_t object_size; - struct kmem_cache_args *args; -}; - -static inline struct kmem_cache *__kmem_cache_create(const char *name, - size_t object_size, - struct kmem_cache_args *args) -{ - struct kmem_cache *ret = malloc(sizeof(struct kmem_cache)); - - ret->name = name; - ret->object_size = object_size; - ret->args = args; - - return ret; -} - -#define kmem_cache_create(__name, __object_size, __args, ...) \ - __kmem_cache_create((__name), (__object_size), (__args)) - -static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) -{ - return calloc(1, s->object_size); -} - -static inline void kmem_cache_free(struct kmem_cache *s, void *x) -{ - free(x); -} - /* * These are defined in vma.h, but sadly vm_stat_account() is referenced by * kernel/fork.c, so we have to these broadly available there, and temporarily From 3accabda4da1b00ba21dc18573a0a43e032e35f4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:54 +0200 Subject: [PATCH 25/40] mm, vma: use percpu sheaves for vm_area_struct cache Create the vm_area_struct cache with percpu sheaves of size 32 to improve its performance. Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/vma_init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vma_init.c b/mm/vma_init.c index 8e53c7943561..52c6b55fac45 100644 --- a/mm/vma_init.c +++ b/mm/vma_init.c @@ -16,6 +16,7 @@ void __init vma_state_init(void) struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + .sheaf_capacity = 32, }; vm_area_cachep = kmem_cache_create("vm_area_struct", From 59faa4da7cd4565cbce25358495556b75bb37022 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:55 +0200 Subject: [PATCH 26/40] maple_tree: use percpu sheaves for maple_node_cache Setup the maple_node_cache with percpu sheaves of size 32 to hopefully improve its performance. Note this will not immediately take advantage of sheaf batching of kfree_rcu() operations due to the maple tree using call_rcu with custom callbacks. The followup changes to maple tree will change that and also make use of the prefilled sheaves functionality. Reviewed-by: Sidhartha Kumar Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4f0e30b57b0c..d034f170ac89 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6040,9 +6040,14 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) void __init maple_tree_init(void) { + struct kmem_cache_args args = { + .align = sizeof(struct maple_node), + .sheaf_capacity = 32, + }; + maple_node_cache = kmem_cache_create("maple_node", - sizeof(struct maple_node), sizeof(struct maple_node), - SLAB_PANIC, NULL); + sizeof(struct maple_node), &args, + SLAB_PANIC); } /** From 9f910f7d3d84927c388ede3c8053760a5947973b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Sep 2025 14:59:56 +0200 Subject: [PATCH 27/40] tools/testing: include maple-shim.c in maple.c There's some duplicated code and we are about to add more functionality in maple-shared.h that we will need in the userspace maple test to be available, so include it via maple-shim.c Co-developed-by: Liam R. Howlett Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- tools/testing/radix-tree/maple.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index c0543060dae2..4a35e1e7c64b 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -8,14 +8,6 @@ * difficult to handle in kernel tests. */ -#define CONFIG_DEBUG_MAPLE_TREE -#define CONFIG_MAPLE_SEARCH -#define MAPLE_32BIT (MAPLE_NODE_SLOTS > 31) -#include "test.h" -#include -#include -#include - #define module_init(x) #define module_exit(x) #define MODULE_AUTHOR(x) @@ -23,7 +15,9 @@ #define MODULE_LICENSE(x) #define dump_stack() assert(0) -#include "../../../lib/maple_tree.c" +#include "test.h" + +#include "../shared/maple-shim.c" #include "../../../lib/test_maple_tree.c" #define RCU_RANGE_COUNT 1000 From 551a6e757a95c23cd808533ef96e083d2a858ba0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 14:59:57 +0200 Subject: [PATCH 28/40] testing/radix-tree/maple: Hack around kfree_rcu not existing liburcu doesn't have kfree_rcu (or anything similar). Despite that, we can hack around it in a trivial fashion, by adding a wrapper. The wrapper only works for maple_nodes because we cannot get the kmem_cache pointer any other way in the test code. Link: https://lore.kernel.org/all/20250812162124.59417-1-pfalcato@suse.de/ Suggested-by: Pedro Falcato Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- tools/testing/shared/maple-shared.h | 11 +++++++++++ tools/testing/shared/maple-shim.c | 6 ++++++ 2 files changed, 17 insertions(+) diff --git a/tools/testing/shared/maple-shared.h b/tools/testing/shared/maple-shared.h index dc4d30f3860b..2a1e9a8594a2 100644 --- a/tools/testing/shared/maple-shared.h +++ b/tools/testing/shared/maple-shared.h @@ -10,4 +10,15 @@ #include #include "linux/init.h" +void maple_rcu_cb(struct rcu_head *head); +#define rcu_cb maple_rcu_cb + +#define kfree_rcu(_struct, _memb) \ +do { \ + typeof(_struct) _p_struct = (_struct); \ + \ + call_rcu(&((_p_struct)->_memb), rcu_cb); \ +} while(0); + + #endif /* __MAPLE_SHARED_H__ */ diff --git a/tools/testing/shared/maple-shim.c b/tools/testing/shared/maple-shim.c index 9d7b74341566..16252ee616c0 100644 --- a/tools/testing/shared/maple-shim.c +++ b/tools/testing/shared/maple-shim.c @@ -6,3 +6,9 @@ #include #include "../../../lib/maple_tree.c" + +void maple_rcu_cb(struct rcu_head *head) { + struct maple_node *node = container_of(head, struct maple_node, rcu); + + kmem_cache_free(maple_node_cache, node); +} From 9b60811cb3b42fe9a8e3f0aa3e007cd9cc814a78 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 3 Sep 2025 14:59:58 +0200 Subject: [PATCH 29/40] maple_tree: Use kfree_rcu in ma_free_rcu kfree_rcu is an optimized version of call_rcu + kfree. It used to not be possible to call it on non-kmalloc objects, but this restriction was lifted ever since SLOB was dropped from the kernel, and since commit 6c6c47b063b5 ("mm, slab: call kvfree_rcu_barrier() from kmem_cache_destroy()"). Thus, replace call_rcu + mt_free_rcu with kfree_rcu. Signed-off-by: Pedro Falcato Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d034f170ac89..c706e2e48f88 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -187,13 +187,6 @@ static inline void mt_free_bulk(size_t size, void __rcu **nodes) kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } -static void mt_free_rcu(struct rcu_head *head) -{ - struct maple_node *node = container_of(head, struct maple_node, rcu); - - kmem_cache_free(maple_node_cache, node); -} - /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free @@ -204,7 +197,7 @@ static void mt_free_rcu(struct rcu_head *head) static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); - call_rcu(&node->rcu, mt_free_rcu); + kfree_rcu(node, rcu); } static void mt_set_height(struct maple_tree *mt, unsigned char height) @@ -5099,7 +5092,7 @@ static void mt_free_walk(struct rcu_head *head) mt_free_bulk(node->slot_len, slots); free_leaf: - mt_free_rcu(&node->rcu); + mt_free_one(node); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, @@ -5183,7 +5176,7 @@ next: free_leaf: if (free) - mt_free_rcu(&node->rcu); + mt_free_one(node); else mt_clear_meta(mt, node, node->type); } From 025f93101bb4e0a0cdc0fc4ee264f7905271ad75 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Wed, 3 Sep 2025 14:59:59 +0200 Subject: [PATCH 30/40] maple_tree: Replace mt_free_one() with kfree() kfree() is a little shorter and works with kmem_cache_alloc'd pointers too. Also lets us remove one more helper. Signed-off-by: Pedro Falcato Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c706e2e48f88..0439aaacf6cb 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -177,11 +177,6 @@ static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } -static inline void mt_free_one(struct maple_node *node) -{ - kmem_cache_free(maple_node_cache, node); -} - static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -5092,7 +5087,7 @@ static void mt_free_walk(struct rcu_head *head) mt_free_bulk(node->slot_len, slots); free_leaf: - mt_free_one(node); + kfree(node); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, @@ -5176,7 +5171,7 @@ next: free_leaf: if (free) - mt_free_one(node); + kfree(node); else mt_clear_meta(mt, node, node->type); } @@ -5385,7 +5380,7 @@ void mas_destroy(struct ma_state *mas) mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } - mt_free_one(ma_mnode_ptr(node)); + kfree(ma_mnode_ptr(node)); total--; } @@ -6373,7 +6368,7 @@ static void mas_dup_free(struct ma_state *mas) } node = mte_to_node(mas->node); - mt_free_one(node); + kfree(node); } /* From fdbebab19f147af6b1459c821bc11162911245fa Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:00 +0200 Subject: [PATCH 31/40] tools/testing: Add support for prefilled slab sheafs Add the prefilled sheaf structs to the slab header and the associated functions to the testing/shared/linux.c file. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- tools/include/linux/slab.h | 28 ++++++++++++ tools/testing/shared/linux.c | 89 ++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/tools/include/linux/slab.h b/tools/include/linux/slab.h index c5c5cc6db566..94937a699402 100644 --- a/tools/include/linux/slab.h +++ b/tools/include/linux/slab.h @@ -123,6 +123,18 @@ struct kmem_cache_args { void (*ctor)(void *); }; +struct slab_sheaf { + union { + struct list_head barn_list; + /* only used for prefilled sheafs */ + unsigned int capacity; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + static inline void *kzalloc(size_t size, gfp_t gfp) { return kmalloc(size, gfp | __GFP_ZERO); @@ -173,5 +185,21 @@ __kmem_cache_create(const char *name, unsigned int size, unsigned int align, void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list); int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, void **list); +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size); + +void * +kmem_cache_alloc_from_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf); +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size); + +static inline unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} #endif /* _TOOLS_SLAB_H */ diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 97b8412ccbb6..4ceff7969b78 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -137,6 +137,12 @@ void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) if (kmalloc_verbose) pr_debug("Bulk free %p[0-%zu]\n", list, size - 1); + if (cachep->exec_callback) { + if (cachep->callback) + cachep->callback(cachep->private); + cachep->exec_callback = false; + } + pthread_mutex_lock(&cachep->lock); for (int i = 0; i < size; i++) kmem_cache_free_locked(cachep, list[i]); @@ -242,6 +248,89 @@ __kmem_cache_create_args(const char *name, unsigned int size, return ret; } +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slab_sheaf *sheaf; + unsigned int capacity; + + if (s->exec_callback) { + if (s->callback) + s->callback(s->private); + s->exec_callback = false; + } + + capacity = max(size, s->sheaf_capacity); + + sheaf = calloc(1, sizeof(*sheaf) + sizeof(void *) * capacity); + if (!sheaf) + return NULL; + + sheaf->cache = s; + sheaf->capacity = capacity; + sheaf->size = kmem_cache_alloc_bulk(s, gfp, size, sheaf->objects); + if (!sheaf->size) { + free(sheaf); + return NULL; + } + + return sheaf; +} + +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf = *sheafp; + int refill; + + if (sheaf->size >= size) + return 0; + + if (size > sheaf->capacity) { + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; + } + + refill = kmem_cache_alloc_bulk(s, gfp, size - sheaf->size, + &sheaf->objects[sheaf->size]); + if (!refill) + return -ENOMEM; + + sheaf->size += refill; + return 0; +} + +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + if (sheaf->size) + kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + free(sheaf); +} + +void * +kmem_cache_alloc_from_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *obj; + + if (sheaf->size == 0) { + printf("Nothing left in sheaf!\n"); + return NULL; + } + + obj = sheaf->objects[--sheaf->size]; + sheaf->objects[sheaf->size] = NULL; + + return obj; +} + /* * Test the test infrastructure for kem_cache_alloc/free and bulk counterparts. */ From 9b05890a25d9197e39fcf5b2298f0b911c323306 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:01 +0200 Subject: [PATCH 32/40] maple_tree: Prefilled sheaf conversion and testing Use prefilled sheaves instead of bulk allocations. This should speed up the allocations and the return path of unused allocations. Remove the push and pop of nodes from the maple state as this is now handled by the slab layer with sheaves. Testing has been removed as necessary since the features of the tree have been reduced. Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 6 +- lib/maple_tree.c | 326 +++++----------------- tools/testing/radix-tree/maple.c | 461 ++----------------------------- tools/testing/shared/linux.c | 5 +- 4 files changed, 88 insertions(+), 710 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..0e31b191e3be 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -442,7 +442,8 @@ struct ma_state { struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ - struct maple_alloc *alloc; /* Allocated nodes for this operation */ + struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; @@ -490,7 +491,8 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .alloc = NULL, \ + .node_request = 0, \ + .sheaf = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0439aaacf6cb..b41245f2cc65 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -182,6 +182,22 @@ static inline void mt_free_bulk(size_t size, void __rcu **nodes) kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } +static void mt_return_sheaf(struct slab_sheaf *sheaf) +{ + kmem_cache_return_sheaf(maple_node_cache, GFP_NOWAIT, sheaf); +} + +static struct slab_sheaf *mt_get_sheaf(gfp_t gfp, int count) +{ + return kmem_cache_prefill_sheaf(maple_node_cache, gfp, count); +} + +static int mt_refill_sheaf(gfp_t gfp, struct slab_sheaf **sheaf, + unsigned int size) +{ + return kmem_cache_refill_sheaf(maple_node_cache, gfp, sheaf, size); +} + /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free @@ -574,67 +590,6 @@ static __always_inline bool mte_dead_node(const struct maple_enode *enode) return ma_dead_node(node); } -/* - * mas_allocated() - Get the number of nodes allocated in a maple state. - * @mas: The maple state - * - * The ma_state alloc member is overloaded to hold a pointer to the first - * allocated node or to the number of requested nodes to allocate. If bit 0 is - * set, then the alloc contains the number of requested nodes. If there is an - * allocated node, then the total allocated nodes is in that node. - * - * Return: The total number of nodes allocated - */ -static inline unsigned long mas_allocated(const struct ma_state *mas) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) - return 0; - - return mas->alloc->total; -} - -/* - * mas_set_alloc_req() - Set the requested number of allocations. - * @mas: the maple state - * @count: the number of allocations. - * - * The requested number of allocations is either in the first allocated node, - * located in @mas->alloc->request_count, or directly in @mas->alloc if there is - * no allocated node. Set the request either in the node or do the necessary - * encoding to store in @mas->alloc directly. - */ -static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) -{ - if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { - if (!count) - mas->alloc = NULL; - else - mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); - return; - } - - mas->alloc->request_count = count; -} - -/* - * mas_alloc_req() - get the requested number of allocations. - * @mas: The maple state - * - * The alloc count is either stored directly in @mas, or in - * @mas->alloc->request_count if there is at least one node allocated. Decode - * the request count if it's stored directly in @mas->alloc. - * - * Return: The allocation request count. - */ -static inline unsigned int mas_alloc_req(const struct ma_state *mas) -{ - if ((unsigned long)mas->alloc & 0x1) - return (unsigned long)(mas->alloc) >> 1; - else if (mas->alloc) - return mas->alloc->request_count; - return 0; -} - /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node @@ -1120,77 +1075,15 @@ static int mas_ascend(struct ma_state *mas) */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { - struct maple_alloc *ret, *node = mas->alloc; - unsigned long total = mas_allocated(mas); - unsigned int req = mas_alloc_req(mas); + struct maple_node *ret; - /* nothing or a request pending. */ - if (WARN_ON(!total)) + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; - if (total == 1) { - /* single allocation in this ma_state */ - mas->alloc = NULL; - ret = node; - goto single_node; - } - - if (node->node_count == 1) { - /* Single allocation in this node. */ - mas->alloc = node->slot[0]; - mas->alloc->total = node->total - 1; - ret = node; - goto new_head; - } - node->total--; - ret = node->slot[--node->node_count]; - node->slot[node->node_count] = NULL; - -single_node: -new_head: - if (req) { - req++; - mas_set_alloc_req(mas, req); - } - + ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); memset(ret, 0, sizeof(*ret)); - return (struct maple_node *)ret; -} -/* - * mas_push_node() - Push a node back on the maple state allocation. - * @mas: The maple state - * @used: The used maple node - * - * Stores the maple node back into @mas->alloc for reuse. Updates allocated and - * requested node count as necessary. - */ -static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) -{ - struct maple_alloc *reuse = (struct maple_alloc *)used; - struct maple_alloc *head = mas->alloc; - unsigned long count; - unsigned int requested = mas_alloc_req(mas); - - count = mas_allocated(mas); - - reuse->request_count = 0; - reuse->node_count = 0; - if (count) { - if (head->node_count < MAPLE_ALLOC_SLOTS) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - reuse->slot[0] = head; - reuse->node_count = 1; - } - - reuse->total = count + 1; - mas->alloc = reuse; -done: - if (requested > 1) - mas_set_alloc_req(mas, requested - 1); + return ret; } /* @@ -1200,75 +1093,32 @@ done: */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - struct maple_alloc *node; - unsigned long allocated = mas_allocated(mas); - unsigned int requested = mas_alloc_req(mas); - unsigned int count; - void **slots = NULL; - unsigned int max_req = 0; + if (unlikely(mas->sheaf)) { + unsigned long refill = mas->node_request; - if (!requested) - return; - - mas_set_alloc_req(mas, 0); - if (mas->mas_flags & MA_STATE_PREALLOC) { - if (allocated) + if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { + mas->node_request = 0; return; - WARN_ON(!allocated); - } - - if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { - node = (struct maple_alloc *)mt_alloc_one(gfp); - if (!node) - goto nomem_one; - - if (allocated) { - node->slot[0] = mas->alloc; - node->node_count = 1; - } else { - node->node_count = 0; } - mas->alloc = node; - node->total = ++allocated; - node->request_count = 0; - requested--; + if (mt_refill_sheaf(gfp, &mas->sheaf, refill)) + goto error; + + mas->node_request = 0; + return; } - node = mas->alloc; - while (requested) { - max_req = MAPLE_ALLOC_SLOTS - node->node_count; - slots = (void **)&node->slot[node->node_count]; - max_req = min(requested, max_req); - count = mt_alloc_bulk(gfp, max_req, slots); - if (!count) - goto nomem_bulk; - - if (node->node_count == 0) { - node->slot[0]->node_count = 0; - node->slot[0]->request_count = 0; - } - - node->node_count += count; - allocated += count; - /* find a non-full node*/ - do { - node = node->slot[0]; - } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); - requested -= count; + mas->sheaf = mt_get_sheaf(gfp, mas->node_request); + if (likely(mas->sheaf)) { + mas->node_request = 0; + return; } - mas->alloc->total = allocated; - return; -nomem_bulk: - /* Clean up potential freed allocations on bulk failure */ - memset(slots, 0, max_req * sizeof(unsigned long)); - mas->alloc->total = allocated; -nomem_one: - mas_set_alloc_req(mas, requested); +error: mas_set_err(mas, -ENOMEM); } + /* * mas_free() - Free an encoded maple node * @mas: The maple state @@ -1279,42 +1129,7 @@ nomem_one: */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { - struct maple_node *tmp = mte_to_node(used); - - if (mt_in_rcu(mas->tree)) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); -} - -/* - * mas_node_count_gfp() - Check if enough nodes are allocated and request more - * if there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * @gfp: the gfp flags - */ -static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) -{ - unsigned long allocated = mas_allocated(mas); - - if (allocated < count) { - mas_set_alloc_req(mas, count - allocated); - mas_alloc_nodes(mas, gfp); - } -} - -/* - * mas_node_count() - Check if enough nodes are allocated and request more if - * there is not enough nodes. - * @mas: The maple state - * @count: The number of nodes needed - * - * Note: Uses GFP_NOWAIT for gfp flags. - */ -static void mas_node_count(struct ma_state *mas, int count) -{ - return mas_node_count_gfp(mas, count, GFP_NOWAIT); + ma_free_rcu(mte_to_node(used)); } /* @@ -2451,10 +2266,7 @@ static inline void mas_topiary_node(struct ma_state *mas, enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); - if (in_rcu) - ma_free_rcu(tmp); - else - mas_push_node(mas, tmp); + ma_free_rcu(tmp); } /* @@ -3980,7 +3792,7 @@ set_content: * * Return: Number of nodes required for preallocation. */ -static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) +static inline void mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; unsigned char height = mas_mt_height(mas); @@ -4026,7 +3838,7 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry) WARN_ON_ONCE(1); } - return ret; + mas->node_request = ret; } /* @@ -4087,15 +3899,15 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { - int request; + struct ma_state *mas = wr_mas->mas; mas_wr_prealloc_setup(wr_mas); - wr_mas->mas->store_type = mas_wr_store_type(wr_mas); - request = mas_prealloc_calc(wr_mas, entry); - if (!request) + mas->store_type = mas_wr_store_type(wr_mas); + mas_prealloc_calc(wr_mas, entry); + if (!mas->node_request) return; - mas_node_count(wr_mas->mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); } /** @@ -5208,7 +5020,6 @@ static inline void mte_destroy_walk(struct maple_enode *enode, */ void *mas_store(struct ma_state *mas, void *entry) { - int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); @@ -5238,11 +5049,11 @@ void *mas_store(struct ma_state *mas, void *entry) return wr_mas.content; } - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto store; - mas_node_count(mas, request); + mas_alloc_nodes(mas, GFP_NOWAIT); if (mas_is_err(mas)) return NULL; @@ -5330,20 +5141,19 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - int ret = 0; - int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); - request = mas_prealloc_calc(&wr_mas, entry); - if (!request) + mas_prealloc_calc(&wr_mas, entry); + if (!mas->node_request) goto set_flag; mas->mas_flags &= ~MA_STATE_PREALLOC; - mas_node_count_gfp(mas, request, gfp); + mas_alloc_nodes(mas, gfp); if (mas_is_err(mas)) { - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); + int ret = xa_err(mas->node); + + mas->node_request = 0; mas_destroy(mas); mas_reset(mas); return ret; @@ -5351,7 +5161,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) set_flag: mas->mas_flags |= MA_STATE_PREALLOC; - return ret; + return 0; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5365,26 +5175,13 @@ EXPORT_SYMBOL_GPL(mas_preallocate); */ void mas_destroy(struct ma_state *mas) { - struct maple_alloc *node; - unsigned long total; - mas->mas_flags &= ~MA_STATE_PREALLOC; - total = mas_allocated(mas); - while (total) { - node = mas->alloc; - mas->alloc = node->slot[0]; - if (node->node_count > 1) { - size_t count = node->node_count - 1; + mas->node_request = 0; + if (mas->sheaf) + mt_return_sheaf(mas->sheaf); - mt_free_bulk(count, (void __rcu **)&node->slot[1]); - total -= count; - } - kfree(ma_mnode_ptr(node)); - total--; - } - - mas->alloc = NULL; + mas->sheaf = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -6019,7 +5816,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas_allocated(mas)) + if (!mas->sheaf) return false; mas->status = ma_start; @@ -7414,8 +7211,9 @@ void mas_dump(const struct ma_state *mas) pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); - pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", - mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); + pr_err(" min=%lx max=%lx sheaf=" PTR_FMT ", request %lu depth=%u, flags=%x\n", + mas->min, mas->max, mas->sheaf, mas->node_request, mas->depth, + mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 4a35e1e7c64b..72a8fe8e832a 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -57,430 +57,6 @@ struct rcu_reader_struct { struct rcu_test_struct2 *test; }; -static int get_alloc_node_count(struct ma_state *mas) -{ - int count = 1; - struct maple_alloc *node = mas->alloc; - - if (!node || ((unsigned long)node & 0x1)) - return 0; - while (node->node_count) { - count += node->node_count; - node = node->slot[0]; - } - return count; -} - -static void check_mas_alloc_node_count(struct ma_state *mas) -{ - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 1, GFP_KERNEL); - mas_node_count_gfp(mas, MAPLE_ALLOC_SLOTS + 3, GFP_KERNEL); - MT_BUG_ON(mas->tree, get_alloc_node_count(mas) != mas->alloc->total); - mas_destroy(mas); -} - -/* - * check_new_node() - Check the creation of new nodes and error path - * verification. - */ -static noinline void __init check_new_node(struct maple_tree *mt) -{ - - struct maple_node *mn, *mn2, *mn3; - struct maple_alloc *smn; - struct maple_node *nodes[100]; - int i, j, total; - - MA_STATE(mas, mt, 0, 0); - - check_mas_alloc_node_count(&mas); - - /* Try allocating 3 nodes */ - mtree_lock(mt); - mt_set_non_kernel(0); - /* request 3 nodes to be allocated. */ - mas_node_count(&mas, 3); - /* Allocation request of 3. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 3); - /* Allocate failed. */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - mas_push_node(&mas, mn); - mas_reset(&mas); - mas_destroy(&mas); - mtree_unlock(mt); - - - /* Try allocating 1 node, then 2 more */ - mtree_lock(mt); - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - /* Check Allocation request of 1. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - /* Eat the requested node. */ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas.status = ma_start; - mas_destroy(&mas); - /* Allocate 3 nodes, will fail. */ - mas_node_count(&mas, 3); - /* Drop the lock and allocate 3 nodes. */ - mas_nomem(&mas, GFP_KERNEL); - /* Ensure 3 are allocated. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Allocation request of 0. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 0); - - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - /* Ensure we counted 3. */ - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - /* Free. */ - mas_reset(&mas); - mas_destroy(&mas); - - /* Set allocation request to 1. */ - mas_set_alloc_req(&mas, 1); - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - mas_set_err(&mas, -ENOMEM); - /* Validate allocation request. */ - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - /* Check the node is only one node. */ - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, mn->slot[0] != NULL); - MT_BUG_ON(mt, mn->slot[1] != NULL); - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != 1); - MT_BUG_ON(mt, mas.alloc->node_count); - - mas_set_alloc_req(&mas, 2); /* request 2 more. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 2); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != 3); - MT_BUG_ON(mt, mas.alloc == NULL); - MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); - MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); - for (i = 2; i >= 0; i--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 64; - mas_set_alloc_req(&mas, total); /* request 2 more. */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != total); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (i = total; i > 0; i--) { - unsigned int e = 0; /* expected node_count */ - - if (!MAPLE_32BIT) { - if (i >= 35) - e = i - 34; - else if (i >= 5) - e = i - 4; - else if (i >= 2) - e = i - 1; - } else { - if (i >= 4) - e = i - 3; - else if (i >= 1) - e = i - 1; - else - e = 0; - } - - MT_BUG_ON(mt, mas.alloc->node_count != e); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); - MT_BUG_ON(mt, !mn); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - - total = 100; - for (i = 1; i < total; i++) { - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = i; j > 0; j--) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - MT_BUG_ON(mt, !mn); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - mas_set_alloc_req(&mas, i); - mas_set_err(&mas, -ENOMEM); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - nodes[j] = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - - while (j) { - j--; - mas_push_node(&mas, nodes[j]); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != i); - for (j = 0; j <= i/2; j++) { - MT_BUG_ON(mt, mas_allocated(&mas) != i - j); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); - } - mas_reset(&mas); - MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); - mas_destroy(&mas); - - } - - /* Set allocation request. */ - total = 500; - mas_node_count(&mas, total); - /* Drop the lock and allocate the nodes. */ - mas_nomem(&mas, GFP_KERNEL); - MT_BUG_ON(mt, !mas.alloc); - i = 1; - smn = mas.alloc; - while (i < total) { - for (j = 0; j < MAPLE_ALLOC_SLOTS; j++) { - i++; - MT_BUG_ON(mt, !smn->slot[j]); - if (i == total) - break; - } - smn = smn->slot[0]; /* next. */ - } - MT_BUG_ON(mt, mas_allocated(&mas) != total); - mas_reset(&mas); - mas_destroy(&mas); /* Free. */ - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - for (i = 1; i < 128; i++) { - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = i; j > 0; j--) { /*Free the requests */ - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - } - - for (i = 1; i < MAPLE_NODE_MASK + 1; i++) { - MA_STATE(mas2, mt, 0, 0); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ - for (j = 1; j <= i; j++) { /* Move the allocations to mas2 */ - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mas_push_node(&mas2, mn); - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_allocated(&mas2) != i); - - for (j = i; j > 0; j--) { /*Free the requests */ - MT_BUG_ON(mt, mas_allocated(&mas2) != j); - mn = mas_pop_node(&mas2); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas2) != 0); - } - - - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - mn = mas_pop_node(&mas); /* get the next node. */ - MT_BUG_ON(mt, mn == NULL); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); - - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - - /* Check the limit of pop/push/pop */ - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ - MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); - MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); - MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); - MT_BUG_ON(mt, mas_alloc_req(&mas)); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas.alloc->node_count != 1); - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, not_empty(mn)); - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - } - MT_BUG_ON(mt, mas_allocated(&mas) != 0); - - - for (i = 3; i < MAPLE_NODE_MASK * 3; i++) { - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put it back */ - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn2 = mas_pop_node(&mas); /* get the next node. */ - mn3 = mas_pop_node(&mas); /* get the next node. */ - mas_push_node(&mas, mn); /* put them back */ - mas_push_node(&mas, mn2); - mas_push_node(&mas, mn3); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, i); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mn = mas_pop_node(&mas); /* get the next node. */ - mn->parent = ma_parent_ptr(mn); - ma_free_rcu(mn); - mas_destroy(&mas); - } - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 5); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != 5); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS - 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2); - mas_destroy(&mas); - - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1); - mas.node = MA_ERROR(-ENOMEM); - mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */ - mas_nomem(&mas, GFP_KERNEL); /* Fill request */ - mas.status = ma_start; - MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2); - mas_destroy(&mas); - - mtree_unlock(mt); -} - /* * Check erasing including RCU. */ @@ -35507,6 +35083,13 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) return vacant_height; } +static int mas_allocated(struct ma_state *mas) +{ + if (mas->sheaf) + return kmem_cache_sheaf_size(mas->sheaf); + + return 0; +} /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) { @@ -35525,7 +35108,10 @@ static noinline void __init check_prealloc(struct maple_tree *mt) /* Spanning store */ mas_set_range(&mas, 470, 500); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + + mas_wr_preallocate(&wr_mas, ptr); + MT_BUG_ON(mt, mas.store_type != wr_spanning_store); + MT_BUG_ON(mt, mas_is_err(&mas)); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); vacant_height = get_vacant_height(&wr_mas, ptr); @@ -35535,6 +35121,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); + mas_wr_preallocate(&wr_mas, ptr); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35575,20 +35162,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - allocated = mas_allocated(&mas); - height = mas_mt_height(&mas); - vacant_height = get_vacant_height(&wr_mas, ptr); - MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3); - mn = mas_pop_node(&mas); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - mas_push_node(&mas, mn); - MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); - mas_destroy(&mas); - allocated = mas_allocated(&mas); - MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -36389,11 +35962,17 @@ static void check_nomem_writer_race(struct maple_tree *mt) check_load(mt, 6, xa_mk_value(0xC)); mtree_unlock(mt); + mt_set_non_kernel(0); /* test for the same race but with mas_store_gfp() */ mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); mas_set_range(&mas, 0, 5); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + mtree_lock(mt); mas_store_gfp(&mas, NULL, GFP_KERNEL); @@ -36508,10 +36087,6 @@ void farmer_tests(void) check_erase_testset(&tree); mtree_destroy(&tree); - mt_init_flags(&tree, 0); - check_new_node(&tree); - mtree_destroy(&tree); - if (!MAPLE_32BIT) { mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_rcu_simulated(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4ceff7969b78..8c7257155958 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -64,7 +64,8 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, if (!(gfp & __GFP_DIRECT_RECLAIM)) { if (!cachep->non_kernel) { - cachep->exec_callback = true; + if (cachep->callback) + cachep->exec_callback = true; return NULL; } @@ -210,6 +211,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, for (i = 0; i < size; i++) __kmem_cache_free_locked(cachep, p[i]); pthread_mutex_unlock(&cachep->lock); + if (cachep->callback) + cachep->exec_callback = true; return 0; } From 6bf377b06c08049d0f4042493df302285e45165e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:02 +0200 Subject: [PATCH 33/40] maple_tree: Add single node allocation support to maple state The fast path through a write will require replacing a single node in the tree. Using a sheaf (32 nodes) is too heavy for the fast path, so special case the node store operation by just allocating one node in the maple state. Signed-off-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- include/linux/maple_tree.h | 4 ++- lib/maple_tree.c | 47 ++++++++++++++++++++++++++++---- tools/testing/radix-tree/maple.c | 11 ++++++-- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0e31b191e3be..51a64ff23b88 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -443,6 +443,7 @@ struct ma_state { unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct slab_sheaf *sheaf; /* Allocated nodes for this operation */ + struct maple_node *alloc; /* A single allocated node for fast path writes */ unsigned long node_request; /* The number of nodes to allocate for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ @@ -491,8 +492,9 @@ struct ma_wr_state { .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ - .node_request = 0, \ .sheaf = NULL, \ + .alloc = NULL, \ + .node_request = 0, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b41245f2cc65..2b7299409900 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1073,16 +1073,23 @@ static int mas_ascend(struct ma_state *mas) * * Return: A pointer to a maple node. */ -static inline struct maple_node *mas_pop_node(struct ma_state *mas) +static __always_inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_node *ret; + if (mas->alloc) { + ret = mas->alloc; + mas->alloc = NULL; + goto out; + } + if (WARN_ON_ONCE(!mas->sheaf)) return NULL; ret = kmem_cache_alloc_from_sheaf(maple_node_cache, GFP_NOWAIT, mas->sheaf); - memset(ret, 0, sizeof(*ret)); +out: + memset(ret, 0, sizeof(*ret)); return ret; } @@ -1093,9 +1100,34 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { - if (unlikely(mas->sheaf)) { - unsigned long refill = mas->node_request; + if (!mas->node_request) + return; + if (mas->node_request == 1) { + if (mas->sheaf) + goto use_sheaf; + + if (mas->alloc) + return; + + mas->alloc = mt_alloc_one(gfp); + if (!mas->alloc) + goto error; + + mas->node_request = 0; + return; + } + +use_sheaf: + if (unlikely(mas->alloc)) { + kfree(mas->alloc); + mas->alloc = NULL; + } + + if (mas->sheaf) { + unsigned long refill; + + refill = mas->node_request; if (kmem_cache_sheaf_size(mas->sheaf) >= refill) { mas->node_request = 0; return; @@ -5180,8 +5212,11 @@ void mas_destroy(struct ma_state *mas) mas->node_request = 0; if (mas->sheaf) mt_return_sheaf(mas->sheaf); - mas->sheaf = NULL; + + if (mas->alloc) + kfree(mas->alloc); + mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); @@ -5816,7 +5851,7 @@ bool mas_nomem(struct ma_state *mas, gfp_t gfp) mas_alloc_nodes(mas, gfp); } - if (!mas->sheaf) + if (!mas->sheaf && !mas->alloc) return false; mas->status = ma_start; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 72a8fe8e832a..83260f2efb19 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35085,10 +35085,15 @@ static unsigned char get_vacant_height(struct ma_wr_state *wr_mas, void *entry) static int mas_allocated(struct ma_state *mas) { - if (mas->sheaf) - return kmem_cache_sheaf_size(mas->sheaf); + int total = 0; - return 0; + if (mas->alloc) + total++; + + if (mas->sheaf) + total += kmem_cache_sheaf_size(mas->sheaf); + + return total; } /* Preallocation testing */ static noinline void __init check_prealloc(struct maple_tree *mt) From 719a42e563bb087758500e43e67a57b27f303c4c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 3 Sep 2025 15:00:03 +0200 Subject: [PATCH 34/40] maple_tree: Convert forking to use the sheaf interface Use the generic interface which should result in less bulk allocations during a forking. A part of this is to abstract the freeing of the sheaf or maple state allocations into its own function so mas_destroy() and the tree duplication code can use the same functionality to return any unused resources. [andriy.shevchenko@linux.intel.com: remove unused mt_alloc_bulk()] Signed-off-by: Liam R. Howlett Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- lib/maple_tree.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 2b7299409900..ab4c6c21a625 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -172,11 +172,6 @@ static inline struct maple_node *mt_alloc_one(gfp_t gfp) return kmem_cache_alloc(maple_node_cache, gfp); } -static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) -{ - return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); -} - static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); @@ -1150,6 +1145,19 @@ error: mas_set_err(mas, -ENOMEM); } +static inline void mas_empty_nodes(struct ma_state *mas) +{ + mas->node_request = 0; + if (mas->sheaf) { + mt_return_sheaf(mas->sheaf); + mas->sheaf = NULL; + } + + if (mas->alloc) { + kfree(mas->alloc); + mas->alloc = NULL; + } +} /* * mas_free() - Free an encoded maple node @@ -5208,15 +5216,7 @@ EXPORT_SYMBOL_GPL(mas_preallocate); void mas_destroy(struct ma_state *mas) { mas->mas_flags &= ~MA_STATE_PREALLOC; - - mas->node_request = 0; - if (mas->sheaf) - mt_return_sheaf(mas->sheaf); - mas->sheaf = NULL; - - if (mas->alloc) - kfree(mas->alloc); - mas->alloc = NULL; + mas_empty_nodes(mas); } EXPORT_SYMBOL_GPL(mas_destroy); @@ -6241,7 +6241,7 @@ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; - unsigned char request, count, i; + unsigned char count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; @@ -6249,20 +6249,17 @@ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); - request = mas_data_end(mas) + 1; - count = mt_alloc_bulk(gfp, request, (void **)new_slots); - if (unlikely(count < request)) { - memset(new_slots, 0, request * sizeof(void *)); - mas_set_err(mas, -ENOMEM); + count = mas->node_request = mas_data_end(mas) + 1; + mas_alloc_nodes(mas, gfp); + if (unlikely(mas_is_err(mas))) return; - } - /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; - ((unsigned long *)new_slots)[i] |= val; + new_slots[i] = ma_mnode_ptr((unsigned long)mas_pop_node(mas) | + val); } } @@ -6316,7 +6313,7 @@ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) - return; + goto empty_mas; } else { /* * This is the last leaf node and duplication is @@ -6349,6 +6346,8 @@ set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); +empty_mas: + mas_empty_nodes(mas); } /** From 4957089a23f41f31f8e7e22802a8ef9f5789c191 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:02 -0700 Subject: [PATCH 35/40] locking/local_lock: Introduce local_lock_is_locked(). Introduce local_lock_is_locked() that returns true when given local_lock is locked by current cpu (in !PREEMPT_RT) or by current task (in PREEMPT_RT). The goal is to detect a deadlock by the caller. Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- include/linux/local_lock.h | 2 ++ include/linux/local_lock_internal.h | 7 +++++++ include/linux/rtmutex.h | 10 ++++++++++ kernel/locking/rtmutex_common.h | 9 --------- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 2ba846419524..0d91d060e3e9 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -66,6 +66,8 @@ */ #define local_trylock(lock) __local_trylock(this_cpu_ptr(lock)) +#define local_lock_is_locked(lock) __local_lock_is_locked(lock) + /** * local_trylock_irqsave - Try to acquire a per CPU local lock, save and disable * interrupts if acquired diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 949de37700db..a4dc479157b5 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -165,6 +165,9 @@ do { \ !!tl; \ }) +/* preemption or migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(lock) READ_ONCE(this_cpu_ptr(lock)->acquired) + #define __local_lock_release(lock) \ do { \ local_trylock_t *tl; \ @@ -285,4 +288,8 @@ do { \ __local_trylock(lock); \ }) +/* migration must be disabled before calling __local_lock_is_locked */ +#define __local_lock_is_locked(__lock) \ + (rt_mutex_owner(&this_cpu_ptr(__lock)->lock) == current) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index fa9f1021541e..ede4c6bf6f22 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -44,6 +44,16 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) return READ_ONCE(lock->owner) != NULL; } +#ifdef CONFIG_RT_MUTEXES +#define RT_MUTEX_HAS_WAITERS 1UL + +static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) +{ + unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); +} +#endif extern void rt_mutex_base_init(struct rt_mutex_base *rtb); /** diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 78dd3d8c6554..cf6ddd1b23a2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -153,15 +153,6 @@ static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) pi_tree.entry); } -#define RT_MUTEX_HAS_WAITERS 1UL - -static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) -{ - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - - return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); -} - /* * Constants for rt mutex functions which have a selectable deadlock * detection. From 99253de51f80acccc528a9c94e2f4d5f329071f1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:03 -0700 Subject: [PATCH 36/40] mm: Allow GFP_ACCOUNT to be used in alloc_pages_nolock(). Change alloc_pages_nolock() to default to __GFP_COMP when allocating pages, since upcoming reentrant alloc_slab_page() needs __GFP_COMP. Also allow __GFP_ACCOUNT flag to be specified, since most of BPF infra needs __GFP_ACCOUNT except BPF streams. Reviewed-by: Vlastimil Babka Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/gfp.h | 2 +- kernel/bpf/stream.c | 2 +- kernel/bpf/syscall.c | 2 +- mm/page_alloc.c | 10 ++++++---- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5ebf26fcdcfa..0ceb4e09306c 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -354,7 +354,7 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order); +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_pages_nolock(...) alloc_hooks(alloc_pages_nolock_noprof(__VA_ARGS__)) extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index ab592db4a4bf..eb6c5a21c2ef 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -83,7 +83,7 @@ static struct bpf_stream_page *bpf_stream_page_replace(void) struct bpf_stream_page *stream_page, *old_stream_page; struct page *page; - page = alloc_pages_nolock(NUMA_NO_NODE, 0); + page = alloc_pages_nolock(/* Don't account */ 0, NUMA_NO_NODE, 0); if (!page) return NULL; stream_page = page_address(page); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fbfa8532c39..dbf86f8014de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -581,7 +581,7 @@ static bool can_alloc_pages(void) static struct page *__bpf_alloc_page(int nid) { if (!can_alloc_pages()) - return alloc_pages_nolock(nid, 0); + return alloc_pages_nolock(__GFP_ACCOUNT, nid, 0); return alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..30ccff0283fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7480,6 +7480,7 @@ static bool __free_unaccepted(struct page *page) /** * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. * @nid: node to allocate from * @order: allocation order size * @@ -7493,7 +7494,7 @@ static bool __free_unaccepted(struct page *page) * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. * It means ENOMEM. There is no reason to call it again and expect !NULL. */ -struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7515,12 +7516,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) * specify it here to highlight that alloc_pages_nolock() * doesn't want to deplete reserves. */ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC - | __GFP_ACCOUNT; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP + | gfp_flags; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; + VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT); /* * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is * unsafe in NMI. If spin_trylock() is called from hard IRQ the current @@ -7558,7 +7560,7 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order) if (page) set_page_refcounted(page); - if (memcg_kmem_online() && page && + if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { free_pages_nolock(page, order); page = NULL; From d7242af8643409aae32243450341ef25b28d8a8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:04 -0700 Subject: [PATCH 37/40] mm: Introduce alloc_frozen_pages_nolock() Split alloc_pages_nolock() and introduce alloc_frozen_pages_nolock() to be used by alloc_slab_page(). Reviewed-by: Vlastimil Babka Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/internal.h | 4 ++++ mm/page_alloc.c | 49 ++++++++++++++++++++++++++++--------------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..9904421cabc1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -842,6 +842,10 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord #define alloc_frozen_pages(...) \ alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__)) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); +#define alloc_frozen_pages_nolock(...) \ + alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) + extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30ccff0283fd..5a40e2b7d148 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7478,23 +7478,7 @@ static bool __free_unaccepted(struct page *page) #endif /* CONFIG_UNACCEPTED_MEMORY */ -/** - * alloc_pages_nolock - opportunistic reentrant allocation from any context - * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. - * @nid: node to allocate from - * @order: allocation order size - * - * Allocates pages of a given order from the given node. This is safe to - * call from any context (from atomic, NMI, and also reentrant - * allocator -> tracepoint -> alloc_pages_nolock_noprof). - * Allocation is best effort and to be expected to fail easily so nobody should - * rely on the success. Failures are not reported via warn_alloc(). - * See always fail conditions below. - * - * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. - * It means ENOMEM. There is no reason to call it again and expect !NULL. - */ -struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) +struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) { /* * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. @@ -7557,15 +7541,38 @@ struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int or /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ - if (page) - set_page_refcounted(page); - if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) && unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { - free_pages_nolock(page, order); + __free_frozen_pages(page, order, FPI_TRYLOCK); page = NULL; } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; } +/** + * alloc_pages_nolock - opportunistic reentrant allocation from any context + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed. + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> alloc_pages_nolock_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN. + * It means ENOMEM. There is no reason to call it again and expect !NULL. + */ +struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order) +{ + struct page *page; + + page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order); + if (page) + set_page_refcounted(page); + return page; +} +EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof); From 83382af9ddc3cb0ef43f67d049b461720ad785e6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:05 -0700 Subject: [PATCH 38/40] slab: Make slub local_(try)lock more precise for LOCKDEP kmalloc_nolock() can be called from any context the ___slab_alloc() can acquire local_trylock_t (which is rt_spin_lock in PREEMPT_RT) and attempt to acquire a different local_trylock_t while in the same task context. The calling sequence might look like: kmalloc() -> tracepoint -> bpf -> kmalloc_nolock() or more precisely: __lock_acquire+0x12ad/0x2590 lock_acquire+0x133/0x2d0 rt_spin_lock+0x6f/0x250 ___slab_alloc+0xb7/0xec0 kmalloc_nolock_noprof+0x15a/0x430 my_debug_callback+0x20e/0x390 [testmod] ___slab_alloc+0x256/0xec0 __kmalloc_cache_noprof+0xd6/0x3b0 Make LOCKDEP understand that local_trylock_t-s protect different kmem_caches. In order to do that add lock_class_key for each kmem_cache and use that key in local_trylock_t. This stack trace is possible on both PREEMPT_RT and !PREEMPT_RT, but teach lockdep about it only for PREEMPT_RT, since in !PREEMPT_RT the ___slab_alloc() code is using local_trylock_irqsave() when lockdep is on. Note, this patch applies this logic to local_lock_t while the next one converts it to local_trylock_t. Both are mapped to rt_spin_lock in PREEMPT_RT. Signed-off-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- mm/slab.h | 1 + mm/slub.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/mm/slab.h b/mm/slab.h index e82e51c44bd0..43245d9207b6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -234,6 +234,7 @@ struct kmem_cache_order_objects { struct kmem_cache { #ifndef CONFIG_SLUB_TINY struct kmem_cache_cpu __percpu *cpu_slab; + struct lock_class_key lock_key; #endif struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ diff --git a/mm/slub.c b/mm/slub.c index 114ec18d223b..ee575ed9250f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3586,12 +3586,29 @@ static inline void note_cmpxchg_failure(const char *n, static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; struct kmem_cache_cpu *c; + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); local_lock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -7210,6 +7227,9 @@ void __kmem_cache_release(struct kmem_cache *s) if (s->cpu_sheaves) pcs_destroy(s); #ifndef CONFIG_SLUB_TINY +#ifdef CONFIG_PREEMPT_RT + lockdep_unregister_key(&s->lock_key); +#endif free_percpu(s->cpu_slab); #endif free_kmem_cache_nodes(s); From 7612833192d56af86061de8ab51989b75daf5b0d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:06 -0700 Subject: [PATCH 39/40] slab: Reuse first bit for OBJEXTS_ALLOC_FAIL Since the combination of valid upper bits in slab->obj_exts with OBJEXTS_ALLOC_FAIL bit can never happen, use OBJEXTS_ALLOC_FAIL == (1ull << 0) as a magic sentinel instead of (1ull << 2) to free up bit 2. Signed-off-by: Alexei Starovoitov Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/memcontrol.h | 10 ++++++++-- mm/slub.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..d254c0b96d0d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -341,17 +341,23 @@ enum page_memcg_data_flags { __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; +#define __OBJEXTS_ALLOC_FAIL MEMCG_DATA_OBJEXTS #define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS #else /* CONFIG_MEMCG */ +#define __OBJEXTS_ALLOC_FAIL (1UL << 0) #define __FIRST_OBJEXT_FLAG (1UL << 0) #endif /* CONFIG_MEMCG */ enum objext_flags { - /* slabobj_ext vector failed to allocate */ - OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* + * Use bit 0 with zero other bits to signal that slabobj_ext vector + * failed to allocate. The same bit 0 with valid upper bits means + * MEMCG_DATA_OBJEXTS. + */ + OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/mm/slub.c b/mm/slub.c index ee575ed9250f..189cd5aa4ac4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2046,7 +2046,7 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts, * objects with no tag reference. Mark all references in this * vector as empty to avoid warnings later on. */ - if (obj_exts & OBJEXTS_ALLOC_FAIL) { + if (obj_exts == OBJEXTS_ALLOC_FAIL) { unsigned int i; for (i = 0; i < objects; i++) From af92793e52c3a99b828ed4bdd277fd3e11c18d08 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Sep 2025 18:00:07 -0700 Subject: [PATCH 40/40] slab: Introduce kmalloc_nolock() and kfree_nolock(). kmalloc_nolock() relies on ability of local_trylock_t to detect the situation when per-cpu kmem_cache is locked. In !PREEMPT_RT local_(try)lock_irqsave(&s->cpu_slab->lock, flags) disables IRQs and marks s->cpu_slab->lock as acquired. local_lock_is_locked(&s->cpu_slab->lock) returns true when slab is in the middle of manipulating per-cpu cache of that specific kmem_cache. kmalloc_nolock() can be called from any context and can re-enter into ___slab_alloc(): kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> NMI -> bpf -> kmalloc_nolock() -> ___slab_alloc(cache_B) or kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() -> ___slab_alloc(cache_B) Hence the caller of ___slab_alloc() checks if &s->cpu_slab->lock can be acquired without a deadlock before invoking the function. If that specific per-cpu kmem_cache is busy the kmalloc_nolock() retries in a different kmalloc bucket. The second attempt will likely succeed, since this cpu locked different kmem_cache. Similarly, in PREEMPT_RT local_lock_is_locked() returns true when per-cpu rt_spin_lock is locked by current _task_. In this case re-entrance into the same kmalloc bucket is unsafe, and kmalloc_nolock() tries a different bucket that is most likely is not locked by the current task. Though it may be locked by a different task it's safe to rt_spin_lock() and sleep on it. Similar to alloc_pages_nolock() the kmalloc_nolock() returns NULL immediately if called from hard irq or NMI in PREEMPT_RT. kfree_nolock() defers freeing to irq_work when local_lock_is_locked() and (in_nmi() or in PREEMPT_RT). SLUB_TINY config doesn't use local_lock_is_locked() and relies on spin_trylock_irqsave(&n->list_lock) to allocate, while kfree_nolock() always defers to irq_work. Note, kfree_nolock() must be called _only_ for objects allocated with kmalloc_nolock(). Debug checks (like kmemleak and kfence) were skipped on allocation, hence obj = kmalloc(); kfree_nolock(obj); will miss kmemleak/kfence book keeping and will cause false positives. large_kmalloc is not supported by either kmalloc_nolock() or kfree_nolock(). Signed-off-by: Alexei Starovoitov Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 13 +- include/linux/memcontrol.h | 2 + include/linux/slab.h | 4 + mm/Kconfig | 1 + mm/kasan/common.c | 5 +- mm/slab.h | 6 + mm/slab_common.c | 3 + mm/slub.c | 504 +++++++++++++++++++++++++++++++++---- 8 files changed, 483 insertions(+), 55 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 890011071f2b..acdc8cb0152e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, } bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, - bool still_accessible); + bool still_accessible, bool no_quarantine); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. @@ -226,11 +226,13 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init, - bool still_accessible) + void *object, bool init, + bool still_accessible, + bool no_quarantine) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init, still_accessible); + return __kasan_slab_free(s, object, init, still_accessible, + no_quarantine); return false; } @@ -427,7 +429,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) } static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - bool init, bool still_accessible) + bool init, bool still_accessible, + bool no_quarantine) { return false; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d254c0b96d0d..82563236f35c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -358,6 +358,8 @@ enum objext_flags { * MEMCG_DATA_OBJEXTS. */ OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL, + /* slabobj_ext vector allocated with kmalloc_nolock() */ + OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG, /* the next bit after the last actual flag */ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), }; diff --git a/include/linux/slab.h b/include/linux/slab.h index 680193356ac7..561597dd2164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -501,6 +501,7 @@ void * __must_check krealloc_noprof(const void *objp, size_t new_size, #define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) void kfree(const void *objp); +void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -957,6 +958,9 @@ static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t f } #define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node); +#define kmalloc_nolock(...) alloc_hooks(kmalloc_nolock_noprof(__VA_ARGS__)) + #define kmem_buckets_alloc(_b, _size, _flags) \ alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..202e044f2b4d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -194,6 +194,7 @@ menu "Slab allocator options" config SLUB def_bool y + select IRQ_WORK config KVFREE_RCU_BATCHED def_bool y diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..3264900b942f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -252,7 +252,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, } bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, - bool still_accessible) + bool still_accessible, bool no_quarantine) { if (!kasan_arch_is_ready() || is_kfence_address(object)) return false; @@ -274,6 +274,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, poison_slab_object(cache, object, init); + if (no_quarantine) + return false; + /* * If the object is put into quarantine, do not let slab put the object * onto the freelist for now. The object's metadata is kept until the diff --git a/mm/slab.h b/mm/slab.h index 43245d9207b6..35e533e59b07 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,6 +57,10 @@ struct slab { struct { union { struct list_head slab_list; + struct { /* For deferred deactivate_slab() */ + struct llist_node llnode; + void *flush_freelist; + }; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; @@ -662,6 +666,8 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); +void defer_free_barrier(void); + static inline bool slub_debug_orig_size(struct kmem_cache *s) { return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && diff --git a/mm/slab_common.c b/mm/slab_common.c index b6601e0fe598..932d13ada36c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -510,6 +510,9 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); } + /* Wait for deferred work from kmalloc/kfree_nolock() */ + defer_free_barrier(); + cpus_read_lock(); mutex_lock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index 189cd5aa4ac4..f9f7f3942074 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -44,7 +44,8 @@ #include #include #include - +#include +#include #include #include @@ -426,7 +427,7 @@ struct kmem_cache_cpu { #ifdef CONFIG_SLUB_CPU_PARTIAL struct slab *partial; /* Partially allocated slabs */ #endif - local_lock_t lock; /* Protects the fields above */ + local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif @@ -2079,6 +2080,7 @@ static inline void init_slab_obj_exts(struct slab *slab) int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { + bool allow_spin = gfpflags_allow_spinning(gfp); unsigned int objects = objs_per_slab(s, slab); unsigned long new_exts; unsigned long old_exts; @@ -2087,8 +2089,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } if (!vec) { /* Mark vectors which failed to allocate */ if (new_slab) @@ -2098,6 +2114,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, } new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; #ifdef CONFIG_MEMCG new_exts |= MEMCG_DATA_OBJEXTS; #endif @@ -2118,7 +2136,10 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, * objcg vector should be reused. */ mark_objexts_empty(vec); - kfree(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); return 0; } @@ -2142,7 +2163,10 @@ static inline void free_slab_obj_exts(struct slab *slab) * the extension for obj_exts is expected to be NULL. */ mark_objexts_empty(obj_exts); - kfree(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); slab->obj_exts = 0; } @@ -2476,7 +2500,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init, } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init, still_accessible); + return !kasan_slab_free(s, x, init, still_accessible, false); } static __fastpath_inline @@ -2981,13 +3005,17 @@ static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) * Slab allocation and freeing */ static inline struct slab *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) + struct kmem_cache_order_objects oo, + bool allow_spin) { struct folio *folio; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) + if (unlikely(!allow_spin)) + folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) folio = (struct folio *)alloc_frozen_pages(flags, order); else folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); @@ -3137,6 +3165,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order, static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -3156,7 +3185,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -3164,7 +3197,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - slab = alloc_slab_page(alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) return NULL; stat(s, ORDER_FALLBACK); @@ -3333,33 +3366,47 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist * and put the slab to the partial (or full) list. */ -static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab, int orig_size) +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); unsigned long flags; void *object; + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + slab->frozen = 1; + defer_deactivate_slab(slab, NULL); + return NULL; + } object = slab->freelist; slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object, orig_size)) + if (!alloc_debug_processing(s, slab, object, orig_size)) { /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory * corruption in theory could cause that. + * Leak memory of allocated slab. */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); return NULL; + } - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); if (slab->inuse == slab->objects) add_full(s, n, slab); @@ -3400,7 +3447,10 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { if (!pfmemalloc_match(slab, pc->flags)) continue; @@ -3606,7 +3656,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); if (finegrain_lockdep) lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); @@ -3699,6 +3749,47 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. + * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { @@ -3783,7 +3874,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -3808,7 +3899,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); if (slab_to_put) { __put_partials(s, slab_to_put); @@ -4323,6 +4414,7 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; @@ -4348,9 +4440,21 @@ reread_slab: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. */ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; } else { stat(s, ALLOC_NODE_MISMATCH); @@ -4363,13 +4467,14 @@ reread_slab: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -4381,7 +4486,7 @@ reread_slab: if (!freelist) { c->slab = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -4400,34 +4505,34 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: #ifdef CONFIG_SLUB_CPU_PARTIAL while (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } @@ -4436,7 +4541,8 @@ new_slab: slub_set_percpu_partial(c, slab); if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags))) { + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { c->slab = slab; freelist = get_freelist(s, slab); VM_BUG_ON(!freelist); @@ -4444,7 +4550,7 @@ new_slab: goto load_freelist; } - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); slab->next = NULL; __put_partials(s, slab); @@ -4466,8 +4572,13 @@ new_objects: * allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) - pc.flags = GFP_NOWAIT | __GFP_THISNODE; + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } pc.orig_size = orig_size; slab = get_partial(s, node, &pc); @@ -4506,7 +4617,7 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size); + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); if (unlikely(!freelist)) goto new_objects; @@ -4528,7 +4639,7 @@ new_objects: inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) { + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. @@ -4539,7 +4650,7 @@ new_objects: retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -4548,9 +4659,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -4560,6 +4676,19 @@ retry_load_slab: goto load_freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -4579,8 +4708,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -4704,7 +4844,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, return NULL; } - object = alloc_single_from_new_slab(s, slab, orig_size); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); return object; } @@ -4783,8 +4923,9 @@ bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, init_flags); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); alloc_tagging_slab_alloc_hook(s, p[i], flags); } @@ -5451,6 +5592,96 @@ void *__kmalloc_noprof(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc_noprof); +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. + */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ +#ifndef CONFIG_SLUB_TINY + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) +#endif + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node, unsigned long caller) { @@ -6108,6 +6339,93 @@ flush_remote: } } +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). + */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + *(void **)x = NULL; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + +#ifdef CONFIG_SLUB_TINY + discard_slab(slab->slab_cache, slab); +#else + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); +#endif + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df = this_cpu_ptr(&defer_free_objects); + + slab->flush_freelist = flush_freelist; + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + #ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that @@ -6128,6 +6446,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; void **freelist; @@ -6146,10 +6466,29 @@ redo: barrier(); if (unlikely(slab != c->slab)) { - __slab_free(s, slab, head, tail, cnt, addr); + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } return; } + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ + } + if (USE_LOCKLESS_FAST_PATH()) { freelist = READ_ONCE(c->freelist); @@ -6160,11 +6499,13 @@ redo: goto redo; } } else { + __maybe_unused unsigned long flags = 0; + /* Update the free list under the local lock */ - local_lock(&s->cpu_slab->lock); + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; @@ -6174,7 +6515,7 @@ redo: c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); } stat_add(s, FREE_FASTPATH, cnt); } @@ -6405,6 +6746,71 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. + */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); +#ifndef CONFIG_SLUB_TINY + do_slab_free(s, slab, x, x, 0, _RET_IP_); +#else + defer_free(s, x); +#endif +} +EXPORT_SYMBOL_GPL(kfree_nolock); + static __always_inline __realloc_size(2) void * __do_krealloc(const void *p, size_t new_size, gfp_t flags) {