diff --git a/include/linux/slab.h b/include/linux/slab.h index 34db237319c1..a0081642606b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -57,9 +57,7 @@ enum _slab_flag_bits { #endif _SLAB_OBJECT_POISON, _SLAB_CMPXCHG_DOUBLE, -#ifdef CONFIG_SLAB_OBJ_EXT _SLAB_NO_OBJ_EXT, -#endif #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) _SLAB_OBJ_EXT_IN_OBJ, #endif @@ -241,11 +239,7 @@ enum _slab_flag_bits { #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ -#ifdef CONFIG_SLAB_OBJ_EXT #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) -#else -#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED -#endif #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ) diff --git a/mm/Kconfig b/mm/Kconfig index bd0ea5454af8..08593674cd20 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -247,17 +247,6 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA -config SLUB_CPU_PARTIAL - default y - depends on SMP && !SLUB_TINY - bool "Enable per cpu partial caches" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config RANDOM_KMALLOC_CACHES default n depends on !SLUB_TINY diff --git a/mm/internal.h b/mm/internal.h index e430da900430..1f44ccb4badf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -846,6 +846,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_frozen_pages_nolock(...) 
\ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) +void free_frozen_pages_nolock(struct page *page, unsigned int order); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c380f063e8b7..0127e9d661ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2981,6 +2981,11 @@ void free_frozen_pages(struct page *page, unsigned int order) __free_frozen_pages(page, order, FPI_NONE); } +void free_frozen_pages_nolock(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_TRYLOCK); +} + /* * Free a batch of folios */ diff --git a/mm/slab.h b/mm/slab.h index 3f49666e943c..71c7261bf822 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,14 +21,12 @@ # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ @@ -79,19 +77,7 @@ struct slab { struct kmem_cache *slab_cache; union { struct { - union { - struct list_head slab_list; - struct { /* For deferred deactivate_slab() */ - struct llist_node llnode; - void *flush_freelist; - }; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; + struct list_head slab_list; /* Double-word boundary */ struct freelist_counters; }; @@ -196,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the @@ -226,8 +195,6 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { - struct kmem_cache_cpu __percpu *cpu_slab; - struct lock_class_key lock_key; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. 
*/ slab_flags_t flags; @@ -236,12 +203,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; @@ -282,9 +243,25 @@ struct kmem_cache { unsigned int usersize; /* Usercopy region size */ #endif +#ifdef CONFIG_SLUB_STATS + struct kmem_cache_stats __percpu *cpu_stats; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; +/* + * Every cache has !NULL s->cpu_sheaves but they may point to the + * bootstrap_sheaf temporarily during init, or permanently for the boot caches + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This + * helper distinguishes whether cache has real non-bootstrap sheaves. + */ +static inline bool cache_has_sheaves(struct kmem_cache *s) +{ + /* Test CONFIG_SLUB_TINY for code elimination purposes */ + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; +} + #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); diff --git a/mm/slab_common.c b/mm/slab_common.c index 094afa2792d0..d5a70a831a2a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1604,11 +1604,8 @@ static bool kfree_rcu_sheaf(void *obj) return false; s = slab->slab_cache; - if (s->cpu_sheaves) { - if (likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) - return __kfree_rcu_sheaf(s, obj); - } + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); return false; } @@ -2112,7 +2109,7 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); */ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { flush_rcu_sheaves_on_cache(s); rcu_barrier(); } diff --git a/mm/slub.c b/mm/slub.c index 6fac2b123b42..11a99bd06ac7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SLUB: A slab allocator that limits cache line use instead of queuing - * objects in per cpu and per node lists. + * SLUB: A slab allocator with low overhead percpu array caches and mostly + * lockless freeing of objects to slabs in the slowpath. * - * The allocator synchronizes using per slab locks or atomic operations - * and only uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using spin_trylock for percpu arrays in the + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. + * Uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter * (C) 2011 Linux Foundation, Christoph Lameter + * (C) 2025 SUSE, Vlastimil Babka */ #include @@ -53,11 +55,13 @@ /* * Lock order: - * 1. slab_mutex (Global Mutex) - * 2. node->list_lock (Spinlock) - * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches) - * 5. object_map_lock (Only for debugging) + * 0. cpu_hotplug_lock + * 1. slab_mutex (Global Mutex) + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) + * 2b. node->barn->lock (Spinlock) + * 2c. node->list_lock (Spinlock) + * 3. slab_lock(slab) (Only on some arches) + * 4. object_map_lock (Only for debugging) * * slab_mutex * @@ -78,31 +82,38 @@ * C. 
slab->objects -> Number of objects in slab * D. slab->frozen -> frozen state * + * SL_partial slabs + * + * Slabs on node partial list have at least one free object. A limited number + * of slabs on the list can be fully free (slab->inuse == 0), until we start + * discarding them. These slabs are marked with SL_partial, and the flag is + * cleared while removing them, usually to grab their freelist afterwards. + * This clearing also exempts them from list management. Please see + * __slab_free() for more details. + * + * Full slabs + * + * For caches without debugging enabled, full slabs (slab->inuse == + * slab->objects and slab->freelist == NULL) are not placed on any list. + * The __slab_free() freeing the first object from such a slab will place + * it on the partial list. Caches with debugging enabled place such slab + * on the full list and use different allocation and freeing paths. + * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is - * the cpu slab which is actively allocated from by the processor that - * froze it and it is not on any list. The processor that froze the - * slab is the one who can perform list operations on the slab. Other - * processors may put objects onto the freelist but the processor that - * froze the slab is the only one that can retrieve the objects from the - * slab's freelist. - * - * CPU partial slabs - * - * The partially empty slabs cached on the CPU partial list are used - * for performance reasons, which speeds up the allocation process. - * These slabs are not frozen, but are also exempt from list management, - * by clearing the SL_partial flag when moving out of the node - * partial list. Please see __slab_free() for more details. + * If a slab is frozen then it is exempt from list management. It is used to + * indicate a slab that has failed consistency checks and thus cannot be + * allocated from anymore - it is also marked as full. Any previously + * allocated objects will be simply leaked upon freeing instead of attempting + * to modify the potentially corrupted freelist and metadata. * * To sum up, the current scheme is: - * - node partial slab: SL_partial && !frozen - * - cpu partial slab: !SL_partial && !frozen - * - cpu slab: !SL_partial && frozen - * - full slab: !SL_partial && !frozen + * - node partial slab: SL_partial && !full && !frozen + * - taken off partial list: !SL_partial && !full && !frozen + * - full slab, not on any list: !SL_partial && full && !frozen + * - frozen due to inconsistency: !SL_partial && full && frozen * - * list_lock + * node->list_lock (spinlock) * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -112,47 +123,46 @@ * * The list_lock is a centralized lock and thus we avoid taking it as * much as possible. As long as SLUB does not have to handle partial - * slabs, operations can continue without any centralized lock. F.e. - * allocating a long series of objects that fill up slabs does not require - * the list lock. + * slabs, operations can continue without any centralized lock. * * For debug caches, all allocations are forced to go through a list_lock * protected region to serialize against concurrent validation. * - * cpu_slab->lock local lock + * cpu_sheaves->lock (local_trylock) * - * This locks protect slowpath manipulation of all kmem_cache_cpu fields - * except the stat counters. 
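/*
 * Illustrative sketch, not part of the patch: the state combinations from
 * the "To sum up" list above written as predicates. slab_test_node_partial()
 * is assumed here as the counterpart of the slab_clear_node_partial() helper
 * appearing further below; the unlocked reads are fine for illustration only.
 */
static inline bool slab_is_full(struct slab *slab)
{
	/* full: every object allocated and nothing on the freelist */
	return !slab->freelist && slab->inuse == slab->objects;
}

static inline bool slab_on_node_partial(struct slab *slab)
{
	/* node partial slab: SL_partial && !full && !frozen */
	return slab_test_node_partial(slab) && !slab_is_full(slab) &&
	       !slab->frozen;
}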
This is a percpu structure manipulated only by - * the local cpu, so the lock protects against being preempted or interrupted - * by an irq. Fast path operations rely on lockless operations instead. + * This lock protects fastpath operations on the percpu sheaves. On !RT it + * only disables preemption and does no atomic operations. As long as the main + * or spare sheaf can handle the allocation or free, there is no other + * overhead. * - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption - * which means the lockless fastpath cannot be used as it might interfere with - * an in-progress slow path operations. In this case the local lock is always - * taken but it still utilizes the freelist for the common operations. + * node->barn->lock (spinlock) * - * lockless fastpaths + * This lock protects the operations on per-NUMA-node barn. It can quickly + * serve an empty or full sheaf if available, and avoid more expensive refill + * or flush operation. * - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) - * are fully lockless when satisfied from the percpu slab (and when - * cmpxchg_double is possible to use, otherwise slab_lock is taken). - * They also don't disable preemption or migration or irqs. They rely on - * the transaction id (tid) field to detect being preempted or moved to - * another cpu. + * Lockless freeing + * + * Objects may have to be freed to their slabs when they are from a remote + * node (where we want to avoid filling local sheaves with remote objects) + * or when there are too many full sheaves. On architectures supporting + * cmpxchg_double this is done by a lockless update of slab's freelist and + * counters, otherwise slab_lock is taken. This only needs to take the + * list_lock if it's a first free to a full slab, or when a slab becomes empty + * after the free. * * irq, preemption, migration considerations * - * Interrupts are disabled as part of list_lock or local_lock operations, or + * Interrupts are disabled as part of list_lock or barn lock operations, or * around the slab_lock operation, in order to make the slab allocator safe * to use in the context of an irq. + * Preemption is disabled as part of local_trylock operations. + * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see + * their limitations. * - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer - * doesn't have to be revalidated in each section protected by the local lock. - * - * SLUB assigns one slab for allocation to each processor. - * Allocations only occur from these slabs called cpu slabs. + * SLUB assigns two object arrays called sheaves for caching allocations and + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. + * Allocations and frees are primarily served from these sheaves. * * Slabs with free elements are kept on a partial list and during regular * operations no list for full slabs is used. If an object in a full slab is @@ -160,25 +170,8 @@ * We track full slabs for debugging purposes though because otherwise we * cannot scan all objects. * - * Slabs are freed when they become empty. Teardown and setup is - * minimal so we rely on the page allocators per cpu caches for - * fast frees and allocs. - * - * slab->frozen The slab is frozen and exempt from list processing. 
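/*
 * Illustrative sketch, not part of the patch: the shape of the allocation
 * fastpath guarded by cpu_sheaves->lock as described above. Only pcs->main
 * and the sheaf's size/objects[] fields are visible in this patch; the lock
 * member name is an assumption, and the real fastpath additionally tries the
 * spare sheaf and the per-node barn before falling back to the slowpath.
 */
static inline void *alloc_from_sheaf_sketch(struct kmem_cache *s)
{
	struct slub_percpu_sheaves *pcs;
	unsigned long flags;
	void *object = NULL;

	/* trylock only: on failure the caller takes the slowpath instead */
	if (!local_trylock_irqsave(&s->cpu_sheaves->lock, flags))
		return NULL;

	pcs = this_cpu_ptr(s->cpu_sheaves);

	if (likely(pcs->main->size))
		object = pcs->main->objects[--pcs->main->size];

	local_unlock_irqrestore(&s->cpu_sheaves->lock, flags);
	return object;
}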
- * This means that the slab is dedicated to a purpose - * such as satisfying allocations for a specific - * processor. Objects may be freed in the slab while - * it is frozen but slab_free will then skip the usual - * list operations. It is up to the processor holding - * the slab to integrate the slab into the slab lists - * when the slab is no longer needed. - * - * One use of this flag is to mark slabs that are - * used for allocations. Then such a slab becomes a cpu - * slab. The cpu slab may be equipped with an additional - * freelist that allows lockless access to - * free objects in addition to the regular freelist - * that requires the slab lock. + * Slabs are freed when they become empty. Teardown and setup is minimal so we + * rely on the page allocators per cpu caches for fast frees and allocs. * * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of @@ -201,28 +194,6 @@ enum slab_flags { SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ }; -/* - * We could simply use migrate_disable()/enable() but as long as it's a - * function call even on !PREEMPT_RT, use inline preempt_disable() there. - */ -#ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) -#define USE_LOCKLESS_FAST_PATH() (true) -#else -#define slub_get_cpu_ptr(var) \ -({ \ - migrate_disable(); \ - this_cpu_ptr(var); \ -}) -#define slub_put_cpu_ptr(var) \ -do { \ - (void)(var); \ - migrate_enable(); \ -} while (0) -#define USE_LOCKLESS_FAST_PATH() (false) -#endif - #ifndef CONFIG_SLUB_TINY #define __fastpath_inline __always_inline #else @@ -241,11 +212,18 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); static DEFINE_STATIC_KEY_FALSE(strict_numa); #endif -/* Structure holding parameters for get_partial() call chain */ +/* Structure holding parameters for get_from_partial() call chain */ struct partial_context { gfp_t flags; unsigned int orig_size; - void *object; +}; + +/* Structure holding parameters for get_partial_node_bulk() */ +struct partial_bulk_context { + gfp_t flags; + unsigned int min_objects; + unsigned int max_objects; + struct list_head slabs; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -261,15 +239,6 @@ void *fixup_red_left(struct kmem_cache *s, void *p) return p; } -static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_CPU_PARTIAL - return !kmem_cache_debug(s); -#else - return false; -#endif -} - /* * Issues still to be resolved: * @@ -360,37 +329,25 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum add_mode { + ADD_TO_HEAD, + ADD_TO_TAIL, +}; + enum stat_item { - ALLOC_PCS, /* Allocation from percpu sheaf */ - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_PCS, /* Free to percpu sheaf */ + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ + FREE_FASTPATH, /* Free to percpu sheaves */ + FREE_SLOWPATH, /* Free to a slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab 
acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + ALLOC_SLAB, /* New slab acquired from page allocator */ + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ SHEAF_FLUSH, /* Objects flushed from a sheaf */ SHEAF_REFILL, /* Objects refilled to a sheaf */ SHEAF_ALLOC, /* Allocation of an empty sheaf */ @@ -407,31 +364,11 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -struct freelist_tid { - union { - struct { - void *freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_full_t freelist_tid; - }; -}; - -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - struct freelist_tid; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated slabs */ -#endif - local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS +struct kmem_cache_stats { unsigned int stat[NR_SLUB_STAT_ITEMS]; -#endif }; +#endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -440,7 +377,7 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * The rmw is racy on a preemptible kernel but this is acceptable, so * avoid this_cpu_add()'s irq-disable overhead. */ - raw_cpu_inc(s->cpu_slab->stat[si]); + raw_cpu_inc(s->cpu_stats->stat[si]); #endif } @@ -448,7 +385,7 @@ static inline void stat_add(const struct kmem_cache *s, enum stat_item si, int v) { #ifdef CONFIG_SLUB_STATS - raw_cpu_add(s->cpu_slab->stat[si], v); + raw_cpu_add(s->cpu_stats->stat[si], v); #endif } @@ -537,7 +474,7 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) static nodemask_t slab_nodes; /* - * Workqueue used for flush_cpu_slab(). + * Workqueue used for flushing cpu and kfree_rcu sheaves. */ static struct workqueue_struct *flushwq; @@ -596,36 +533,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -static void prefetch_freepointer(const struct kmem_cache *s, void *object) -{ - prefetchw(object + s->offset); -} - -/* - * When running under KMSAN, get_freepointer_safe() may return an uninitialized - * pointer value in the case the current thread loses the race for the next - * memory chunk in the freelist. 
In that case this_cpu_cmpxchg_double() in - * slab_alloc_node() will fail, so the uninitialized value won't be used, but - * KMSAN will still check all arguments of cmpxchg because of imperfect - * handling of inline assembly. - * To work around this problem, we apply __no_kmsan_checks to ensure that - * get_freepointer_safe() returns initialized memory. - */ -__no_kmsan_checks -static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) -{ - unsigned long freepointer_addr; - freeptr_t p; - - if (!debug_pagealloc_enabled_static()) - return get_freepointer(s, object); - - object = kasan_reset_tag(object); - freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); - return freelist_ptr_decode(s, p, freepointer_addr); -} - static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { unsigned long freeptr_addr = (unsigned long)object + s->offset; @@ -689,41 +596,6 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ - unsigned int nr_slabs; - - s->cpu_partial = nr_objects; - - /* - * We take the number of objects but actually limit the number of - * slabs on the per cpu partial list, in order to limit excessive - * growth of the list. For simplicity we assume that the slabs will - * be half-full. - */ - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); - s->cpu_partial_slabs = nr_slabs; -} - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return s->cpu_partial_slabs; -} -#else -#ifdef SLAB_SUPPORTS_SYSFS -static inline void -slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ -} -#endif - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return 0; -} -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - /* * If network-based swap is enabled, slub must keep track of whether memory * were allocated from pfmemalloc reserves. 
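/*
 * Illustrative sketch, not part of the patch: the "Lockless freeing"
 * described in the header comment above. The object is pushed onto the
 * slab's freelist by updating freelist and counters together through
 * slab_update_freelist() (cmpxchg_double where available, slab_lock()
 * otherwise). List management, i.e. taking list_lock for the first free to
 * a full slab or when the slab becomes empty, is omitted here; see
 * __slab_free() for the complete version.
 */
static void lockless_free_sketch(struct kmem_cache *s, struct slab *slab,
				 void *object)
{
	struct freelist_counters old, new;

	do {
		old.freelist = READ_ONCE(slab->freelist);
		old.counters = READ_ONCE(slab->counters);

		/* link the object in front of the current freelist */
		set_freepointer(s, object, old.freelist);

		new.freelist = object;
		new.counters = old.counters;
		new.inuse--;
	} while (!slab_update_freelist(s, slab, &old, &new,
				       "lockless_free_sketch"));
}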
@@ -779,7 +651,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old, if (slab->freelist == old->freelist && slab->counters == old->counters) { slab->freelist = new->freelist; - slab->counters = new->counters; + /* prevent tearing for the read in get_partial_node_bulk() */ + WRITE_ONCE(slab->counters, new->counters); ret = true; } slab_unlock(slab); @@ -799,7 +672,7 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla { bool ret; - if (USE_LOCKLESS_FAST_PATH()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) lockdep_assert_irqs_disabled(); if (s->flags & __CMPXCHG_DOUBLE) @@ -1178,7 +1051,7 @@ static void set_track_update(struct kmem_cache *s, void *object, p->handle = handle; #endif p->addr = addr; - p->cpu = smp_processor_id(); + p->cpu = raw_smp_processor_id(); p->pid = current->pid; p->when = jiffies; } @@ -1342,20 +1215,6 @@ static void object_err(struct kmem_cache *s, struct slab *slab, WARN_ON(1); } -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, slab, nextfree) && freelist) { - object_err(s, slab, *freelist, "Freechain corrupt"); - *freelist = NULL; - slab_fix(s, "Isolate corrupted freechain"); - return true; - } - - return false; -} - static void __slab_err(struct slab *slab) { if (slab_in_kunit_test()) @@ -2167,11 +2026,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - return false; -} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2872,7 +2726,8 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } -static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, + unsigned int capacity) { struct slab_sheaf *sheaf; size_t sheaf_size; @@ -2890,7 +2745,7 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) if (s->flags & SLAB_KMALLOC) gfp |= __GFP_NO_OBJ_EXT; - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf_size = struct_size(sheaf, objects, capacity); sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) @@ -2903,6 +2758,12 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) return sheaf; } +static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, + gfp_t gfp) +{ + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); +} + static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { kfree(sheaf); @@ -2910,9 +2771,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) stat(s, SHEAF_FREE); } -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p); - +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max); static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) @@ -2923,8 +2784,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, if (!to_fill) return 0; - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, - &sheaf->objects[sheaf->size]); + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, + to_fill); sheaf->size += filled; @@ -3125,12 +2986,23 @@ static void 
pcs_destroy(struct kmem_cache *s) { int cpu; + /* + * We may be unwinding cache creation that failed before or during the + * allocation of this. + */ + if (!s->cpu_sheaves) + return; + + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ + if (!cache_has_sheaves(s)) + goto free_pcs; + for_each_possible_cpu(cpu) { struct slub_percpu_sheaves *pcs; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); - /* can happen when unwinding failed create */ + /* This can happen when unwinding failed cache creation. */ if (!pcs->main) continue; @@ -3152,11 +3024,13 @@ static void pcs_destroy(struct kmem_cache *s) } } +free_pcs: free_percpu(s->cpu_sheaves); s->cpu_sheaves = NULL; } -static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, + bool allow_spin) { struct slab_sheaf *empty = NULL; unsigned long flags; @@ -3164,7 +3038,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) if (!data_race(barn->nr_empty)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, @@ -3241,7 +3118,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) * change. */ static struct slab_sheaf * -barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, + bool allow_spin) { struct slab_sheaf *full = NULL; unsigned long flags; @@ -3249,7 +3127,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) if (!data_race(barn->nr_full)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_full)) { full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, @@ -3270,7 +3151,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) * barn. But if there are too many full sheaves, reject this with -E2BIG. 
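/*
 * Illustrative sketch, not part of the patch: the allow_spin convention the
 * barn helpers above follow. Callers that must not spin (e.g. the reentrant
 * kmalloc_nolock()/kfree_nolock() paths) pass allow_spin == false, so the
 * helper uses a trylock and reports failure instead of waiting for the lock.
 */
static bool barn_op_sketch(struct node_barn *barn, bool allow_spin)
{
	unsigned long flags;

	if (likely(allow_spin))
		spin_lock_irqsave(&barn->lock, flags);
	else if (!spin_trylock_irqsave(&barn->lock, flags))
		return false;

	/* ... manipulate barn->sheaves_full / barn->sheaves_empty ... */

	spin_unlock_irqrestore(&barn->lock, flags);
	return true;
}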
*/ static struct slab_sheaf * -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, + bool allow_spin) { struct slab_sheaf *empty; unsigned long flags; @@ -3281,7 +3163,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) if (!data_race(barn->nr_empty)) return ERR_PTR(-ENOMEM); - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return ERR_PTR(-EBUSY); if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, @@ -3585,7 +3470,7 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } -static void __free_slab(struct kmem_cache *s, struct slab *slab) +static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) { struct page *page = slab_page(slab); int order = compound_order(page); @@ -3596,14 +3481,26 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(page, order); + if (allow_spin) + free_frozen_pages(page, order); + else + free_frozen_pages_nolock(page, order); +} + +static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) +{ + /* + * Since it was just allocated, we can skip the actions in + * discard_slab() and free_slab(). + */ + __free_slab(s, slab, false); } static void rcu_free_slab(struct rcu_head *h) { struct slab *slab = container_of(h, struct slab, rcu_head); - __free_slab(slab->slab_cache, slab); + __free_slab(slab->slab_cache, slab, true); } static void free_slab(struct kmem_cache *s, struct slab *slab) @@ -3619,7 +3516,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab) if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); else - __free_slab(s, slab); + __free_slab(s, slab, true); } static void discard_slab(struct kmem_cache *s, struct slab *slab) @@ -3647,10 +3544,10 @@ static inline void slab_clear_node_partial(struct slab *slab) * Management of partially allocated slabs. */ static inline void -__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) { n->nr_partial++; - if (tail == DEACTIVATE_TO_TAIL) + if (mode == ADD_TO_TAIL) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); @@ -3658,10 +3555,10 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) } static inline void add_partial(struct kmem_cache_node *n, - struct slab *slab, int tail) + struct slab *slab, enum add_mode mode) { lockdep_assert_held(&n->list_lock); - __add_partial(n, slab, tail); + __add_partial(n, slab, mode); } static inline void remove_partial(struct kmem_cache_node *n, @@ -3712,8 +3609,6 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); - /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. 
Allocate a single object instead of whole freelist @@ -3729,8 +3624,8 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, void *object; if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab */ - defer_deactivate_slab(slab, NULL); + /* Unlucky, discard newly allocated slab. */ + free_new_slab_nolock(s, slab); return NULL; } @@ -3756,7 +3651,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, if (slab->inuse == slab->objects) add_full(s, n, slab); else - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); inc_slabs_node(s, nid, slab->objects); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3764,29 +3659,78 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, return object; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); -#else -static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, - int drain) { } -#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); -/* - * Try to allocate a partial slab from a specific node. - */ -static struct slab *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, - struct partial_context *pc) +static bool get_partial_node_bulk(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_bulk_context *pc, + bool allow_spin) { - struct slab *slab, *slab2, *partial = NULL; + struct slab *slab, *slab2; + unsigned int total_free = 0; unsigned long flags; - unsigned int partial_slabs = 0; + + /* Racy check to avoid taking the lock unnecessarily. */ + if (!n || data_race(!n->nr_partial)) + return false; + + INIT_LIST_HEAD(&pc->slabs); + + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return false; + + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + struct freelist_counters flc; + unsigned int slab_free; + + if (!pfmemalloc_match(slab, pc->flags)) + continue; + + /* + * determine the number of free objects in the slab racily + * + * slab_free is a lower bound due to possible subsequent + * concurrent freeing, so the caller may get more objects than + * requested and must handle that + */ + flc.counters = data_race(READ_ONCE(slab->counters)); + slab_free = flc.objects - flc.inuse; + + /* we have already min and this would get us over the max */ + if (total_free >= pc->min_objects + && total_free + slab_free > pc->max_objects) + break; + + remove_partial(n, slab); + + list_add(&slab->slab_list, &pc->slabs); + + total_free += slab_free; + if (total_free >= pc->max_objects) + break; + } + + spin_unlock_irqrestore(&n->list_lock, flags); + return total_free > 0; +} + +/* + * Try to allocate object from a partial slab on a specific node. + */ +static void *get_from_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) +{ + struct slab *slab, *slab2; + unsigned long flags; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partial() + * partial slab and there is none available then get_from_partial() * will return NULL. 
*/ if (!n || !n->nr_partial) @@ -3797,54 +3741,55 @@ static struct slab *get_partial_node(struct kmem_cache *s, else if (!spin_trylock_irqsave(&n->list_lock, flags)) return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + + struct freelist_counters old, new; + if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - void *object = alloc_single_from_partial(s, n, slab, + object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) { - partial = slab; - pc->object = object; + if (object) break; - } continue; } - remove_partial(n, slab); + /* + * get a single object from the slab. This might race against + * __slab_free(), which however has to take the list_lock if + * it's about to make the slab fully free. + */ + do { + old.freelist = slab->freelist; + old.counters = slab->counters; - if (!partial) { - partial = slab; - stat(s, ALLOC_FROM_PARTIAL); + new.freelist = get_freepointer(s, old.freelist); + new.counters = old.counters; + new.inuse++; - if ((slub_get_cpu_partial(s) == 0)) { - break; - } - } else { - put_cpu_partial(s, slab, 0); - stat(s, CPU_PARTIAL_NODE); + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { - break; - } - } + object = old.freelist; + if (!new.freelist) + remove_partial(n, slab); + + break; } spin_unlock_irqrestore(&n->list_lock, flags); - return partial; + return object; } /* - * Get a slab from somewhere. Search in increasing NUMA distances. + * Get an object from somewhere. Search in increasing NUMA distances. */ -static struct slab *get_any_partial(struct kmem_cache *s, - struct partial_context *pc) +static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -3879,8 +3824,10 @@ static struct slab *get_any_partial(struct kmem_cache *s, if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - slab = get_partial_node(s, n, pc); - if (slab) { + + void *object = get_from_partial_node(s, n, pc); + + if (object) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -3888,7 +3835,7 @@ static struct slab *get_any_partial(struct kmem_cache *s, * between allocation and the cpuset * update */ - return slab; + return object; } } } @@ -3898,424 +3845,29 @@ static struct slab *get_any_partial(struct kmem_cache *s, } /* - * Get a partial slab, lock it and return it. + * Get an object from a partial slab */ -static struct slab *get_partial(struct kmem_cache *s, int node, - struct partial_context *pc) +static void *get_from_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - struct slab *slab; int searchnode = node; + void *object; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) - return slab; + object = get_from_partial_node(s, get_node(s, searchnode), pc); + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return object; - return get_any_partial(s, pc); -} - -#ifdef CONFIG_PREEMPTION -/* - * Calculate the next globally unique transaction for disambiguation - * during cmpxchg. 
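/*
 * Illustrative sketch, not part of the patch: how a bulk refill path (such
 * as refill_objects()) might use get_partial_node_bulk() above. Because each
 * slab's free count is read racily, the detached slabs may add up to more
 * than max_objects and the caller has to cope with the surplus. Consuming a
 * detached slab is sketched after get_freelist_nofreeze() further below.
 */
static void bulk_refill_sketch(struct kmem_cache *s, int node, gfp_t gfp,
			       unsigned int min, unsigned int max)
{
	struct partial_bulk_context pc = {
		.flags = gfp,
		.min_objects = min,
		.max_objects = max,
	};
	struct slab *slab, *slab2;

	if (!get_partial_node_bulk(s, get_node(s, node), &pc, true))
		return;

	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
		list_del(&slab->slab_list);
		/* detach the freelist and hand out the objects, see below */
	}
}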
The transactions start with the cpu number and are then - * incremented by CONFIG_NR_CPUS. - */ -#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) -#else -/* - * No preemption supported therefore also no need to check for - * different cpus. - */ -#define TID_STEP 1 -#endif /* CONFIG_PREEMPTION */ - -static inline unsigned long next_tid(unsigned long tid) -{ - return tid + TID_STEP; -} - -#ifdef SLUB_DEBUG_CMPXCHG -static inline unsigned int tid_to_cpu(unsigned long tid) -{ - return tid % TID_STEP; -} - -static inline unsigned long tid_to_event(unsigned long tid) -{ - return tid / TID_STEP; -} -#endif - -static inline unsigned int init_tid(int cpu) -{ - return cpu; -} - -static inline void note_cmpxchg_failure(const char *n, - const struct kmem_cache *s, unsigned long tid) -{ -#ifdef SLUB_DEBUG_CMPXCHG - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - - pr_info("%s %s: cmpxchg redo ", n, s->name); - - if (IS_ENABLED(CONFIG_PREEMPTION) && - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { - pr_warn("due to cpu change %d -> %d\n", - tid_to_cpu(tid), tid_to_cpu(actual_tid)); - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { - pr_warn("due to cpu running other code. Event %ld->%ld\n", - tid_to_event(tid), tid_to_event(actual_tid)); - } else { - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", - actual_tid, tid, next_tid(tid)); - } -#endif - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); -} - -static void init_kmem_cache_cpus(struct kmem_cache *s) -{ -#ifdef CONFIG_PREEMPT_RT - /* - * Register lockdep key for non-boot kmem caches to avoid - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() - */ - bool finegrain_lockdep = !init_section_contains(s, 1); -#else - /* - * Don't bother with different lockdep classes for each - * kmem_cache, since we only use local_trylock_irqsave(). - */ - bool finegrain_lockdep = false; -#endif - int cpu; - struct kmem_cache_cpu *c; - - if (finegrain_lockdep) - lockdep_register_key(&s->lock_key); - for_each_possible_cpu(cpu) { - c = per_cpu_ptr(s->cpu_slab, cpu); - local_trylock_init(&c->lock); - if (finegrain_lockdep) - lockdep_set_class(&c->lock, &s->lock_key); - c->tid = init_tid(cpu); - } -} - -/* - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, - * unfreezes the slabs and puts it on the proper list. - * Assumes the slab has been already safely taken away from kmem_cache_cpu - * by the caller. - */ -static void deactivate_slab(struct kmem_cache *s, struct slab *slab, - void *freelist) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int free_delta = 0; - void *nextfree, *freelist_iter, *freelist_tail; - int tail = DEACTIVATE_TO_HEAD; - unsigned long flags = 0; - struct freelist_counters old, new; - - if (READ_ONCE(slab->freelist)) { - stat(s, DEACTIVATE_REMOTE_FREES); - tail = DEACTIVATE_TO_TAIL; - } - - /* - * Stage one: Count the objects on cpu's freelist as free_delta and - * remember the last object in freelist_tail for later splicing. - */ - freelist_tail = NULL; - freelist_iter = freelist; - while (freelist_iter) { - nextfree = get_freepointer(s, freelist_iter); - - /* - * If 'nextfree' is invalid, it is possible that the object at - * 'freelist_iter' is already corrupted. So isolate all objects - * starting at 'freelist_iter' by skipping them. 
- */ - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) - break; - - freelist_tail = freelist_iter; - free_delta++; - - freelist_iter = nextfree; - } - - /* - * Stage two: Unfreeze the slab while splicing the per-cpu - * freelist to the head of slab's freelist. - */ - do { - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - new.frozen = 0; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else { - new.freelist = old.freelist; - } - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - - /* - * Stage three: Manipulate the slab list based on the updated state. - */ - if (!new.inuse && n->nr_partial >= s->min_partial) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } else if (new.freelist) { - spin_lock_irqsave(&n->list_lock, flags); - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else { - stat(s, DEACTIVATE_FULL); - } -} - -/* - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock - * can be acquired without a deadlock before invoking the function. - * - * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), - * and kmalloc() is not used in an unsupported context. - * - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but - * lockdep_assert() will catch a bug in case: - * #1 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() - * or - * #2 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() - * - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt - * disabled context. The lock will always be acquired and if needed it - * block and sleep until the lock is available. - * #1 is possible in !PREEMPT_RT only. 
- * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) - * - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B - */ -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) -#define local_lock_cpu_slab(s, flags) \ - local_lock_irqsave(&(s)->cpu_slab->lock, flags) -#else -#define local_lock_cpu_slab(s, flags) \ - do { \ - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ - lockdep_assert(__l); \ - } while (0) -#endif - -#define local_unlock_cpu_slab(s, flags) \ - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) - -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) -{ - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct slab *slab, *slab_to_discard = NULL; - unsigned long flags = 0; - - while (partial_slab) { - slab = partial_slab; - partial_slab = slab->next; - - n2 = get_node(s, slab_nid(slab)); - if (n != n2) { - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - n = n2; - spin_lock_irqsave(&n->list_lock, flags); - } - - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { - slab->next = slab_to_discard; - slab_to_discard = slab; - } else { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } - - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - while (slab_to_discard) { - slab = slab_to_discard; - slab_to_discard = slab_to_discard->next; - - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } -} - -/* - * Put all the cpu partial slabs to the node partial list. - */ -static void put_partials(struct kmem_cache *s) -{ - struct slab *partial_slab; - unsigned long flags; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_slab = this_cpu_read(s->cpu_slab->partial); - this_cpu_write(s->cpu_slab->partial, NULL); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (partial_slab) - __put_partials(s, partial_slab); -} - -static void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - struct slab *partial_slab; - - partial_slab = slub_percpu_partial(c); - c->partial = NULL; - - if (partial_slab) - __put_partials(s, partial_slab); -} - -/* - * Put a slab into a partial slab slot if available. - * - * If we did not find a slot then simply move all the partials to the - * per node partial list. - */ -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) -{ - struct slab *oldslab; - struct slab *slab_to_put = NULL; - unsigned long flags; - int slabs = 0; - - local_lock_cpu_slab(s, flags); - - oldslab = this_cpu_read(s->cpu_slab->partial); - - if (oldslab) { - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { - /* - * Partial array is full. Move the existing set to the - * per node partial list. Postpone the actual unfreezing - * outside of the critical section. 
- */ - slab_to_put = oldslab; - oldslab = NULL; - } else { - slabs = oldslab->slabs; - } - } - - slabs++; - - slab->slabs = slabs; - slab->next = oldslab; - - this_cpu_write(s->cpu_slab->partial, slab); - - local_unlock_cpu_slab(s, flags); - - if (slab_to_put) { - __put_partials(s, slab_to_put); - stat(s, CPU_PARTIAL_DRAIN); - } -} - -#else /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void put_partials(struct kmem_cache *s) { } -static inline void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } - -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) -{ - unsigned long flags; - struct slab *slab; - void *freelist; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - - slab = c->slab; - freelist = c->freelist; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } -} - -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - void *freelist = c->freelist; - struct slab *slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } - - put_partials_cpu(s, c); -} - -static inline void flush_this_cpu_slab(struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} - -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab || slub_percpu_partial(c); + return get_from_any_partial(s, pc); } static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) return false; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); @@ -4324,11 +3876,11 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) } /* - * Flush cpu slab. + * Flush percpu sheaves * * Called from CPU work handler with migration disabled. 
*/ -static void flush_cpu_slab(struct work_struct *w) +static void flush_cpu_sheaves(struct work_struct *w) { struct kmem_cache *s; struct slub_flush_work *sfw; @@ -4337,10 +3889,8 @@ static void flush_cpu_slab(struct work_struct *w) s = sfw->s; - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) pcs_flush_all(s); - - flush_this_cpu_slab(s); } static void flush_all_cpus_locked(struct kmem_cache *s) @@ -4353,11 +3903,11 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { + if (!has_pcs_used(cpu, s)) { sfw->skip = true; continue; } - INIT_WORK(&sfw->work, flush_cpu_slab); + INIT_WORK(&sfw->work, flush_cpu_sheaves); sfw->skip = false; sfw->s = s; queue_work_on(cpu, flushwq, &sfw->work); @@ -4442,7 +3992,7 @@ void flush_all_rcu_sheaves(void) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) continue; flush_rcu_sheaves_on_cache(s); } @@ -4463,27 +4013,13 @@ static int slub_cpu_dead(unsigned int cpu) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - __flush_cpu_slab(s, cpu); - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) __pcs_flush_all_cpu(s, cpu); } mutex_unlock(&slab_mutex); return 0; } -/* - * Check if the objects in a per cpu structure fit numa - * locality expectations. - */ -static inline int node_match(struct slab *slab, int node) -{ -#ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && slab_nid(slab) != node) - return 0; -#endif - return 1; -} - #ifdef CONFIG_SLUB_DEBUG static int count_free(struct slab *slab) { @@ -4656,52 +4192,15 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -static inline bool -__update_cpu_freelist_fast(struct kmem_cache *s, - void *freelist_old, void *freelist_new, - unsigned long tid) -{ - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, - &old.freelist_tid, new.freelist_tid); -} - /* - * Check the slab->freelist and either transfer the freelist to the - * per cpu freelist or deactivate the slab. + * Get the slab's freelist and do not freeze it. * - * The slab is still frozen if the return value is not NULL. + * Assumes the slab is isolated from node partial list and not frozen. * - * If this function returns NULL then the slab has been unfrozen. + * Assumes this is performed only for caches without debugging so we + * don't need to worry about adding the slab to the full list. */ -static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) -{ - struct freelist_counters old, new; - - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - - do { - old.freelist = slab->freelist; - old.counters = slab->counters; - - new.freelist = NULL; - new.counters = old.counters; - - new.inuse = old.objects; - new.frozen = old.freelist != NULL; - - - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); - - return old.freelist; -} - -/* - * Freeze the partial slab and return the pointer to the freelist. 
- */ -static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) { struct freelist_counters old, new; @@ -4711,458 +4210,15 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) new.freelist = NULL; new.counters = old.counters; - VM_BUG_ON(new.frozen); + VM_WARN_ON_ONCE(new.frozen); new.inuse = old.objects; - new.frozen = 1; - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); return old.freelist; } -/* - * Slow path. The lockless freelist is empty or we need to perform - * debugging duties. - * - * Processing is still very fast if new objects have been freed to the - * regular freelist. In that case we simply take over the regular freelist - * as the lockless freelist and zap the regular freelist. - * - * If that is not working then we fall back to the partial lists. We take the - * first element of the freelist as the object to allocate now and move the - * rest of the freelist to the lockless freelist. - * - * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is the slowest path since it involves - * a call to the page allocator and the setup of a new slab. - * - * Version of __slab_alloc to use when we know that preemption is - * already disabled (which is the case for bulk allocation). - */ -static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) -{ - bool allow_spin = gfpflags_allow_spinning(gfpflags); - void *freelist; - struct slab *slab; - unsigned long flags; - struct partial_context pc; - bool try_thisnode = true; - - stat(s, ALLOC_SLOWPATH); - -reread_slab: - - slab = READ_ONCE(c->slab); - if (!slab) { - /* - * if the node is not online or has no normal memory, just - * ignore the node constraint - */ - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; - goto new_slab; - } - - if (unlikely(!node_match(slab, node))) { - /* - * same as above but node_match() being false already - * implies node != NUMA_NO_NODE. - * - * We don't strictly honor pfmemalloc and NUMA preferences - * when !allow_spin because: - * - * 1. Most kmalloc() users allocate objects on the local node, - * so kmalloc_nolock() tries not to interfere with them by - * deactivating the cpu slab. - * - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause - * unnecessary slab allocations even when n->partial list - * is not empty. 
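/*
 * Illustrative sketch, not part of the patch: consuming a freelist detached
 * with get_freelist_nofreeze() above. Once the freelist has been taken over
 * and the slab marked fully in use, the detached chain is private to the
 * caller and can be walked with get_freepointer() without further
 * synchronization; concurrent frees only touch slab->freelist via
 * __slab_free().
 */
static unsigned int freelist_to_array_sketch(struct kmem_cache *s,
					     struct slab *slab,
					     void **p, unsigned int max)
{
	void *object = get_freelist_nofreeze(s, slab);
	unsigned int nr = 0;

	while (object && nr < max) {
		p[nr++] = object;
		object = get_freepointer(s, object);
	}

	/* leftover objects would have to be freed back; omitted here */
	return nr;
}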
- */ - if (!node_isset(node, slab_nodes) || - !allow_spin) { - node = NUMA_NO_NODE; - } else { - stat(s, ALLOC_NODE_MISMATCH); - goto deactivate_slab; - } - } - - /* - * By rights, we should be searching for a slab page that was - * PFMEMALLOC but right now, we are losing the pfmemalloc - * information when the page leaves the per-cpu allocator - */ - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) - goto deactivate_slab; - - /* must check again c->slab in case we got preempted and it changed */ - local_lock_cpu_slab(s, flags); - - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - if (freelist) - goto load_freelist; - - freelist = get_freelist(s, slab); - - if (!freelist) { - c->slab = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - stat(s, DEACTIVATE_BYPASS); - goto new_slab; - } - - stat(s, ALLOC_REFILL); - -load_freelist: - - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - - /* - * freelist is pointing to the list of objects to be used. - * slab is pointing to the slab from which the objects are obtained. - * That slab must be frozen for per cpu allocations to work. - */ - VM_BUG_ON(!c->slab->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - return freelist; - -deactivate_slab: - - local_lock_cpu_slab(s, flags); - if (slab != c->slab) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - deactivate_slab(s, slab, freelist); - -new_slab: - -#ifdef CONFIG_SLUB_CPU_PARTIAL - while (slub_percpu_partial(c)) { - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - if (unlikely(!slub_percpu_partial(c))) { - local_unlock_cpu_slab(s, flags); - /* we were preempted and partial list got empty */ - goto new_objects; - } - - slab = slub_percpu_partial(c); - slub_set_percpu_partial(c, slab); - - if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags)) || - !allow_spin) { - c->slab = slab; - freelist = get_freelist(s, slab); - VM_BUG_ON(!freelist); - stat(s, CPU_PARTIAL_ALLOC); - goto load_freelist; - } - - local_unlock_cpu_slab(s, flags); - - slab->next = NULL; - __put_partials(s, slab); - } -#endif - -new_objects: - - pc.flags = gfpflags; - /* - * When a preferred node is indicated but no __GFP_THISNODE - * - * 1) try to get a partial slab from target node only by having - * __GFP_THISNODE in pc.flags for get_partial() - * 2) if 1) failed, try to allocate a new slab from target node with - * GPF_NOWAIT | __GFP_THISNODE opportunistically - * 3) if 2) failed, retry with original gfpflags which will allow - * get_partial() try partial lists of other nodes before potentially - * allocating new page from other nodes - */ - if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode)) { - if (unlikely(!allow_spin)) - /* Do not upgrade gfp to NOWAIT from more restrictive mode */ - pc.flags = gfpflags | __GFP_THISNODE; - else - pc.flags = GFP_NOWAIT | __GFP_THISNODE; - } - - pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - if (slab) { - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = pc.object; - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the - * tracking info and return the object. 
- * - * Due to disabled preemption we need to disallow - * blocking. The flags are further adjusted by - * gfp_nested_mask() in stack_depot itself. - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; - } - - freelist = freeze_slab(s, slab); - goto retry_load_slab; - } - - slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, pc.flags, node); - c = slub_get_cpu_ptr(s->cpu_slab); - - if (unlikely(!slab)) { - if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) - && try_thisnode) { - try_thisnode = false; - goto new_objects; - } - slab_out_of_memory(s, gfpflags, node); - return NULL; - } - - stat(s, ALLOC_SLAB); - - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - - if (unlikely(!freelist)) { - /* This could cause an endless loop. Fail instead. */ - if (!allow_spin) - return NULL; - goto new_objects; - } - - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; - } - - /* - * No other reference to the slab yet so we can - * muck around with it freely without cmpxchg - */ - freelist = slab->freelist; - slab->freelist = NULL; - slab->inuse = slab->objects; - slab->frozen = 1; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { - /* - * For !pfmemalloc_match() case we don't load freelist so that - * we don't make further mismatched allocations easier. - */ - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; - } - -retry_load_slab: - - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - void *flush_freelist = c->freelist; - struct slab *flush_slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_cpu_slab(s, flags); - - if (unlikely(!allow_spin)) { - /* Reentrant slub cannot take locks, defer */ - defer_deactivate_slab(flush_slab, flush_freelist); - } else { - deactivate_slab(s, flush_slab, flush_freelist); - } - - stat(s, CPUSLAB_FLUSH); - - goto retry_load_slab; - } - c->slab = slab; - - goto load_freelist; -} -/* - * We disallow kprobes in ___slab_alloc() to prevent reentrance - * - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() - * manipulating c->freelist without lock. - * - * This does not prevent kprobe in functions called from ___slab_alloc() such as - * local_lock_irqsave() itself, and that is fine, we only need to protect the - * c->freelist manipulation in ___slab_alloc() itself. - */ -NOKPROBE_SYMBOL(___slab_alloc); - -/* - * A wrapper for ___slab_alloc() for contexts where preemption is not yet - * disabled. Compensates for possible cpu changes by refetching the per cpu area - * pointer. - */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) -{ - void *p; - -#ifdef CONFIG_PREEMPT_COUNT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling preemption. Need to reload cpu area - * pointer. 
- */ - c = slub_get_cpu_ptr(s->cpu_slab); -#endif - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { - if (local_lock_is_locked(&s->cpu_slab->lock)) { - /* - * EBUSY is an internal signal to kmalloc_nolock() to - * retry a different bucket. It's not propagated - * to the caller. - */ - p = ERR_PTR(-EBUSY); - goto out; - } - } - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); -out: -#ifdef CONFIG_PREEMPT_COUNT - slub_put_cpu_ptr(s->cpu_slab); -#endif - return p; -} - -static __always_inline void *__slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) -{ - struct kmem_cache_cpu *c; - struct slab *slab; - unsigned long tid; - void *object; - -redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. - * - * We must guarantee that tid and kmem_cache_cpu are retrieved on the - * same cpu. We read first the kmem_cache_cpu pointer and use it to read - * the tid. If we are preempted and switched to another cpu between the - * two reads, it's OK as the two are still associated with the same cpu - * and cmpxchg later will validate the cpu. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* - * Irqless object alloc/free algorithm used here depends on sequence - * of fetching cpu_slab's data. tid should be fetched before anything - * on c to guarantee that object and slab associated with previous tid - * won't be used with current tid. If we fetch tid first, object and - * slab could be one associated with next tid and our alloc/free - * request will be failed. In this case, we will retry. So, no problem. - */ - barrier(); - - /* - * The transaction ids are globally unique per cpu and per operation on - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double - * occurs on the right processor and that there was no operation on the - * linked list in between. - */ - - object = c->freelist; - slab = c->slab; - -#ifdef CONFIG_NUMA - if (static_branch_unlikely(&strict_numa) && - node == NUMA_NO_NODE) { - - struct mempolicy *mpol = current->mempolicy; - - if (mpol) { - /* - * Special BIND rule support. If existing slab - * is in permitted set then do not redirect - * to a particular node. - * Otherwise we apply the memory policy to get - * the node we need to allocate on. - */ - if (mpol->mode != MPOL_BIND || !slab || - !node_isset(slab_nid(slab), mpol->nodes)) - - node = mempolicy_slab_node(); - } - } -#endif - - if (!USE_LOCKLESS_FAST_PATH() || - unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); - } else { - void *next_object = get_freepointer_safe(s, object); - - /* - * The cmpxchg will only match if there was no additional - * operation and if we are on the right processor. - * - * The cmpxchg does the following atomically (without lock - * semantics!) - * 1. Relocate first pointer to the current per cpu area. - * 2. Verify that tid and freelist have not been changed - * 3. If they were not changed replace tid and freelist - * - * Since this is without lock semantics the protection is only - * against code executing on this cpu *not* from access by - * other cpus. 
- */ - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { - note_cmpxchg_failure("slab_alloc", s, tid); - goto redo; - } - prefetch_freepointer(s, next_object); - stat(s, ALLOC_FASTPATH); - } - - return object; -} - /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. @@ -5178,6 +4234,174 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 0, sizeof(void *)); } +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, + void **p, unsigned int count, bool allow_spin) +{ + unsigned int allocated = 0; + struct kmem_cache_node *n; + bool needs_add_partial; + unsigned long flags; + void *object; + + /* + * Are we going to put the slab on the partial list? + * Note slab->inuse is 0 on a new slab. + */ + needs_add_partial = (slab->objects > count); + + if (!allow_spin && needs_add_partial) { + + n = get_node(s, slab_nid(slab)); + + if (!spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + free_new_slab_nolock(s, slab); + return 0; + } + } + + object = slab->freelist; + while (object && allocated < count) { + p[allocated] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[allocated]); + + slab->inuse++; + allocated++; + } + slab->freelist = object; + + if (needs_add_partial) { + + if (allow_spin) { + n = get_node(s, slab_nid(slab)); + spin_lock_irqsave(&n->list_lock, flags); + } + add_partial(n, slab, ADD_TO_HEAD); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + inc_slabs_node(s, slab_nid(slab), slab->objects); + return allocated; +} + +/* + * Slow path. We failed to allocate via percpu sheaves or they are not available + * due to bootstrap or debugging enabled or SLUB_TINY. + * + * We try to allocate from partial slab lists and fall back to allocating a new + * slab. 
+ */
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+		unsigned long addr, unsigned int orig_size)
+{
+	bool allow_spin = gfpflags_allow_spinning(gfpflags);
+	void *object;
+	struct slab *slab;
+	struct partial_context pc;
+	bool try_thisnode = true;
+
+	stat(s, ALLOC_SLOWPATH);
+
+new_objects:
+
+	pc.flags = gfpflags;
+	/*
+	 * When a preferred node is indicated but no __GFP_THISNODE is set:
+	 *
+	 * 1) try to get a partial slab from target node only by having
+	 *    __GFP_THISNODE in pc.flags for get_from_partial()
+	 * 2) if 1) failed, try to allocate a new slab from target node with
+	 *    GFP_NOWAIT | __GFP_THISNODE opportunistically
+	 * 3) if 2) failed, retry with original gfpflags which will allow
+	 *    get_from_partial() to try partial lists of other nodes before
+	 *    potentially allocating a new page from other nodes
+	 */
+	if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+		     && try_thisnode)) {
+		if (unlikely(!allow_spin))
+			/* Do not upgrade gfp to NOWAIT from more restrictive mode */
+			pc.flags = gfpflags | __GFP_THISNODE;
+		else
+			pc.flags = GFP_NOWAIT | __GFP_THISNODE;
+	}
+
+	pc.orig_size = orig_size;
+	object = get_from_partial(s, node, &pc);
+	if (object)
+		goto success;
+
+	slab = new_slab(s, pc.flags, node);
+
+	if (unlikely(!slab)) {
+		if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
+		    && try_thisnode) {
+			try_thisnode = false;
+			goto new_objects;
+		}
+		slab_out_of_memory(s, gfpflags, node);
+		return NULL;
+	}
+
+	stat(s, ALLOC_SLAB);
+
+	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+		object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
+
+		if (likely(object))
+			goto success;
+	} else {
+		alloc_from_new_slab(s, slab, &object, 1, allow_spin);
+
+		/* we don't need to check SLAB_STORE_USER here */
+		if (likely(object))
+			return object;
+	}
+
+	if (allow_spin)
+		goto new_objects;
+
+	/* This could cause an endless loop. Fail instead. */
+	return NULL;
+
+success:
+	if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+		set_track(s, object, TRACK_ALLOC, addr, gfpflags);
+
+	return object;
+}
+
+static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
+		gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
+{
+	void *object;
+
+#ifdef CONFIG_NUMA
+	if (static_branch_unlikely(&strict_numa) &&
+			node == NUMA_NO_NODE) {
+
+		struct mempolicy *mpol = current->mempolicy;
+
+		if (mpol) {
+			/*
+			 * Special BIND rule support. If the local node
+			 * is in the permitted set then do not redirect
+			 * to a particular node.
+			 * Otherwise we apply the memory policy to get
+			 * the node we need to allocate on.
+ */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + node = mempolicy_slab_node(); + } + } +#endif + + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); + + return object; +} + static __fastpath_inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { @@ -5264,6 +4488,12 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); return pcs; @@ -5275,7 +4505,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5292,7 +4523,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, empty = pcs->spare; pcs->spare = NULL; } else { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); } } @@ -5334,7 +4565,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, */ if (pcs->main->size == 0) { - barn_put_empty_sheaf(barn, pcs->main); + if (!pcs->spare) + pcs->spare = pcs->main; + else + barn_put_empty_sheaf(barn, pcs->main); pcs->main = full; return pcs; } @@ -5391,8 +4625,10 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * We assume the percpu sheaves contain only local objects although it's * not completely guaranteed, so we verify later. */ - if (unlikely(node_requested && node != numa_mem_id())) + if (unlikely(node_requested && node != numa_mem_id())) { + stat(s, ALLOC_NODE_MISMATCH); return NULL; + } if (!local_trylock(&s->cpu_sheaves->lock)) return NULL; @@ -5415,6 +4651,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) */ if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); + stat(s, ALLOC_NODE_MISMATCH); return NULL; } } @@ -5423,13 +4660,14 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) local_unlock(&s->cpu_sheaves->lock); - stat(s, ALLOC_PCS); + stat(s, ALLOC_FASTPATH); return object; } static __fastpath_inline -unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, + void **p) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *main; @@ -5447,6 +4685,11 @@ next_batch: struct slab_sheaf *full; struct node_barn *barn; + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); goto do_alloc; @@ -5458,7 +4701,8 @@ next_batch: return allocated; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5488,7 +4732,7 @@ do_alloc: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, ALLOC_PCS, batch); + stat_add(s, ALLOC_FASTPATH, batch); allocated += batch; @@ -5526,8 +4770,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves) - object = alloc_from_pcs(s, gfpflags, node); + object = alloc_from_pcs(s, gfpflags, node); if (!object) object 
= __slab_alloc_node(s, gfpflags, node, addr, orig_size); @@ -5622,6 +4865,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, return ret; } +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5635,18 +4881,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) struct slab_sheaf *sheaf = NULL; struct node_barn *barn; - if (unlikely(size > s->sheaf_capacity)) { + if (unlikely(!size)) + return NULL; - /* - * slab_debug disables cpu sheaves intentionally so all - * prefilled sheaves become "oversize" and we give up on - * performance for the debugging. Same with SLUB_TINY. - * Creating a cache without sheaves and then requesting a - * prefilled sheaf is however not expected, so warn. - */ - WARN_ON_ONCE(s->sheaf_capacity == 0 && - !IS_ENABLED(CONFIG_SLUB_TINY) && - !(s->flags & SLAB_DEBUG_FLAGS)); + if (unlikely(size > s->sheaf_capacity)) { sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); if (!sheaf) @@ -5968,7 +5206,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; struct kmem_cache *s; bool can_retry = true; - void *ret = ERR_PTR(-EBUSY); + void *ret; VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | __GFP_NO_OBJ_EXT)); @@ -5976,13 +5214,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) - /* - * kmalloc_nolock() in PREEMPT_RT is not supported from - * non-preemptible context because local_lock becomes a - * sleeping lock on RT. - */ + /* + * See the comment for the same check in + * alloc_frozen_pages_nolock_noprof() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; @@ -5991,50 +5229,47 @@ retry: if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) /* * kmalloc_nolock() is not supported on architectures that - * don't implement cmpxchg16b, but debug caches don't use - * per-cpu slab and per-cpu partial slabs. They rely on - * kmem_cache_node->list_lock, so kmalloc_nolock() can - * attempt to allocate from debug caches by + * don't implement cmpxchg16b and thus need slab_lock() + * which could be preempted by a nmi. + * But debug caches don't use that and only rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt + * to allocate from debug caches by * spin_trylock_irqsave(&n->list_lock, ...) */ return NULL; + ret = alloc_from_pcs(s, alloc_gfp, node); + if (ret) + goto success; + /* * Do not call slab_alloc_node(), since trylock mode isn't * compatible with slab_pre_alloc_hook/should_failslab and * kfence_alloc. Hence call __slab_alloc_node() (at most twice) * and slab_post_alloc_hook() directly. - * - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair - * in irq saved region. It assumes that the same cpu will not - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. - * Therefore use in_nmi() to check whether particular bucket is in - * irq protected section. - * - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that - * this cpu was interrupted somewhere inside ___slab_alloc() after - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 
- * In this case fast path with __update_cpu_freelist_fast() is not safe. */ - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); - if (PTR_ERR(ret) == -EBUSY) { - if (can_retry) { - /* pick the next kmalloc bucket */ - size = s->object_size + 1; - /* - * Another alternative is to - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; - * to retry from bucket of the same size. - */ - can_retry = false; - goto retry; - } - ret = NULL; + /* + * It's possible we failed due to trylock as we preempted someone with + * the sheaves locked, and the list_lock is also held by another cpu. + * But it should be rare that multiple kmalloc buckets would have + * sheaves locked, so try a larger one. + */ + if (!ret && can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; } +success: maybe_wipe_obj_freeptr(s, ret); slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, slab_want_init_on_alloc(alloc_gfp, s), size); @@ -6116,7 +5351,7 @@ static noinline void free_to_partial_list( /* was on full list */ remove_full(s, n, slab); if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } else if (slab_free) { @@ -6154,26 +5389,17 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - bool was_frozen, was_full; + bool was_full; struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; - stat(s, FREE_SLOWPATH); - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; } - /* - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) - * is the only other reason it can be false, and it is already handled - * above. - */ - do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); @@ -6184,7 +5410,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, old.counters = slab->counters; was_full = (old.freelist == NULL); - was_frozen = old.frozen; set_freepointer(s, tail, old.freelist); @@ -6197,53 +5422,29 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * to (due to not being full anymore) the partial list. * Unless it's frozen. */ - if ((!new.inuse || was_full) && !was_frozen) { + if (!new.inuse || was_full) { + + n = get_node(s, slab_nid(slab)); /* - * If slab becomes non-full and we have cpu partial - * lists, we put it there unconditionally to avoid - * taking the list_lock. Otherwise we need it. + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. */ - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { + spin_lock_irqsave(&n->list_lock, flags); - n = get_node(s, slab_nid(slab)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. 
- * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); - - on_node_partial = slab_test_node_partial(slab); - } + on_node_partial = slab_test_node_partial(slab); } } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { - - if (likely(was_frozen)) { - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - stat(s, FREE_FROZEN); - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { - /* - * If we started with a full slab then put it onto the - * per cpu partial list. - */ - put_cpu_partial(s, slab, 1); - stat(s, CPU_PARTIAL_FREE); - } - /* - * In other cases we didn't take the list_lock because the slab - * was already on the partial list and will remain there. + * We didn't take the list_lock because the slab was already on + * the partial list and will remain there. */ - return; } @@ -6265,11 +5466,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, /* * Objects left in the slab. If it was not on the partial list before - * then add it. This can only happen when cache has no per cpu partial - * list otherwise we would have put it there. + * then add it. */ - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + if (unlikely(was_full)) { + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -6355,7 +5555,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s, * unlocked. */ static struct slub_percpu_sheaves * -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, + bool allow_spin) { struct slab_sheaf *empty; struct node_barn *barn; @@ -6364,6 +5565,12 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) restart: lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + barn = get_barn(s); if (!barn) { local_unlock(&s->cpu_sheaves->lock); @@ -6373,7 +5580,7 @@ restart: put_fail = false; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, allow_spin); if (empty) { pcs->spare = pcs->main; pcs->main = empty; @@ -6387,7 +5594,7 @@ restart: return pcs; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); if (!IS_ERR(empty)) { stat(s, BARN_PUT); @@ -6395,7 +5602,8 @@ restart: return pcs; } - if (PTR_ERR(empty) == -E2BIG) { + /* sheaf_flush_unused() doesn't support !allow_spin */ + if (PTR_ERR(empty) == -E2BIG && allow_spin) { /* Since we got here, spare exists and is full */ struct slab_sheaf *to_flush = pcs->spare; @@ -6420,6 +5628,14 @@ restart: alloc_empty: local_unlock(&s->cpu_sheaves->lock); + /* + * alloc_empty_sheaf() doesn't support !allow_spin and it's + * easier to fall back to freeing directly without sheaves + * than add the support (and to sheaf_flush_unused() above) + */ + if (!allow_spin) + return NULL; + empty = alloc_empty_sheaf(s, GFP_NOWAIT); if (empty) goto got_empty; @@ -6462,7 +5678,7 @@ got_empty: * The object is expected to have passed slab_free_hook() already. 
*/ static __fastpath_inline -bool free_to_pcs(struct kmem_cache *s, void *object) +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) { struct slub_percpu_sheaves *pcs; @@ -6473,7 +5689,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) if (unlikely(pcs->main->size == s->sheaf_capacity)) { - pcs = __pcs_replace_full_main(s, pcs); + pcs = __pcs_replace_full_main(s, pcs, allow_spin); if (unlikely(!pcs)) return false; } @@ -6482,7 +5698,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) local_unlock(&s->cpu_sheaves->lock); - stat(s, FREE_PCS); + stat(s, FREE_FASTPATH); return true; } @@ -6580,6 +5796,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) struct slab_sheaf *empty; struct node_barn *barn; + /* Bootstrap or debug cache, fall back */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + if (pcs->spare && pcs->spare->size == 0) { pcs->rcu_free = pcs->spare; pcs->spare = NULL; @@ -6592,7 +5814,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) goto fail; } - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (empty) { pcs->rcu_free = empty; @@ -6712,7 +5934,7 @@ next_batch: goto no_empty; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (!empty) goto no_empty; @@ -6726,7 +5948,7 @@ next_batch: goto do_free; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, true); if (IS_ERR(empty)) { stat(s, BARN_PUT_FAIL); goto no_empty; @@ -6744,7 +5966,7 @@ do_free: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, FREE_PCS, batch); + stat_add(s, FREE_FASTPATH, batch); if (batch < size) { p += batch; @@ -6766,10 +5988,12 @@ no_empty: */ fallback: __kmem_cache_free_bulk(s, size, p); + stat_add(s, FREE_SLOWPATH, size); flush_remote: if (remote_nr) { __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + stat_add(s, FREE_SLOWPATH, remote_nr); if (i < size) { remote_nr = 0; goto next_remote_batch; @@ -6779,7 +6003,6 @@ flush_remote: struct defer_free { struct llist_head objects; - struct llist_head slabs; struct irq_work work; }; @@ -6787,23 +6010,21 @@ static void free_deferred_objects(struct irq_work *work); static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { .objects = LLIST_HEAD_INIT(objects), - .slabs = LLIST_HEAD_INIT(slabs), .work = IRQ_WORK_INIT(free_deferred_objects), }; /* * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * to take sleeping spin_locks from __slab_free(). * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
*/ static void free_deferred_objects(struct irq_work *work) { struct defer_free *df = container_of(work, struct defer_free, work); struct llist_head *objs = &df->objects; - struct llist_head *slabs = &df->slabs; struct llist_node *llnode, *pos, *t; - if (llist_empty(objs) && llist_empty(slabs)) + if (llist_empty(objs)) return; llnode = llist_del_all(objs); @@ -6826,16 +6047,7 @@ static void free_deferred_objects(struct irq_work *work) set_freepointer(s, x, NULL); __slab_free(s, slab, x, x, 1, _THIS_IP_); - } - - llnode = llist_del_all(slabs); - llist_for_each_safe(pos, t, llnode) { - struct slab *slab = container_of(pos, struct slab, llnode); - - if (slab->frozen) - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); - else - free_slab(slab->slab_cache, slab); + stat(s, FREE_SLOWPATH); } } @@ -6852,19 +6064,6 @@ static void defer_free(struct kmem_cache *s, void *head) irq_work_queue(&df->work); } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) -{ - struct defer_free *df; - - slab->flush_freelist = flush_freelist; - - guard(preempt)(); - - df = this_cpu_ptr(&defer_free_objects); - if (llist_add(&slab->llnode, &df->slabs)) - irq_work_queue(&df->work); -} - void defer_free_barrier(void) { int cpu; @@ -6873,99 +6072,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -/* - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that - * can perform fastpath freeing without additional function calls. - * - * The fastpath is only possible if we are freeing to the current cpu slab - * of this processor. This typically the case if we have just allocated - * the item before. - * - * If fastpath is not possible then fall back to __slab_free where we deal - * with all sorts of special processing. - * - * Bulk free of a freelist with several objects (all pointing to the - * same slab) possible by specifying head and tail ptr, plus objects - * count (cnt). Bulk free indicated by tail pointer being set. - */ -static __always_inline void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - /* cnt == 0 signals that it's called from kfree_nolock() */ - bool allow_spin = cnt; - struct kmem_cache_cpu *c; - unsigned long tid; - void **freelist; - -redo: - /* - * Determine the currently cpus per cpu slab. - * The cpu may change afterward. However that does not matter since - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succeed. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* Same with comment on barrier() in __slab_alloc_node() */ - barrier(); - - if (unlikely(slab != c->slab)) { - if (unlikely(!allow_spin)) { - /* - * __slab_free() can locklessly cmpxchg16 into a slab, - * but then it might need to take spin_lock or local_lock - * in put_cpu_partial() for further processing. - * Avoid the complexity and simply add to a deferred list. - */ - defer_free(s, head); - } else { - __slab_free(s, slab, head, tail, cnt, addr); - } - return; - } - - if (unlikely(!allow_spin)) { - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && - local_lock_is_locked(&s->cpu_slab->lock)) { - defer_free(s, head); - return; - } - cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ - } - - if (USE_LOCKLESS_FAST_PATH()) { - freelist = READ_ONCE(c->freelist); - - set_freepointer(s, tail, freelist); - - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { - note_cmpxchg_failure("slab_free", s, tid); - goto redo; - } - } else { - __maybe_unused unsigned long flags = 0; - - /* Update the free list under the local lock */ - local_lock_cpu_slab(s, flags); - c = this_cpu_ptr(s->cpu_slab); - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto redo; - } - tid = c->tid; - freelist = c->freelist; - - set_freepointer(s, tail, freelist); - c->freelist = head; - c->tid = next_tid(tid); - - local_unlock_cpu_slab(s, flags); - } - stat_add(s, FREE_FASTPATH, cnt); -} - static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) @@ -6976,14 +6082,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object))) + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { + if (likely(free_to_pcs(s, object, true))) return; } - do_slab_free(s, slab, object, object, 1, addr); + __slab_free(s, slab, object, object, 1, addr); + stat(s, FREE_SLOWPATH); } #ifdef CONFIG_MEMCG @@ -6992,7 +6098,7 @@ static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); + __slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -7006,8 +6112,10 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. */ - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) - do_slab_free(s, slab, head, tail, cnt, addr); + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { + __slab_free(s, slab, head, tail, cnt, addr); + stat_add(s, FREE_SLOWPATH, cnt); + } } #ifdef CONFIG_SLUB_RCU_DEBUG @@ -7032,15 +6140,18 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) return; /* resume freeing */ - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) - do_slab_free(s, slab, object, object, 1, _THIS_IP_); + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { + __slab_free(s, slab, object, object, 1, _THIS_IP_); + stat(s, FREE_SLOWPATH); + } } #endif /* CONFIG_SLUB_RCU_DEBUG */ #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); + stat(cache, FREE_SLOWPATH); } #endif @@ -7340,7 +6451,18 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. 
*/ kasan_slab_free(s, x, false, false, /* skip quarantine */true); - do_slab_free(s, slab, x, x, 0, _RET_IP_); + + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, x, false))) + return; + } + + /* + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might + * need to take spin_lock for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, x); } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -7766,7 +6888,7 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (kfence_free(df.freelist)) continue; - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } @@ -7781,7 +6903,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) * freeing to sheaves is so incompatible with the detached freelist so * once we go that way, we have to do everything differently */ - if (s && s->cpu_sheaves) { + if (s && cache_has_sheaves(s)) { free_to_pcs_bulk(s, size, p); return; } @@ -7799,72 +6921,224 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); +static unsigned int +__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max, struct kmem_cache_node *n, + bool allow_spin) +{ + struct partial_bulk_context pc; + struct slab *slab, *slab2; + unsigned int refilled = 0; + unsigned long flags; + void *object; + + pc.flags = gfp; + pc.min_objects = min; + pc.max_objects = max; + + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) + return 0; + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + + object = get_freelist_nofreeze(s, slab); + + while (object && refilled < max) { + p[refilled] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[refilled]); + + refilled++; + } + + /* + * Freelist had more objects than we can accommodate, we need to + * free them back. We can treat it like a detached freelist, just + * need to find the tail object. 
+ */ + if (unlikely(object)) { + void *head = object; + void *tail; + int cnt = 0; + + do { + tail = object; + cnt++; + object = get_freepointer(s, object); + } while (object); + __slab_free(s, slab, head, tail, cnt, _RET_IP_); + } + + if (refilled >= max) + break; + } + + if (unlikely(!list_empty(&pc.slabs))) { + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) + continue; + + list_del(&slab->slab_list); + add_partial(n, slab, ADD_TO_HEAD); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + /* any slabs left are completely free and for discard */ + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + discard_slab(s, slab); + } + } + + return refilled; +} + +#ifdef CONFIG_NUMA +static unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(gfp); + unsigned int cpuset_mems_cookie; + unsigned int refilled = 0; + + /* see get_from_any_partial() for the defrag ratio description */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return 0; + + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), gfp); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + struct kmem_cache_node *n; + unsigned int r; + + n = get_node(s, zone_to_nid(zone)); + + if (!n || !cpuset_zone_allowed(zone, gfp) || + n->nr_partial <= s->min_partial) + continue; + + r = __refill_objects_node(s, p, gfp, min, max, n, + /* allow_spin = */ false); + refilled += r; + + if (r >= min) { + /* + * Don't check read_mems_allowed_retry() here - + * if mems_allowed was updated in parallel, that + * was a harmless race between allocation and + * the cpuset update + */ + return refilled; + } + p += r; + min -= r; + max -= r; + } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return refilled; +} +#else +static inline unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + return 0; +} +#endif + +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + int local_node = numa_mem_id(); + unsigned int refilled; + struct slab *slab; + + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + return 0; + + refilled = __refill_objects_node(s, p, gfp, min, max, + get_node(s, local_node), + /* allow_spin = */ true); + if (refilled >= min) + return refilled; + + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, + max - refilled); + if (refilled >= min) + return refilled; + +new_slab: + + slab = new_slab(s, gfp, local_node); + if (!slab) + goto out; + + stat(s, ALLOC_SLAB); + + /* + * TODO: possible optimization - if we know we will consume the whole + * slab we might skip creating the freelist? 
+ */ + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, + /* allow_spin = */ true); + + if (refilled < min) + goto new_slab; + +out: + return refilled; +} + static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - struct kmem_cache_cpu *c; - unsigned long irqflags; int i; - /* - * Drain objects in the per cpu slab, while disabling local - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. - */ - c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irqsave(&s->cpu_slab->lock, irqflags); + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + for (i = 0; i < size; i++) { - for (i = 0; i < size; i++) { - void *object = c->freelist; - - if (unlikely(!object)) { - /* - * We may have removed an object from c->freelist using - * the fastpath in the previous iteration; in that case, - * c->tid has not been bumped yet. - * Since ___slab_alloc() may reenable interrupts while - * allocating memory, we should bump c->tid now. - */ - c->tid = next_tid(c->tid); - - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - - /* - * Invoking slow path likely have side-effect - * of re-populating per CPU c->freelist - */ - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c, s->object_size); + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, + s->object_size); if (unlikely(!p[i])) goto error; - c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - - local_lock_irqsave(&s->cpu_slab->lock, irqflags); - - continue; /* goto for-loop */ } - c->freelist = get_freepointer(s, object); - p[i] = object; - maybe_wipe_obj_freeptr(s, p[i]); - stat(s, ALLOC_FASTPATH); + } else { + i = refill_objects(s, p, flags, size, size); + if (i < size) + goto error; + stat_add(s, ALLOC_SLOWPATH, i); } - c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - slub_put_cpu_ptr(s->cpu_slab); return i; error: - slub_put_cpu_ptr(s->cpu_slab); __kmem_cache_free_bulk(s, i, p); return 0; } -/* Note that interrupts must be enabled when calling this function. */ +/* + * Note that interrupts must be enabled when calling this function and gfp + * flags must allow spinning. + */ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { @@ -7892,8 +7166,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, size--; } - if (s->cpu_sheaves) - i = alloc_from_pcs_bulk(s, size, p); + i = alloc_from_pcs_bulk(s, flags, size, p); if (i < size) { /* @@ -8081,29 +7354,25 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +#ifdef CONFIG_SLUB_STATS +static inline int alloc_kmem_cache_stats(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * - sizeof(struct kmem_cache_cpu)); + sizeof(struct kmem_cache_stats)); - /* - * Must align to double word boundary for the double cmpxchg - * instructions to work; see __pcpu_double_call_return_bool(). 
- */ - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), - 2 * sizeof(void *)); + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); - if (!s->cpu_slab) + if (!s->cpu_stats) return 0; - init_kmem_cache_cpus(s); - return 1; } +#endif static int init_percpu_sheaves(struct kmem_cache *s) { + static struct slab_sheaf bootstrap_sheaf = {}; int cpu; for_each_possible_cpu(cpu) { @@ -8113,7 +7382,28 @@ static int init_percpu_sheaves(struct kmem_cache *s) local_trylock_init(&pcs->lock); - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + /* + * Bootstrap sheaf has zero size so fast-path allocation fails. + * It has also size == s->sheaf_capacity, so fast-path free + * fails. In the slow paths we recognize the situation by + * checking s->sheaf_capacity. This allows fast paths to assume + * s->cpu_sheaves and pcs->main always exists and are valid. + * It's also safe to share the single static bootstrap_sheaf + * with zero-sized objects array as it's never modified. + * + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we + * recognize it and not attempt to free it when destroying the + * cache. + * + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, + * caches with debug enabled, and all caches with SLUB_TINY. + * For kmalloc caches it's used temporarily during the initial + * bootstrap. + */ + if (!s->sheaf_capacity) + pcs->main = &bootstrap_sheaf; + else + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); if (!pcs->main) return -ENOMEM; @@ -8164,7 +7454,7 @@ static void early_kmem_cache_node_alloc(int node) * No locks need to be taken here as it has just been * initialized and there is no concurrent access. */ - __add_partial(n, slab, DEACTIVATE_TO_HEAD); + __add_partial(n, slab, ADD_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -8188,13 +7478,10 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); - if (s->cpu_sheaves) - pcs_destroy(s); -#ifdef CONFIG_PREEMPT_RT - if (s->cpu_slab) - lockdep_unregister_key(&s->lock_key); + pcs_destroy(s); +#ifdef CONFIG_SLUB_STATS + free_percpu(s->cpu_stats); #endif - free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } @@ -8211,7 +7498,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) continue; } - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); if (!barn) @@ -8232,37 +7519,51 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 1; } -static void set_cpu_partial(struct kmem_cache *s) +static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, + struct kmem_cache_args *args) + { -#ifdef CONFIG_SLUB_CPU_PARTIAL - unsigned int nr_objects; + unsigned int capacity; + size_t size; + + + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS) + return 0; /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * For backwards compatibility reasons, this is determined as number - * of objects, even though we now limit maximum number of pages, see - * slub_set_cpu_partial() + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT). 
+ * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not + * have sheaves to avoid recursion when sheaf allocation triggers + * kmemleak tracking. */ - if (!kmem_cache_has_cpu_partial(s)) - nr_objects = 0; - else if (s->size >= PAGE_SIZE) - nr_objects = 6; - else if (s->size >= 1024) - nr_objects = 24; - else if (s->size >= 256) - nr_objects = 52; - else - nr_objects = 120; + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return 0; - slub_set_cpu_partial(s, nr_objects); -#endif + /* + * For now we use roughly similar formula (divided by two as there are + * two percpu sheaves) as what was used for percpu partial slabs, which + * should result in similar lock contention (barn or list_lock) + */ + if (s->size >= PAGE_SIZE) + capacity = 4; + else if (s->size >= 1024) + capacity = 12; + else if (s->size >= 256) + capacity = 26; + else + capacity = 60; + + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ + size = struct_size_t(struct slab_sheaf, objects, capacity); + size = kmalloc_size_roundup(size); + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); + + /* + * Respect an explicit request for capacity that's typically motivated by + * expected maximum size of kmem_cache_prefill_sheaf() to not end up + * using low-performance oversize sheaves + */ + return max(capacity, args->sheaf_capacity); } /* @@ -8409,6 +7710,13 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; + /* + * For KMALLOC_NORMAL caches we enable sheaves later by + * bootstrap_kmalloc_sheaves() to avoid recursion + */ + if (!is_kmalloc_normal(s)) + s->sheaf_capacity = calculate_sheaf_capacity(s, args); + /* * Determine the number of objects per slab */ @@ -8493,7 +7801,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* we might have rcu sheaves in flight */ - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) rcu_barrier(); /* Attempt to free all objects */ @@ -8805,7 +8113,7 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8880,12 +8188,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) memcpy(s, static_cache, kmem_cache->object_size); - /* - * This runs very early, and only the boot processor is supposed to be - * up. Even if it weren't true, IRQs are not up so we couldn't fire - * IPIs around. - */ - __flush_cpu_slab(s, smp_processor_id()); for_each_kmem_cache_node(s, node, n) { struct slab *p; @@ -8901,6 +8203,74 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) return s; } +/* + * Finish the sheaves initialization done normally by init_percpu_sheaves() and + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it + * since sheaves and barns are allocated by kmalloc. 
+ */
+static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
+{
+	struct kmem_cache_args empty_args = {};
+	unsigned int capacity;
+	bool failed = false;
+	int node, cpu;
+
+	capacity = calculate_sheaf_capacity(s, &empty_args);
+
+	/* capacity can be 0 due to debugging or SLUB_TINY */
+	if (!capacity)
+		return;
+
+	for_each_node_mask(node, slab_nodes) {
+		struct node_barn *barn;
+
+		barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
+
+		if (!barn) {
+			failed = true;
+			goto out;
+		}
+
+		barn_init(barn);
+		get_node(s, node)->barn = barn;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct slub_percpu_sheaves *pcs;
+
+		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+		pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
+
+		if (!pcs->main) {
+			failed = true;
+			break;
+		}
+	}
+
+out:
+	/*
+	 * It's still early in boot so treat this the same as a failure to
+	 * create the kmalloc cache in the first place
+	 */
+	if (failed)
+		panic("Out of memory when creating kmem_cache %s\n", s->name);
+
+	s->sheaf_capacity = capacity;
+}
+
+static void __init bootstrap_kmalloc_sheaves(void)
+{
+	enum kmalloc_cache_type type;
+
+	for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
+		for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
+			if (kmalloc_caches[type][idx])
+				bootstrap_cache_sheaves(kmalloc_caches[type][idx]);
+		}
+	}
+}
+
 void __init kmem_cache_init(void)
 {
 	static __initdata struct kmem_cache boot_kmem_cache,
@@ -8944,6 +8314,8 @@ void __init kmem_cache_init(void)
 	setup_kmalloc_cache_index_table();
 	create_kmalloc_caches();
 
+	bootstrap_kmalloc_sheaves();
+
 	/* Setup random freelists for each cache */
 	init_freelist_randomization();
 
@@ -9011,17 +8383,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
 	s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
 	s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
 
-	set_cpu_partial(s);
-
-	if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
-	    && !(s->flags & SLAB_DEBUG_FLAGS)) {
-		s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
-		if (!s->cpu_sheaves) {
-			err = -ENOMEM;
-			goto out;
-		}
-		// TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
- s->sheaf_capacity = args->sheaf_capacity; + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; } #ifdef CONFIG_NUMA @@ -9037,14 +8402,14 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!init_kmem_cache_nodes(s)) goto out; - if (!alloc_kmem_cache_cpus(s)) +#ifdef CONFIG_SLUB_STATS + if (!alloc_kmem_cache_stats(s)) goto out; +#endif - if (s->cpu_sheaves) { - err = init_percpu_sheaves(s); - if (err) - goto out; - } + err = init_percpu_sheaves(s); + if (err) + goto out; err = 0; @@ -9359,47 +8724,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, if (!nodes) return -ENOMEM; - if (flags & SO_CPU) { - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, - cpu); - int node; - struct slab *slab; - - slab = READ_ONCE(c->slab); - if (!slab) - continue; - - node = slab_nid(slab); - if (flags & SO_TOTAL) - x = slab->objects; - else if (flags & SO_OBJECTS) - x = slab->inuse; - else - x = 1; - - total += x; - nodes[node] += x; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - slab = slub_percpu_partial_read_once(c); - if (slab) { - node = slab_nid(slab); - if (flags & SO_TOTAL) - WARN_ON_ONCE(1); - else if (flags & SO_OBJECTS) - WARN_ON_ONCE(1); - else - x = data_race(slab->slabs); - total += x; - nodes[node] += x; - } -#endif - } - } - /* * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" * already held which will conflict with an existing lock order: @@ -9531,12 +8855,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - unsigned int nr_partial = 0; -#ifdef CONFIG_SLUB_CPU_PARTIAL - nr_partial = s->cpu_partial; -#endif - - return sysfs_emit(buf, "%u\n", nr_partial); + return sysfs_emit(buf, "0\n"); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -9548,11 +8867,9 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = kstrtouint(buf, 10, &objects); if (err) return err; - if (objects && !kmem_cache_has_cpu_partial(s)) + if (objects) return -EINVAL; - slub_set_cpu_partial(s, objects); - flush_all(s); return length; } SLAB_ATTR(cpu_partial); @@ -9591,42 +8908,7 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { - int objects = 0; - int slabs = 0; - int cpu __maybe_unused; - int len = 0; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (slab) - slabs += data_race(slab->slabs); - } -#endif - - /* Approximate half-full slabs, see slub_set_cpu_partial() */ - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (slab) { - slabs = data_race(slab->slabs); - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, objects, slabs); - } - } -#endif - len += sysfs_emit_at(buf, len, "\n"); - - return len; + return sysfs_emit(buf, "0(0)\n"); } SLAB_ATTR_RO(slabs_cpu_partial); @@ -9812,7 +9094,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -9838,7 +9120,7 @@ static 
void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ @@ -9856,36 +9138,19 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ -STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); -STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); -STAT_ATTR(FREE_FROZEN, free_frozen); STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); -STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); -STAT_ATTR(ALLOC_REFILL, alloc_refill); STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); -STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); -STAT_ATTR(DEACTIVATE_FULL, deactivate_full); -STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); -STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); -STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); -STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); -STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); -STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); -STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); -STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); -STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); -STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); STAT_ATTR(SHEAF_FLUSH, sheaf_flush); STAT_ATTR(SHEAF_REFILL, sheaf_refill); STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); @@ -9961,36 +9226,19 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS - &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, - &free_cpu_sheaf_attr.attr, &free_rcu_sheaf_attr.attr, &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, - &free_frozen_attr.attr, &free_add_partial_attr.attr, &free_remove_partial_attr.attr, - &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, - &alloc_refill_attr.attr, &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, - &cpuslab_flush_attr.attr, - &deactivate_full_attr.attr, - &deactivate_empty_attr.attr, - &deactivate_to_head_attr.attr, - &deactivate_to_tail_attr.attr, - &deactivate_remote_frees_attr.attr, - &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, - &cmpxchg_double_cpu_fail_attr.attr, - &cpu_partial_alloc_attr.attr, - &cpu_partial_free_attr.attr, - &cpu_partial_node_attr.attr, - &cpu_partial_drain_attr.attr, &sheaf_flush_attr.attr, &sheaf_refill_attr.attr, &sheaf_alloc_attr.attr,