/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Resilient Queued Spin Lock
 *
 * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
 *
 * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
 */
#ifndef __ASM_GENERIC_RQSPINLOCK_H
#define __ASM_GENERIC_RQSPINLOCK_H

#include <linux/types.h>
#include <vdso/time64.h>
#include <linux/percpu.h>
#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
#endif
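
/*
 * Lock word for the resilient spin lock: 'val' is the 32-bit value used by
 * atomic operations, and 'locked' aliases the same word for plain stores
 * (e.g. the release store in res_spin_unlock() on the TAS fallback path).
 */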
struct rqspinlock {
        union {
                atomic_t val;
                u32 locked;
        };
};

/* Even though this is the same as struct rqspinlock, we need to emit a
 * distinct type in BTF for BPF programs.
 */
struct bpf_res_spin_lock {
        u32 val;
} __aligned(__alignof__(struct rqspinlock));

struct qspinlock;
#ifdef CONFIG_QUEUED_SPINLOCKS
typedef struct qspinlock rqspinlock_t;
#else
typedef struct rqspinlock rqspinlock_t;
#endif

extern int resilient_tas_spin_lock(rqspinlock_t *lock);
#ifdef CONFIG_QUEUED_SPINLOCKS
extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
#endif

#ifndef resilient_virt_spin_lock_enabled
static __always_inline bool resilient_virt_spin_lock_enabled(void)
{
        return false;
}
#endif

#ifndef resilient_virt_spin_lock
static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
{
        return 0;
}
#endif

/*
 * Default timeout for waiting loops is 0.25 seconds
 */
#define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

/*
 * Choose 31 as it makes rqspinlock_held cacheline-aligned.
 */
#define RES_NR_HELD 31
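
/*
 * Per-CPU table of locks held (or being acquired) on this CPU, consulted by
 * the deadlock detector to flag AA and ABBA cycles. 'cnt' may transiently
 * exceed RES_NR_HELD; such overflow entries are simply not recorded (see
 * grab_held_lock_entry()).
 */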
struct rqspinlock_held {
        int cnt;
        void *locks[RES_NR_HELD];
};

DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
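
/*
 * Record @lock in this CPU's held-locks table before the acquisition attempt,
 * so that reentrant (e.g. NMI) or remote attempts can detect AA/ABBA deadlocks
 * against it. Paired with release_held_lock_entry() when acquisition fails,
 * and with the entry clearing in res_spin_unlock() when it succeeds.
 */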
static __always_inline void grab_held_lock_entry(void *lock)
{
        int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);

        if (unlikely(cnt > RES_NR_HELD)) {
                /* Still keep the inc so we decrement later. */
                return;
        }

        /*
         * Implied compiler barrier in per-CPU operations; otherwise we can have
         * the compiler reorder inc with write to table, allowing interrupts to
         * overwrite and erase our write to the table (as on interrupt exit it
         * will be reset to NULL).
         *
         * It is fine for cnt inc to be reordered wrt remote readers though,
         * they won't observe our entry until the cnt update is visible, that's
         * all.
         */
        this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
}

/*
 * We simply don't support out-of-order unlocks, and keep the logic simple here.
 * The verifier prevents BPF programs from unlocking out-of-order, and the same
 * holds for in-kernel users.
 *
 * It is possible to run into misdetection scenarios of AA deadlocks on the same
 * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
 * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct
 * logic to preserve right entries in the table would be to walk the array of
 * held locks and swap and clear out-of-order entries, but that's too
 * complicated and we don't have a compelling use case for out of order unlocking.
 */
static __always_inline void release_held_lock_entry(void)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

        if (unlikely(rqh->cnt > RES_NR_HELD))
                goto dec;
        WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
dec:
        /*
         * Reordering of clearing above with inc and its write in
         * grab_held_lock_entry that came before us (in same acquisition
         * attempt) is ok, we either see a valid entry or NULL when it's
         * visible.
         *
         * But this helper is invoked when we unwind upon failing to acquire the
         * lock. Unlike the unlock path which constitutes a release store after
         * we clear the entry, we need to emit a write barrier here. Otherwise,
         * we may have a situation as follows:
         *
         * <error> for lock B
         * release_held_lock_entry
         *
         * grab_held_lock_entry
         * try_cmpxchg_acquire for lock A
         *
         * Lack of any ordering means reordering may occur such that dec, inc
         * are done before entry is overwritten. This permits a remote lock
         * holder of lock B (which this CPU failed to acquire) to now observe it
         * as being attempted on this CPU, and may lead to misdetection (if this
         * CPU holds a lock it is attempting to acquire, leading to false ABBA
         * diagnosis).
         *
         * The case of unlock is treated differently due to NMI reentrancy, see
         * comments in res_spin_unlock.
         *
         * In theory we don't have a problem if the dec and WRITE_ONCE above get
         * reordered with each other, we either notice an empty NULL entry on
         * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
         * cannot be observed (if dec precedes WRITE_ONCE).
         *
         * Emit the write barrier _before_ the dec, this permits dec-inc
         * reordering but that is harmless as we'd have new entry set to NULL
         * already, i.e. they cannot precede the NULL store above.
         */
        smp_wmb();
        this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS

/**
 * res_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * Return:
 * * 0 - Lock was acquired successfully.
 * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
 * * -ETIMEDOUT - Lock acquisition failed because of timeout.
 */
static __always_inline int res_spin_lock(rqspinlock_t *lock)
{
        int val = 0;

        /*
         * Grab the deadlock detection entry before doing the cmpxchg, so that
         * reentrancy due to NMIs between the succeeding cmpxchg and creation of
         * held lock entry can correctly detect an acquisition attempt in the
         * interrupted context.
         *
         * cmpxchg lock A
         * <NMI>
         *   res_spin_lock(A) --> missed AA, leads to timeout
         * </NMI>
         * grab_held_lock_entry(A)
         */
        grab_held_lock_entry(lock);

        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return 0;
        return resilient_queued_spin_lock_slowpath(lock, val);
}

#else
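
/*
 * Test-and-set fallback used when queued spinlocks are not configured: same
 * held-locks bookkeeping, with the acquisition loop provided by
 * resilient_tas_spin_lock().
 */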
#define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); })

#endif /* CONFIG_QUEUED_SPINLOCKS */
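
/*
 * res_spin_unlock - release the lock and drop its entry from this CPU's
 * held-locks table. See the comment below on why the release store must
 * happen before the table entry is cleared.
 */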
static __always_inline void res_spin_unlock(rqspinlock_t *lock)
{
        struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);

        /*
         * Release barrier, ensures correct ordering. Perform release store
         * instead of queued_spin_unlock, since we use this function for the TAS
         * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
         * the full 4-byte lockword.
         *
         * Perform the smp_store_release before clearing the lock entry so that
         * NMIs landing in the unlock path can correctly detect AA issues. The
         * opposite order shown below may lead to missed AA checks:
         *
         * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
         * <NMI>
         *   res_spin_lock(A) --> missed AA, leads to timeout
         * </NMI>
         * smp_store_release(A->locked, 0)
         */
        smp_store_release(&lock->locked, 0);
        if (likely(rqh->cnt <= RES_NR_HELD))
                WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
        this_cpu_dec(rqspinlock_held_locks.cnt);
}

#ifdef CONFIG_QUEUED_SPINLOCKS
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
#else
#define raw_res_spin_lock_init(lock) ({ *(lock) = (rqspinlock_t){0}; })
#endif
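
/*
 * Preemption- and IRQ-disabling wrappers around res_spin_lock()/unlock(). On
 * failure, the preemption or IRQ state taken before the attempt is restored
 * again, so callers retain it only while the lock is actually held.
 */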
#define raw_res_spin_lock(lock)                    \
        ({                                         \
                int __ret;                         \
                preempt_disable();                 \
                __ret = res_spin_lock(lock);       \
                if (__ret)                         \
                        preempt_enable();          \
                __ret;                             \
        })

#define raw_res_spin_unlock(lock) ({ res_spin_unlock(lock); preempt_enable(); })

#define raw_res_spin_lock_irqsave(lock, flags)     \
        ({                                         \
                int __ret;                         \
                local_irq_save(flags);             \
                __ret = raw_res_spin_lock(lock);   \
                if (__ret)                         \
                        local_irq_restore(flags);  \
                __ret;                             \
        })

#define raw_res_spin_unlock_irqrestore(lock, flags) ({ raw_res_spin_unlock(lock); local_irq_restore(flags); })
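
/*
 * Illustrative usage sketch (not part of the original header; 'my_lock',
 * 'flags' and 'ret' are hypothetical caller-side names). raw_res_spin_lock()
 * and raw_res_spin_lock_irqsave() return 0 on success, or -EDEADLK /
 * -ETIMEDOUT on failure, in which case the lock is not held and the
 * IRQ/preemption state taken for the attempt has already been restored:
 *
 *	static rqspinlock_t my_lock;
 *	unsigned long flags;
 *	int ret;
 *
 *	raw_res_spin_lock_init(&my_lock);
 *
 *	ret = raw_res_spin_lock_irqsave(&my_lock, flags);
 *	if (ret)
 *		return ret;
 *	... critical section ...
 *	raw_res_spin_unlock_irqrestore(&my_lock, flags);
 */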

#endif /* __ASM_GENERIC_RQSPINLOCK_H */