mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:24:45 +01:00
f2fs: fix lock priority inversion issue
If userspace thread has held f2fs rw semaphore, due to its low priority, it could be runnable or preempted state for long time, during the time, it will block high priority thread which is trying to grab the same rw semaphore, e.g. cp_rwsem, io_rwsem... To fix such issue, let's detect thread's priority when it tries to grab f2fs_rwsem lock, if the priority is lower than a priority threshold, let's uplift the priority before it enters into critical region of lock, and restore the priority after it leaves from critical region. Meanwhile, introducing two new sysfs nodes: - /sys/fs/f2fs/<disk>/adjust_lock_priority, it is used to control whether the functionality is enable or not. ========== ================== Flag_Value Flag_Description ========== ================== 0x00000000 Disabled (default) 0x00000001 cp_rwsem 0x00000002 node_change 0x00000004 node_write 0x00000008 gc_lock 0x00000010 cp_global 0x00000020 io_rwsem ========== ================== - /sys/fs/f2fs/<disk>/lock_duration_priority, it is used to control priority threshold. Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
This commit is contained in:
parent
d860974a7e
commit
07de55cbf5
5 changed files with 120 additions and 2 deletions
|
|
@ -963,3 +963,27 @@ Description: This sysfs entry can be used to change type of injected timeout:
|
|||
0x00000003 Simulate Non-IO type sleep time
|
||||
0x00000004 Simulate runnable time
|
||||
========== ===============================
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/adjust_lock_priority
|
||||
Date: January 2026
|
||||
Contact: "Chao Yu" <chao@kernel.org>
|
||||
Description: This sysfs entry can be used to enable/disable priority adjustment for a task
|
||||
which is in a critical region covered by a lock.
|
||||
========== ==================
|
||||
Flag_Value Flag_Description
|
||||
========== ==================
|
||||
0x00000000 Disabled (default)
|
||||
0x00000001 cp_rwsem
|
||||
0x00000002 node_change
|
||||
0x00000004 node_write
|
||||
0x00000008 gc_lock
|
||||
0x00000010 cp_global
|
||||
0x00000020 io_rwsem
|
||||
========== ==================
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/lock_duration_priority
|
||||
Date: January 2026
|
||||
Contact: "Chao Yu" <chao@kernel.org>
|
||||
Description: f2fs can tune priority of thread which has entered into critical region covered by
|
||||
f2fs rwsemaphore lock. This sysfs entry can be used to control the priority value; the
|
||||
range is [100,139], by default the value is 120.
|
||||
|
|
|
|||
|
|
@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
|
|||
runnable_time, io_sleep_time, other_time);
|
||||
}
|
||||
|
||||
/*
 * need_uplift_priority - decide whether taking @sem warrants a priority uplift
 * @sem: lock being acquired; sem->name selects the policy
 * @is_write: true for a write acquisition, false for a read acquisition
 *
 * Returns false when the bit for this lock is clear in
 * sbi->adjust_lock_priority.  Otherwise cp_rwsem, node_change and node_write
 * request an uplift for readers only, while gc_lock, cp_global and io_rwsem
 * request it for both readers and writers.  An unrecognised lock name trips
 * f2fs_bug_on() and, if that returns, yields false.
 *
 * NOTE(review): the mask test uses BIT(sem->name - 1), so it presumably
 * relies on sem->name never being 0 here -- confirm against
 * enum f2fs_lock_name.
 */
static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
{
	bool uplift = false;

	if (!(sem->sbi->adjust_lock_priority & BIT(sem->name - 1)))
		return uplift;

	switch (sem->name) {
	case LOCK_NAME_CP_RWSEM:
	case LOCK_NAME_NODE_CHANGE:
	case LOCK_NAME_NODE_WRITE:
		/*
		 * The writer is checkpoint, which already has high priority,
		 * so only readers are uplifted.
		 */
		uplift = !is_write;
		break;
	case LOCK_NAME_GC_LOCK:
	case LOCK_NAME_CP_GLOBAL:
	case LOCK_NAME_IO_RWSEM:
		uplift = true;
		break;
	default:
		f2fs_bug_on(sem->sbi, 1);
		break;
	}

	return uplift;
}
|
||||
|
||||
static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
|
||||
bool is_write)
|
||||
{
|
||||
lc->need_restore = false;
|
||||
if (!sem->sbi->adjust_lock_priority)
|
||||
return;
|
||||
if (rt_task(current))
|
||||
return;
|
||||
if (!need_uplift_priority(sem, is_write))
|
||||
return;
|
||||
lc->orig_nice = task_nice(current);
|
||||
lc->new_nice = PRIO_TO_NICE(sem->sbi->lock_duration_priority);
|
||||
if (lc->orig_nice <= lc->new_nice)
|
||||
return;
|
||||
set_user_nice(current, lc->new_nice);
|
||||
lc->need_restore = true;
|
||||
}
|
||||
|
||||
static void restore_priority(struct f2fs_lock_context *lc)
|
||||
{
|
||||
if (!lc->need_restore)
|
||||
return;
|
||||
/* someone has updated the priority */
|
||||
if (task_nice(current) != lc->new_nice)
|
||||
return;
|
||||
set_user_nice(current, lc->orig_nice);
|
||||
}
|
||||
|
||||
/*
 * f2fs_down_read_trace - take @sem for reading with priority uplift and tracing
 * @sem: f2fs rwsem to acquire
 * @lc: per-acquisition lock context, later passed to f2fs_up_read_trace()
 *
 * uplift_priority() runs before the acquisition, so any priority change is
 * already in effect when f2fs_down_read() is called.  After the lock is held,
 * trace_lock_elapsed_time_start() (defined outside this view) is called with
 * @sem and @lc.
 */
void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
uplift_priority(sem, lc, false);
|
||||
f2fs_down_read(sem);
|
||||
trace_lock_elapsed_time_start(sem, lc);
|
||||
}
|
||||
|
||||
int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
if (!f2fs_down_read_trylock(sem))
|
||||
uplift_priority(sem, lc, false);
|
||||
if (!f2fs_down_read_trylock(sem)) {
|
||||
restore_priority(lc);
|
||||
return 0;
|
||||
}
|
||||
trace_lock_elapsed_time_start(sem, lc);
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex
|
|||
/*
 * f2fs_up_read_trace - release a read lock taken through the *_trace helpers
 * @sem: f2fs rwsem to release
 * @lc: lock context that was passed to the matching acquire call
 *
 * The lock is dropped first, then restore_priority() undoes any nice change
 * recorded in @lc, and finally trace_lock_elapsed_time_end() is called with
 * is_write == false.
 */
void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
f2fs_up_read(sem);
|
||||
restore_priority(lc);
|
||||
trace_lock_elapsed_time_end(sem, lc, false);
|
||||
}
|
||||
|
||||
/*
 * f2fs_down_write_trace - take @sem for writing with priority uplift and tracing
 * @sem: f2fs rwsem to acquire
 * @lc: per-acquisition lock context, later passed to f2fs_up_write_trace()
 *
 * uplift_priority() runs before the acquisition, so any priority change is
 * already in effect when f2fs_down_write() is called.  After the lock is
 * held, trace_lock_elapsed_time_start() (defined outside this view) is called
 * with @sem and @lc.
 */
void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
uplift_priority(sem, lc, true);
|
||||
f2fs_down_write(sem);
|
||||
trace_lock_elapsed_time_start(sem, lc);
|
||||
}
|
||||
|
||||
int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
if (!f2fs_down_write_trylock(sem))
|
||||
uplift_priority(sem, lc, true);
|
||||
if (!f2fs_down_write_trylock(sem)) {
|
||||
restore_priority(lc);
|
||||
return 0;
|
||||
}
|
||||
trace_lock_elapsed_time_start(sem, lc);
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte
|
|||
/*
 * f2fs_up_write_trace - release a write lock taken through the *_trace helpers
 * @sem: f2fs rwsem to release
 * @lc: lock context that was passed to the matching acquire call
 *
 * The lock is dropped first, then restore_priority() undoes any nice change
 * recorded in @lc, and finally trace_lock_elapsed_time_end() is called with
 * is_write == true.
 */
void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
|
||||
{
|
||||
f2fs_up_write(sem);
|
||||
restore_priority(lc);
|
||||
trace_lock_elapsed_time_end(sem, lc, true);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -185,6 +185,7 @@ enum f2fs_lock_name {
|
|||
LOCK_NAME_GC_LOCK,
|
||||
LOCK_NAME_CP_GLOBAL,
|
||||
LOCK_NAME_IO_RWSEM,
|
||||
LOCK_NAME_MAX,
|
||||
};
|
||||
|
||||
enum f2fs_timeout_type {
|
||||
|
|
@ -1447,7 +1448,10 @@ struct f2fs_time_stat {
|
|||
|
||||
/*
 * Per-acquisition state handed to the f2fs_*_trace() lock helpers and carried
 * from the acquire call to the matching release call.
 */
struct f2fs_lock_context {
|
||||
/* NOTE(review): presumably time statistics for lock tracing; not used in the visible code */
struct f2fs_time_stat ts;
|
||||
/* nice value of current sampled by uplift_priority() before any change */
int orig_nice;
|
||||
/* nice value derived from sbi->lock_duration_priority by uplift_priority() */
int new_nice;
|
||||
/* NOTE(review): presumably marks whether tracing is active for this acquisition; confirm */
bool lock_trace;
|
||||
/* true only when uplift_priority() actually changed the nice value */
bool need_restore;
|
||||
};
|
||||
|
||||
struct f2fs_gc_control {
|
||||
|
|
@ -1588,6 +1592,8 @@ enum node_type {
|
|||
/* a threshold of maximum elapsed time in critical region to print tracepoint */
|
||||
#define MAX_LOCK_ELAPSED_TIME 500
|
||||
|
||||
#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO)
|
||||
|
||||
static inline int f2fs_test_bit(unsigned int nr, char *addr);
|
||||
static inline void f2fs_set_bit(unsigned int nr, char *addr);
|
||||
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
|
||||
|
|
@ -1998,6 +2004,12 @@ struct f2fs_sb_info {
|
|||
/* max elapsed time threshold in critical region that lock covered */
|
||||
unsigned long long max_lock_elapsed_time;
|
||||
|
||||
/* enable/disable to adjust task priority in critical region covered by lock */
|
||||
unsigned int adjust_lock_priority;
|
||||
|
||||
/* adjust priority for task which is in critical region covered by lock */
|
||||
unsigned int lock_duration_priority;
|
||||
|
||||
#ifdef CONFIG_F2FS_FS_COMPRESSION
|
||||
struct kmem_cache *page_array_slab; /* page array entry */
|
||||
unsigned int page_array_slab_size; /* default page array slab size */
|
||||
|
|
|
|||
|
|
@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
|
|||
spin_lock_init(&sbi->gc_remaining_trials_lock);
|
||||
atomic64_set(&sbi->current_atomic_write, 0);
|
||||
sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
|
||||
sbi->adjust_lock_priority = 0;
|
||||
sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
|
||||
|
||||
sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
|
||||
4096 : sbi->blocksize;
|
||||
|
|
|
|||
|
|
@ -955,6 +955,20 @@ out:
|
|||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "adjust_lock_priority")) {
|
||||
if (t >= BIT(LOCK_NAME_MAX - 1))
|
||||
return -EINVAL;
|
||||
sbi->adjust_lock_priority = t;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (!strcmp(a->attr.name, "lock_duration_priority")) {
|
||||
if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
|
||||
return -EINVAL;
|
||||
sbi->lock_duration_priority = t;
|
||||
return count;
|
||||
}
|
||||
|
||||
__sbi_store_value(a, sbi, ptr + a->offset, t);
|
||||
|
||||
return count;
|
||||
|
|
@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out);
|
|||
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
|
||||
F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
|
||||
F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
|
||||
F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
|
||||
F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
|
||||
|
||||
/* STAT_INFO ATTR */
|
||||
#ifdef CONFIG_F2FS_STAT_FS
|
||||
|
|
@ -1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = {
|
|||
ATTR_LIST(allocate_section_hint),
|
||||
ATTR_LIST(allocate_section_policy),
|
||||
ATTR_LIST(max_lock_elapsed_time),
|
||||
ATTR_LIST(lock_duration_priority),
|
||||
ATTR_LIST(adjust_lock_priority),
|
||||
NULL,
|
||||
};
|
||||
ATTRIBUTE_GROUPS(f2fs);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue