f2fs: fix lock priority inversion issue

If a low-priority userspace thread holds an f2fs rw semaphore, it may stay
in the runnable or preempted state for a long time. During that time it
blocks any high-priority thread which is trying to grab the same rw
semaphore, e.g. cp_rwsem, io_rwsem...

To fix this issue, detect the thread's priority when it tries to grab an
f2fs_rwsem lock. If the priority is lower than a priority threshold, uplift
the priority before the thread enters the critical region of the lock, and
restore the priority after it leaves the critical region.

Meanwhile, introducing two new sysfs nodes:
- /sys/fs/f2fs/<disk>/adjust_lock_priority, it is used to control whether
the functionality is enabled or not.
==========     ==================
Flag_Value     Flag_Description
==========     ==================
0x00000000     Disabled (default)
0x00000001     cp_rwsem
0x00000002     node_change
0x00000004     node_write
0x00000008     gc_lock
0x00000010     cp_global
0x00000020     io_rwsem
==========     ==================
- /sys/fs/f2fs/<disk>/lock_duration_priority, it is used to control
priority threshold.

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
This commit is contained in:
Chao Yu 2026-01-30 21:28:08 +08:00 committed by Jaegeuk Kim
parent d860974a7e
commit 07de55cbf5
5 changed files with 120 additions and 2 deletions

View file

@ -963,3 +963,27 @@ Description: This sysfs entry can be used to change type of injected timeout:
0x00000003 Simulate Non-IO type sleep time
0x00000004 Simulate runnable time
========== ===============================
What: /sys/fs/f2fs/<disk>/adjust_lock_priority
Date: January 2026
Contact: "Chao Yu" <chao@kernel.org>
Description: This sysfs entry can be used to enable/disable priority adjustment for a task
which is in a critical region covered by a lock.
========== ==================
Flag_Value Flag_Description
========== ==================
0x00000000 Disabled (default)
0x00000001 cp_rwsem
0x00000002 node_change
0x00000004 node_write
0x00000008 gc_lock
0x00000010 cp_global
0x00000020 io_rwsem
========== ==================
What: /sys/fs/f2fs/<disk>/lock_duration_priority
Date: January 2026
Contact: "Chao Yu" <chao@kernel.org>
Description: f2fs can tune the priority of a thread which has entered a critical region covered
by an f2fs rw_semaphore lock. This sysfs entry can be used to control the priority value;
the range is [100,139], and the default value is 120.

View file

@ -90,16 +90,72 @@ static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
runnable_time, io_sleep_time, other_time);
}
/*
 * Tell whether a task acquiring @sem is eligible for a priority uplift.
 *
 * @sem:      lock being acquired; sem->name selects the bit tested in
 *            sbi->adjust_lock_priority and the per-lock policy below.
 * @is_write: true for a write acquisition, false for a read acquisition.
 *
 * Returns false when the bit for this lock is not set in
 * sbi->adjust_lock_priority. Otherwise returns the per-lock decision.
 *
 * NOTE(review): BIT(sem->name - 1) assumes sem->name >= 1; confirm that every
 * semaphore reaching this helper has been given a non-zero lock name.
 */
static bool need_uplift_priority(struct f2fs_rwsem *sem, bool is_write)
{
	struct f2fs_sb_info *sbi = sem->sbi;
	unsigned long lock_bit = BIT(sem->name - 1);
	bool uplift = false;

	if (!(sbi->adjust_lock_priority & lock_bit))
		return false;

	switch (sem->name) {
	case LOCK_NAME_GC_LOCK:
	case LOCK_NAME_CP_GLOBAL:
	case LOCK_NAME_IO_RWSEM:
		/* both readers and writers are uplifted for these locks */
		uplift = true;
		break;
	case LOCK_NAME_CP_RWSEM:
	case LOCK_NAME_NODE_CHANGE:
	case LOCK_NAME_NODE_WRITE:
		/*
		 * The writer of these locks is checkpoint, which already has
		 * high priority, so only readers are uplifted.
		 */
		uplift = !is_write;
		break;
	default:
		/* unknown lock name: report it and do not uplift */
		f2fs_bug_on(sbi, 1);
		break;
	}

	return uplift;
}
/*
 * Temporarily raise the priority of the current task before it takes @sem.
 *
 * @sem:      lock about to be acquired.
 * @lc:       per-acquisition context; receives the original and the new nice
 *            value, and need_restore tells restore_priority() whether there
 *            is anything to undo.
 * @is_write: true for a write acquisition, false for a read acquisition.
 *
 * lc->need_restore is always initialized here, so callers must invoke this
 * helper before any path that ends in restore_priority().
 */
static void uplift_priority(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc,
						bool is_write)
{
	struct f2fs_sb_info *sbi = sem->sbi;

	lc->need_restore = false;

	/*
	 * Nothing to do when the feature is off, the task is a real-time
	 * task, or this lock/direction is not selected for uplift.
	 */
	if (!sbi->adjust_lock_priority || rt_task(current) ||
			!need_uplift_priority(sem, is_write))
		return;

	lc->orig_nice = task_nice(current);
	lc->new_nice = PRIO_TO_NICE(sbi->lock_duration_priority);

	/* only ever move towards a lower nice value, never demote */
	if (lc->new_nice < lc->orig_nice) {
		set_user_nice(current, lc->new_nice);
		lc->need_restore = true;
	}
}
/*
 * Undo the nice change made by uplift_priority(), if there was one.
 *
 * @lc: context filled in by uplift_priority() for this acquisition.
 *
 * The original nice value is put back only when the task still has the nice
 * value that uplift_priority() installed; if it differs, someone else has
 * updated the priority in the meantime and that change is left alone.
 */
static void restore_priority(struct f2fs_lock_context *lc)
{
	if (lc->need_restore && task_nice(current) == lc->new_nice)
		set_user_nice(current, lc->orig_nice);
}
/*
 * Take @sem for reading, with optional priority uplift and lock tracing.
 *
 * uplift_priority() runs before f2fs_down_read(), so any nice change is
 * already applied while f2fs_down_read() executes. What was changed is
 * recorded in @lc, which must be passed to the matching f2fs_up_read_trace().
 */
void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
uplift_priority(sem, lc, false);
f2fs_down_read(sem);
trace_lock_elapsed_time_start(sem, lc);
}
/*
 * Try to take @sem for reading without blocking, with optional priority
 * uplift and lock tracing.
 *
 * @sem: lock to acquire.
 * @lc:  per-acquisition context, to be passed to f2fs_up_read_trace() when
 *       the lock was acquired.
 *
 * uplift_priority() is called unconditionally and exactly one trylock is
 * attempted. uplift_priority() is what initializes lc->need_restore, which
 * restore_priority() reads both on the failure path below and later on
 * unlock, so it must not be skipped. Attempting the trylock twice would
 * acquire the read lock a second time when the first attempt succeeds.
 *
 * Return: 1 if the lock was acquired, 0 if not (any uplift has been undone).
 */
int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, false);

	if (!f2fs_down_read_trylock(sem)) {
		/* lock not taken: drop the temporary priority right away */
		restore_priority(lc);
		return 0;
	}

	trace_lock_elapsed_time_start(sem, lc);
	return 1;
}
@ -107,19 +163,24 @@ int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_contex
/*
 * Release a read hold on @sem taken through one of the *_read*_trace helpers.
 *
 * restore_priority() is called after f2fs_up_read(), i.e. the nice value
 * recorded in @lc is only put back once the lock has been dropped.
 */
void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
f2fs_up_read(sem);
restore_priority(lc);
trace_lock_elapsed_time_end(sem, lc, false);
}
/*
 * Take @sem for writing, with optional priority uplift and lock tracing.
 *
 * uplift_priority() runs before f2fs_down_write(), so any nice change is
 * already applied while f2fs_down_write() executes. What was changed is
 * recorded in @lc, which must be passed to the matching f2fs_up_write_trace().
 */
void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
uplift_priority(sem, lc, true);
f2fs_down_write(sem);
trace_lock_elapsed_time_start(sem, lc);
}
/*
 * Try to take @sem for writing without blocking, with optional priority
 * uplift and lock tracing.
 *
 * @sem: lock to acquire.
 * @lc:  per-acquisition context, to be passed to f2fs_up_write_trace() when
 *       the lock was acquired.
 *
 * uplift_priority() is called unconditionally and exactly one trylock is
 * attempted. uplift_priority() is what initializes lc->need_restore, which
 * restore_priority() reads both on the failure path below and later on
 * unlock, so it must not be skipped. Attempting the trylock twice would make
 * the second attempt fail against the hold obtained by the first, so the
 * function would report failure while still owning the lock.
 *
 * Return: 1 if the lock was acquired, 0 if not (any uplift has been undone).
 */
int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
	uplift_priority(sem, lc, true);

	if (!f2fs_down_write_trylock(sem)) {
		/* lock not taken: drop the temporary priority right away */
		restore_priority(lc);
		return 0;
	}

	trace_lock_elapsed_time_start(sem, lc);
	return 1;
}
@ -127,6 +188,7 @@ int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_conte
/*
 * Release a write hold on @sem taken through one of the *_write*_trace helpers.
 *
 * restore_priority() is called after f2fs_up_write(), i.e. the nice value
 * recorded in @lc is only put back once the lock has been dropped.
 */
void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
{
f2fs_up_write(sem);
restore_priority(lc);
trace_lock_elapsed_time_end(sem, lc, true);
}

View file

@ -185,6 +185,7 @@ enum f2fs_lock_name {
LOCK_NAME_GC_LOCK,
LOCK_NAME_CP_GLOBAL,
LOCK_NAME_IO_RWSEM,
LOCK_NAME_MAX,
};
enum f2fs_timeout_type {
@ -1447,7 +1448,10 @@ struct f2fs_time_stat {
/*
 * Per-acquisition state handed to the f2fs_*_trace() lock helpers and passed
 * back to the matching unlock helper.
 */
struct f2fs_lock_context {
/* NOTE(review): presumably time statistics used by lock elapsed-time tracing; confirm */
struct f2fs_time_stat ts;
/* nice value of the task sampled by uplift_priority() before any change */
int orig_nice;
/* nice value derived from sbi->lock_duration_priority by uplift_priority() */
int new_nice;
/* NOTE(review): presumably whether elapsed-time tracing is active; not set in the code shown */
bool lock_trace;
/* true only if uplift_priority() changed the nice value; checked by restore_priority() */
bool need_restore;
};
struct f2fs_gc_control {
@ -1588,6 +1592,8 @@ enum node_type {
/* a threshold of maximum elapsed time in critical region to print tracepoint */
#define MAX_LOCK_ELAPSED_TIME 500
#define F2FS_DEFAULT_TASK_PRIORITY (DEFAULT_PRIO)
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@ -1998,6 +2004,12 @@ struct f2fs_sb_info {
/* max elapsed time threshold in critical region that lock covered */
unsigned long long max_lock_elapsed_time;
/* enable/disable to adjust task priority in critical region covered by lock */
unsigned int adjust_lock_priority;
/* adjust priority for task which is in critical region covered by lock */
unsigned int lock_duration_priority;
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */

View file

@ -4338,6 +4338,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
sbi->max_lock_elapsed_time = MAX_LOCK_ELAPSED_TIME;
sbi->adjust_lock_priority = 0;
sbi->lock_duration_priority = F2FS_DEFAULT_TASK_PRIORITY;
sbi->sum_blocksize = f2fs_sb_has_packed_ssa(sbi) ?
4096 : sbi->blocksize;

View file

@ -955,6 +955,20 @@ out:
return count;
}
if (!strcmp(a->attr.name, "adjust_lock_priority")) {
if (t >= BIT(LOCK_NAME_MAX - 1))
return -EINVAL;
sbi->adjust_lock_priority = t;
return count;
}
if (!strcmp(a->attr.name, "lock_duration_priority")) {
if (t < NICE_TO_PRIO(MIN_NICE) || t > NICE_TO_PRIO(MAX_NICE))
return -EINVAL;
sbi->lock_duration_priority = t;
return count;
}
__sbi_store_value(a, sbi, ptr + a->offset, t);
return count;
@ -1272,6 +1286,8 @@ F2FS_SBI_GENERAL_RW_ATTR(carve_out);
F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware);
F2FS_SBI_GENERAL_RW_ATTR(max_lock_elapsed_time);
F2FS_SBI_GENERAL_RW_ATTR(lock_duration_priority);
F2FS_SBI_GENERAL_RW_ATTR(adjust_lock_priority);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@ -1478,6 +1494,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(allocate_section_hint),
ATTR_LIST(allocate_section_policy),
ATTR_LIST(max_lock_elapsed_time),
ATTR_LIST(lock_duration_priority),
ATTR_LIST(adjust_lock_priority),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);