mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 01:04:41 +01:00
btrfs: fix transaction commit blocking during trim of unallocated space
When trimming unallocated space, btrfs_trim_fs() holds the device_list_mutex
for the entire duration while iterating through all devices. On large
filesystems with significant unallocated space, this operation can take
minutes to hours.
This causes a problem because btrfs_run_dev_stats(), which is called
during transaction commit, also requires device_list_mutex:
  btrfs_trim_fs()
    mutex_lock(&fs_devices->device_list_mutex)
    list_for_each_entry(device, ...)
      btrfs_trim_free_extents(device)
    mutex_unlock(&fs_devices->device_list_mutex)

  commit_transaction()
    btrfs_run_dev_stats()
      mutex_lock(&fs_devices->device_list_mutex)  // blocked!
      ...
While trim is running, all transaction commits are blocked waiting for
the mutex.
Fix this by refactoring btrfs_trim_free_extents() to process devices in
bounded chunks (up to 2GB per iteration) and release device_list_mutex
between chunks.
Signed-off-by: robbieko <robbieko@synology.com>
Signed-off-by: jinbaohong <jinbaohong@synology.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
bfb670b918
commit
b291ad4458
2 changed files with 140 additions and 22 deletions
|
|
@ -6513,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
|
|||
* it while performing the free space search since we have already
|
||||
* held back allocations.
|
||||
*/
|
||||
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
||||
static int btrfs_trim_free_extents_throttle(struct btrfs_device *device,
|
||||
u64 *trimmed, u64 pos, u64 *ret_next_pos)
|
||||
{
|
||||
u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0;
|
||||
int ret;
|
||||
u64 start = pos;
|
||||
u64 trim_len = 0;
|
||||
|
||||
*trimmed = 0;
|
||||
|
||||
|
|
@ -6536,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
|||
|
||||
while (1) {
|
||||
struct btrfs_fs_info *fs_info = device->fs_info;
|
||||
u64 cur_start;
|
||||
u64 end;
|
||||
u64 len;
|
||||
u64 bytes;
|
||||
|
||||
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
cur_start = start;
|
||||
btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
|
||||
&start, &end,
|
||||
CHUNK_TRIMMED | CHUNK_ALLOCATED);
|
||||
start = max(start, cur_start);
|
||||
|
||||
/* Check if there are any CHUNK_* bits left */
|
||||
if (start > device->total_bytes) {
|
||||
|
|
@ -6570,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
|||
end = min(end, device->total_bytes - 1);
|
||||
|
||||
len = end - start + 1;
|
||||
len = min(len, BTRFS_MAX_TRIM_LENGTH);
|
||||
|
||||
/* We didn't find any extents */
|
||||
if (!len) {
|
||||
|
|
@ -6590,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
|||
|
||||
start += len;
|
||||
*trimmed += bytes;
|
||||
trim_len += len;
|
||||
if (trim_len >= BTRFS_MAX_TRIM_LENGTH) {
|
||||
*ret_next_pos = start;
|
||||
ret = -EAGAIN;
|
||||
break;
|
||||
}
|
||||
|
||||
if (btrfs_trim_interrupted()) {
|
||||
ret = -ERESTARTSYS;
|
||||
|
|
@ -6602,6 +6616,122 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Trim the unallocated space of every non-missing device of the filesystem,
 * one device at a time, in bounded passes.
 *
 * Each call to btrfs_trim_free_extents_throttle() is made with
 * device_list_mutex held; the mutex is dropped after every pass (both when
 * the helper asks to be called again via -EAGAIN and when moving on to the
 * next device), so it is never held across the trim of a whole device.
 *
 * Devices are visited in ascending UUID order. A private copy of the current
 * device's UUID is kept so that the next device can be looked up again after
 * the mutex has been released and re-acquired.
 *
 * @fs_info:    filesystem whose devices are trimmed
 * @trimmed:    out: sum of the amounts reported by the helper; reset to 0 on
 *              entry
 * @dev_failed: out: number of devices for which the helper returned an error
 *              (including -EAGAIN without forward progress); reset to 0 on
 *              entry
 * @dev_ret:    out: first such error code, 0 if none; reset to 0 on entry
 *
 * Return: -ERESTARTSYS or -EINTR if the helper returned one of them,
 * otherwise 0 (per-device failures are reported only through @dev_failed and
 * @dev_ret).
 */
static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed,
|
||||
u64 *dev_failed, int *dev_ret)
|
||||
{
|
||||
struct btrfs_device *dev;
|
||||
struct btrfs_device *working_dev = NULL;
|
||||
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
|
||||
/* UUID of the device currently being trimmed; survives mutex drops. */
u8 uuid[BTRFS_UUID_SIZE];
|
||||
/* Position on the current device at which the next pass starts. */
u64 start = BTRFS_DEVICE_RANGE_RESERVED;
|
||||
|
||||
*trimmed = 0;
|
||||
*dev_failed = 0;
|
||||
*dev_ret = 0;
|
||||
|
||||
/* Find the device with the smallest UUID to start. */
|
||||
mutex_lock(&fs_devices->device_list_mutex);
|
||||
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
|
||||
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
|
||||
continue;
|
||||
if (!working_dev ||
|
||||
memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
|
||||
working_dev = dev;
|
||||
}
|
||||
if (working_dev)
|
||||
memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
|
||||
mutex_unlock(&fs_devices->device_list_mutex);
|
||||
|
||||
/* No non-missing device at all: nothing to trim. */
if (!working_dev)
|
||||
return 0;
|
||||
|
||||
while (1) {
|
||||
u64 group_trimmed = 0;
|
||||
u64 next_pos = 0;
|
||||
int ret = 0;
|
||||
|
||||
mutex_lock(&fs_devices->device_list_mutex);
|
||||
|
||||
/* Find and trim the current device. */
|
||||
/*
 * working_dev was recorded before the mutex was last dropped, so it is
 * only compared by address here and dereferenced only if it is still
 * found on the list. If it is not found (presumably because the device
 * went away in the meantime), ret stays 0 and the code below simply
 * advances to the next UUID.
 */
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
|
||||
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
|
||||
continue;
|
||||
if (dev == working_dev) {
|
||||
ret = btrfs_trim_free_extents_throttle(working_dev,
|
||||
&group_trimmed, start, &next_pos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Throttle: continue the same device from the new position. */
|
||||
/* Requiring next_pos > start guarantees forward progress on each retry. */
if (ret == -EAGAIN && next_pos > start) {
|
||||
mutex_unlock(&fs_devices->device_list_mutex);
|
||||
*trimmed += group_trimmed;
|
||||
start = next_pos;
|
||||
cond_resched();
|
||||
continue;
|
||||
}
|
||||
|
||||
/* User interrupted. */
|
||||
if (ret == -ERESTARTSYS || ret == -EINTR) {
|
||||
mutex_unlock(&fs_devices->device_list_mutex);
|
||||
*trimmed += group_trimmed;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Device completed (ret == 0), failed, or EAGAIN with no progress.
|
||||
* Record error if any, then move to next device.
|
||||
*/
|
||||
/* -EAGAIN can only reach this point when next_pos <= start. */
if (ret == -EAGAIN) {
|
||||
/* No progress - log and skip device. */
|
||||
btrfs_warn(fs_info,
|
||||
"trim throttle: no progress, offset=%llu device %s, skipping",
|
||||
start, btrfs_dev_name(working_dev));
|
||||
(*dev_failed)++;
|
||||
if (!*dev_ret)
|
||||
*dev_ret = ret;
|
||||
} else if (ret) {
|
||||
/* Device failed with error. */
|
||||
(*dev_failed)++;
|
||||
if (!*dev_ret)
|
||||
*dev_ret = ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find next device: smallest UUID larger than current.
|
||||
* Devices added during trim with smaller UUID will be skipped.
|
||||
*/
|
||||
working_dev = NULL;
|
||||
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
|
||||
if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
|
||||
continue;
|
||||
/* Must be larger than the current UUID. */
|
||||
if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0)
|
||||
continue;
|
||||
/* Find the smallest. */
|
||||
if (!working_dev ||
|
||||
memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0)
|
||||
working_dev = dev;
|
||||
}
|
||||
/* Remember the new current UUID while the mutex is still held. */
if (working_dev)
|
||||
memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE);
|
||||
|
||||
mutex_unlock(&fs_devices->device_list_mutex);
|
||||
|
||||
*trimmed += group_trimmed;
|
||||
/* The next device is trimmed from the start of its usable range. */
start = BTRFS_DEVICE_RANGE_RESERVED;
|
||||
|
||||
/* No more devices. */
|
||||
if (!working_dev)
|
||||
break;
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Trim the whole filesystem by:
|
||||
* 1) trimming the free space in each block group
|
||||
|
|
@ -6613,9 +6743,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
|
|||
*/
|
||||
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
|
||||
{
|
||||
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
|
||||
struct btrfs_block_group *cache = NULL;
|
||||
struct btrfs_device *device;
|
||||
u64 group_trimmed;
|
||||
u64 range_end = U64_MAX;
|
||||
u64 start;
|
||||
|
|
@ -6686,24 +6814,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
|
|||
if (ret == -ERESTARTSYS || ret == -EINTR)
|
||||
return ret;
|
||||
|
||||
mutex_lock(&fs_devices->device_list_mutex);
|
||||
list_for_each_entry(device, &fs_devices->devices, dev_list) {
|
||||
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
|
||||
continue;
|
||||
|
||||
ret = btrfs_trim_free_extents(device, &group_trimmed);
|
||||
|
||||
trimmed += group_trimmed;
|
||||
if (ret == -ERESTARTSYS || ret == -EINTR)
|
||||
break;
|
||||
if (ret) {
|
||||
dev_failed++;
|
||||
if (!dev_ret)
|
||||
dev_ret = ret;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&fs_devices->device_list_mutex);
|
||||
ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret);
|
||||
trimmed += group_trimmed;
|
||||
|
||||
if (dev_failed)
|
||||
btrfs_warn(fs_info,
|
||||
|
|
|
|||
|
|
@ -65,6 +65,12 @@ struct btrfs_space_info;
|
|||
|
||||
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
|
||||
|
||||
/*
|
||||
* Maximum length to trim in a single iteration to avoid holding device list
|
||||
* mutex for too long.
|
||||
*/
|
||||
#define BTRFS_MAX_TRIM_LENGTH SZ_2G
|
||||
|
||||
#define BTRFS_OLDEST_GENERATION 0ULL
|
||||
|
||||
#define BTRFS_EMPTY_DIR_SIZE 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue