From 828ec765f7968c636c4c163c050ad13da959adef Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 23 Sep 2025 10:57:02 -0700 Subject: [PATCH 001/147] btrfs: ignore ENOMEM from alloc_bitmap() btrfs_convert_free_space_to_bitmaps() and btrfs_convert_free_space_to_extents() both allocate a bitmap struct with: bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; btrfs_abort_transaction(trans); return ret; } This conversion is done based on a heuristic and the check triggers each time we call update_free_space_extent_count() on a block group (each time we add/remove an extent or modify a bitmap). Furthermore, nothing relies on maintaining some invariant of bitmap density, it's just an optimization for space usage. Therefore, it is safe to simply ignore any memory allocation errors that occur, rather than aborting the transaction and leaving the fs read only. Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index d86541073d42..9ed36bbe9d35 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -218,11 +218,8 @@ int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return 0; start = block_group->start; end = block_group->start + block_group->length; @@ -361,11 +358,8 @@ int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_size = free_space_bitmap_size(fs_info, block_group->length); bitmap = alloc_bitmap(bitmap_size); - if (unlikely(!bitmap)) { - ret 
= -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } + if (unlikely(!bitmap)) + return 0; start = block_group->start; end = block_group->start + block_group->length; From 69e293d28a95ef2652014411038d91867e16e757 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 24 Sep 2025 17:10:27 +0100 Subject: [PATCH 002/147] btrfs: use single return value variable in btrfs_relocate_block_group() We are using 'ret' and 'err' variables to track return values and errors, which is pattern that is error prone and we had quite some bugs due to this pattern in the past. Simplify this and use a single variable, named 'ret', to track errors and the return value. Also rename the variable 'rw' to 'bg_is_ro' which is more meaningful name, and change its type from int to bool. Reviewed-by: Boris Burkov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0765e06d00b8..748290758459 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3882,8 +3882,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, struct inode *inode; struct btrfs_path *path; int ret; - int rw = 0; - int err = 0; + bool bg_is_ro = false; /* * This only gets set if we had a half-deleted snapshot on mount. 
We @@ -3925,24 +3924,20 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, } ret = reloc_chunk_start(fs_info); - if (ret < 0) { - err = ret; + if (ret < 0) goto out_put_bg; - } rc->extent_root = extent_root; rc->block_group = bg; ret = btrfs_inc_block_group_ro(rc->block_group, true); - if (ret) { - err = ret; + if (ret) goto out; - } - rw = 1; + bg_is_ro = true; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -3954,14 +3949,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, else ret = PTR_ERR(inode); - if (ret && ret != -ENOENT) { - err = ret; + if (ret && ret != -ENOENT) goto out; - } rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); + ret = PTR_ERR(rc->data_inode); rc->data_inode = NULL; goto out; } @@ -3982,8 +3975,6 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); - if (ret < 0) - err = ret; finishes_stage = rc->stage; /* @@ -3996,16 +3987,18 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, * out of the loop if we hit an error. 
*/ if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { - ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, - (u64)-1); - if (ret) - err = ret; + int wb_ret; + + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, + (u64)-1); + if (wb_ret && ret == 0) + ret = wb_ret; invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; } - if (err < 0) + if (ret < 0) goto out; if (rc->extents_found == 0) @@ -4021,14 +4014,14 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, WARN_ON(rc->block_group->reserved > 0); WARN_ON(rc->block_group->used > 0); out: - if (err && rw) + if (ret && bg_is_ro) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); free_reloc_control(rc); - return err; + return ret; } static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) From 4e700ac62ac12ffe8579680e6987b27066339d71 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 25 Sep 2025 19:23:22 +0930 Subject: [PATCH 003/147] btrfs: remove unnecessary NULL fs_info check from find_lock_delalloc_range() [STATIC CHECK REPORT] Smatch is reporting that find_lock_delalloc_range() used to do a null pointer check before accessing fs_info, but now we're accessing it for sectorsize unconditionally. [FALSE ALERT] This is a false alert, the existing null pointer check was introduced in commit f7b12a62f008 ("btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size"), but way before that, commit 7c0260ee098d ("btrfs: tests, require fs_info for root") is already forcing every btrfs_root to have a correct fs_info pointer. So there is no way that btrfs_root::fs_info is NULL. [FIX] Just remove the unnecessary NULL pointer check. 
Reported-by: kernel test robot Reported-by: Dan Carpenter Fixes: f7b12a62f008 ("btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size") Closes: https://lore.kernel.org/r/202509250925.4L4JQTtn-lkp@intel.com/ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 23273d0e6f22..3804029978ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -374,8 +374,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - /* The sanity tests may not set a valid fs_info. */ - u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; + u64 max_bytes = fs_info->max_extent_size; u64 delalloc_start; u64 delalloc_end; bool found; From 9594783e4b4901cbc28ff940eff7e2554a4f7801 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 26 Sep 2025 08:32:56 +0200 Subject: [PATCH 004/147] btrfs: print-tree: use string format for key names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's a warning when -Wformat=2 is used: fs/btrfs/print-tree.c: In function ‘key_type_string’: fs/btrfs/print-tree.c:424:17: warning: format not a string literal and no format arguments [-Wformat-nonliteral] 424 | scnprintf(buf, buf_size, key_to_str[key->type]); We're printing fixed strings from a table so there's no problem but let's fix the warning so we could enable the warning in fs/btrfs/. 
Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 62b993fae54f..d16f2960d55d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -421,7 +421,7 @@ static void key_type_string(const struct btrfs_key *key, char *buf, int buf_size if (key->type == 0 && key->objectid == BTRFS_FREE_SPACE_OBJECTID) scnprintf(buf, buf_size, "UNTYPED"); else if (key_to_str[key->type]) - scnprintf(buf, buf_size, key_to_str[key->type]); + scnprintf(buf, buf_size, "%s", key_to_str[key->type]); else scnprintf(buf, buf_size, "UNKNOWN.%d", key->type); } From aebe2bb0b861795cd832473b7257c6cc1cd086d0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 26 Sep 2025 11:47:30 +0200 Subject: [PATCH 005/147] btrfs: fix trivial -Wshadow warnings When compiling with -Wshadow (also in 'make W=2' build) there are several reports of shadowed variables that seem to be harmless: - btrfs_do_encoded_write() - we can reuse 'ordered', there's no previous value that would need to be preserved - scrub_write_endio() - we need a standalone 'i' for bio iteration - scrub_stripe() - duplicate ret2 for errors that must not overwrite 'ret' - btrfs_subpage_set_writeback() - 'flags' is used for another irqsave lock but is not overwritten when reused for xarray due to scoping, but for clarity let's rename it - process_dir_items_leaf() - duplicate 'ret', used only for immediate checks Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 -- fs/btrfs/scrub.c | 4 +--- fs/btrfs/subpage.c | 6 +++--- fs/btrfs/tree-log.c | 3 --- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6282911e536f..6131589aba7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9826,8 +9826,6 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } for (;;) { - struct 
btrfs_ordered_extent *ordered; - ret = btrfs_wait_ordered_range(inode, start, num_bytes); if (ret) goto out_folios; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ba20d9286a34..e760e76df3f0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1284,7 +1284,7 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); - for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); } @@ -2529,8 +2529,6 @@ out: } if (sctx->is_dev_replace && ret >= 0) { - int ret2; - ret2 = sync_write_pointer_for_zoned(sctx, chunk_logical + offset, map->stripes[stripe_index].physical, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 5ca8d4db6722..01bf58fa92aa 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -460,12 +460,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, if (!folio_test_dirty(folio)) { struct address_space *mapping = folio_mapping(folio); XA_STATE(xas, &mapping->i_pages, folio->index); - unsigned long flags; + unsigned long xa_flags; - xas_lock_irqsave(&xas, flags); + xas_lock_irqsave(&xas, xa_flags); xas_load(&xas); xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); - xas_unlock_irqrestore(&xas, flags); + xas_unlock_irqrestore(&xas, xa_flags); } spin_unlock_irqrestore(&bfs->lock, flags); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 30f3c3b849c1..f7e5fe3adff2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4154,7 +4154,6 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans, for (int i = path->slots[0]; i < nritems; i++) { struct btrfs_dir_item *di; struct btrfs_key key; - int ret; btrfs_item_key_to_cpu(src, &key, i); @@ -4224,8 +4223,6 @@ static int process_dir_items_leaf(struct 
btrfs_trans_handle *trans, } if (batch_size > 0) { - int ret; - ret = flush_dir_items_batch(trans, inode, src, dst_path, batch_start, batch_size); if (ret < 0) From 2346b966c66a7b9cfef948939ae0526bebb4bef7 Mon Sep 17 00:00:00 2001 From: Mehdi Ben Hadj Khelifa Date: Tue, 30 Sep 2025 11:03:44 +0100 Subject: [PATCH 006/147] btrfs: refactor allocation size calculation in alloc_btrfs_io_context() Use struct_size() to replace the open-coded calculation, remove the comment as use of the helper is self explanatory. Reviewed-by: Qu Wenruo Signed-off-by: Mehdi Ben Hadj Khelifa Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2bec544d8ba3..65b02a93db31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6076,12 +6076,7 @@ struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, { struct btrfs_io_context *bioc; - bioc = kzalloc( - /* The size of btrfs_io_context */ - sizeof(struct btrfs_io_context) + - /* Plus the variable array for the stripes */ - sizeof(struct btrfs_io_stripe) * (total_stripes), - GFP_NOFS); + bioc = kzalloc(struct_size(bioc, stripes, total_stripes), GFP_NOFS); if (!bioc) return NULL; From 2215e6b4034a4a850b4f10ada09ac9ceac38817f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 29 Sep 2025 14:41:15 +0200 Subject: [PATCH 007/147] btrfs: subpage: rename macro variables to avoid shadowing When compiling with -Wshadow there are warnings in the subpage helper macros that are used in functions like btrfs_subpage_dump_bitmap() or btrfs_subpage_clear_and_test_dirty() that also use 'bfs' (for struct btrfs_folio_state) or blocks_per_folio. Add '__' to the macro variables and unify naming in all subpage macros. 
Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 01bf58fa92aa..0a4a1ee81e63 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -194,12 +194,11 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int __start_bit; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ - __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ + __start_bit += __bpf * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -338,24 +337,20 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, #define subpage_test_bitmap_all_set(fs_info, folio, name) \ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_set(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + bitmap_test_range_all_set(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) #define subpage_test_bitmap_all_zero(fs_info, folio, name) \ ({ \ - struct btrfs_folio_state *bfs = folio_get_private(folio); \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + struct btrfs_folio_state *__bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ - bitmap_test_range_all_zero(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - 
blocks_per_folio); \ + bitmap_test_range_all_zero(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ }) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, @@ -672,27 +667,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, #define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ - const struct btrfs_folio_state *bfs = folio_get_private(folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_folio_state *__bfs = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio <= BITS_PER_LONG); \ - *dst = bitmap_read(bfs->bitmaps, \ - blocks_per_folio * btrfs_bitmap_nr_##name, \ - blocks_per_folio); \ + ASSERT(__bpf <= BITS_PER_LONG); \ + *dst = bitmap_read(__bfs->bitmaps, \ + __bpf * btrfs_bitmap_nr_##name, __bpf); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ unsigned long bitmap; \ - const unsigned int blocks_per_folio = \ - btrfs_blocks_per_folio(fs_info, folio); \ + const unsigned int __bpf = btrfs_blocks_per_folio(fs_info, folio); \ \ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ - start, len, folio_pos(folio), \ - blocks_per_folio, &bitmap); \ + start, len, folio_pos(folio), __bpf, &bitmap); \ } /* From 725e46298876a2cc1f1c3fb22ba69d29102c3ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Wed, 1 Oct 2025 20:05:03 +0200 Subject: [PATCH 008/147] btrfs: fix double free of qgroup record after failure to add delayed ref head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the previous code it was possible to incur into a double kfree() scenario when calling add_delayed_ref_head(). 
This could happen if the record was reported to already exist in the btrfs_qgroup_trace_extent_nolock() call, but then there was an error later on add_delayed_ref_head(). In this case, since add_delayed_ref_head() returned an error, the caller went to free the record. Since add_delayed_ref_head() couldn't set this kfree'd pointer to NULL, then kfree() would have acted on a non-NULL 'record' object which was pointing to memory already freed by the callee. The problem comes from the fact that the responsibility to kfree the object is on both the caller and the callee at the same time. Hence, the fix for this is to shift the ownership of the 'qrecord' object out of the add_delayed_ref_head(). That is, we will never attempt to kfree() the given object inside of this function, and will expect the caller to act on the 'qrecord' object on its own. The only exception where the 'qrecord' object cannot be kfree'd is if it was inserted into the tracing logic, for which we already have the 'qrecord_inserted_ret' boolean to account for this. Hence, the caller has to kfree the object only if add_delayed_ref_head() reports not to have inserted it on the tracing logic. As a side-effect of the above, we must guarantee that 'qrecord_inserted_ret' is properly initialized at the start of the function, not at the end, and then set when an actual insert happens. This way we avoid 'qrecord_inserted_ret' having an invalid value on an early exit. The documentation from the add_delayed_ref_head() has also been updated to reflect on the exact ownership of the 'qrecord' object. 
Fixes: 6ef8fbce0104 ("btrfs: fix missing error handling when adding delayed ref with qgroups enabled") Reviewed-by: Filipe Manana Signed-off-by: Miquel Sabaté Solà Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 43 ++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 481802efaa14..f8fc26272f76 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -798,9 +798,13 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, } /* - * helper function to actually insert a head node into the rbtree. - * this does all the dirty work in terms of maintaining the correct - * overall modification count. + * Helper function to actually insert a head node into the xarray. This does all + * the dirty work in terms of maintaining the correct overall modification + * count. + * + * The caller is responsible for calling kfree() on @qrecord. More specifically, + * if this function reports that it did not insert it as noted in + * @qrecord_inserted_ret, then it's safe to call kfree() on it. * * Returns an error pointer in case of an error. */ @@ -814,7 +818,14 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_root *delayed_refs; const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits); - bool qrecord_inserted = false; + + /* + * If 'qrecord_inserted_ret' is provided, then the first thing we need + * to do is to initialize it to false just in case we have an exit + * before trying to insert the record. 
+ */ + if (qrecord_inserted_ret) + *qrecord_inserted_ret = false; delayed_refs = &trans->transaction->delayed_refs; lockdep_assert_held(&delayed_refs->lock); @@ -833,6 +844,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, /* Record qgroup extent info if provided */ if (qrecord) { + /* + * Setting 'qrecord' but not 'qrecord_inserted_ret' will likely + * result in a memory leakage. + */ + ASSERT(qrecord_inserted_ret != NULL); + int ret; ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord, @@ -840,12 +857,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, if (ret) { /* Clean up if insertion fails or item exists. */ xa_release(&delayed_refs->dirty_extents, index); - /* Caller responsible for freeing qrecord on error. */ if (ret < 0) return ERR_PTR(ret); - kfree(qrecord); - } else { - qrecord_inserted = true; + } else if (qrecord_inserted_ret) { + *qrecord_inserted_ret = true; } } @@ -888,8 +903,6 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, delayed_refs->num_heads++; delayed_refs->num_heads_ready++; } - if (qrecord_inserted_ret) - *qrecord_inserted_ret = qrecord_inserted; return head_ref; } @@ -1049,6 +1062,14 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, xa_release(&delayed_refs->head_refs, index); spin_unlock(&delayed_refs->lock); ret = PTR_ERR(new_head_ref); + + /* + * It's only safe to call kfree() on 'qrecord' if + * add_delayed_ref_head() has _not_ inserted it for + * tracing. Otherwise we need to handle this here. 
+ */ + if (!qrecord_reserved || qrecord_inserted) + goto free_head_ref; goto free_record; } head_ref = new_head_ref; @@ -1071,6 +1092,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans, if (qrecord_inserted) return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr); + + kfree(record); return 0; free_record: From 745483ea988b4abba63a881931675268d57b0f36 Mon Sep 17 00:00:00 2001 From: Rajeev Tapadia Date: Fri, 3 Oct 2025 19:00:02 +0530 Subject: [PATCH 009/147] btrfs: fix comment in alloc_bitmap() and drop stale TODO All callers of alloc_bitmap() hold a transaction handle, so GFP_NOFS is needed to avoid deadlocks on recursion. Update the comment and drop the stale TODO. Reviewed-by: Filipe Manana Signed-off-by: Rajeev Tapadia Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 9ed36bbe9d35..26eae347739f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -165,11 +165,9 @@ static unsigned long *alloc_bitmap(u32 bitmap_size) /* * GFP_NOFS doesn't work with kvmalloc(), but we really can't recurse - * into the filesystem as the free space bitmap can be modified in the - * critical section of a transaction commit. - * - * TODO: push the memalloc_nofs_{save,restore}() to the caller where we - * know that recursion is unsafe. + * into the filesystem here. All callers hold a transaction handle + * open, so if a GFP_KERNEL allocation recurses into the filesystem + * and triggers a transaction commit, we would deadlock. 
*/ nofs_flag = memalloc_nofs_save(); ret = kvzalloc(bitmap_rounded_size, GFP_KERNEL); From 38e818718c5e04961eea0fa8feff3f100ce40408 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 1 Oct 2025 17:20:22 -0700 Subject: [PATCH 010/147] btrfs: fix racy bitfield write in btrfs_clear_space_info_full() From the memory-barriers.txt document regarding memory barrier ordering guarantees: (*) These guarantees do not apply to bitfields, because compilers often generate code to modify these using non-atomic read-modify-write sequences. Do not attempt to use bitfields to synchronize parallel algorithms. (*) Even in cases where bitfields are protected by locks, all fields in a given bitfield must be protected by one lock. If two fields in a given bitfield are protected by different locks, the compiler's non-atomic read-modify-write sequences can cause an update to one field to corrupt the value of an adjacent field. btrfs_space_info has a bitfield sharing an underlying word consisting of the fields full, chunk_alloc, and flush: struct btrfs_space_info { struct btrfs_fs_info * fs_info; /* 0 8 */ struct btrfs_space_info * parent; /* 8 8 */ ... int clamp; /* 172 4 */ unsigned int full:1; /* 176: 0 4 */ unsigned int chunk_alloc:1; /* 176: 1 4 */ unsigned int flush:1; /* 176: 2 4 */ ... Therefore, to be safe from parallel read-modify-writes losing a write to one of the bitfield members protected by a lock, all writes to all the bitfields must use the lock. They almost universally do, except for btrfs_clear_space_info_full() which iterates over the space_infos and writes out found->full = 0 without a lock. 
Imagine that we have one thread completing a transaction in which we finished deleting a block_group and are thus calling btrfs_clear_space_info_full() while simultaneously the data reclaim ticket infrastructure is running do_async_reclaim_data_space(): T1 T2 btrfs_commit_transaction btrfs_clear_space_info_full data_sinfo->full = 0 READ: full:0, chunk_alloc:0, flush:1 do_async_reclaim_data_space(data_sinfo) spin_lock(&space_info->lock); if(list_empty(tickets)) space_info->flush = 0; READ: full: 0, chunk_alloc:0, flush:1 MOD/WRITE: full: 0, chunk_alloc:0, flush:0 spin_unlock(&space_info->lock); return; MOD/WRITE: full:0, chunk_alloc:0, flush:1 and now data_sinfo->flush is 1 but the reclaim worker has exited. This breaks the invariant that flush is 0 iff there is no work queued or running. Once this invariant is violated, future allocations that go into __reserve_bytes() will add tickets to space_info->tickets but will see space_info->flush is set to 1 and not queue the work. After this, they will block forever on the resulting ticket, as it is now impossible to kick the worker again. I also confirmed by looking at the assembly of the affected kernel that it is doing RMW operations. For example, to set the flush (3rd) bit to 0, the assembly is: andb $0xfb,0x60(%rbx) and similarly for setting the full (1st) bit to 0: andb $0xfe,-0x20(%rax) So I think this is really a bug on practical systems. I have observed a number of systems in this exact state, but am currently unable to reproduce it. Rather than leaving this footgun lying around for the future, take advantage of the fact that there is room in the struct anyway, and that it is already quite large and simply change the three bitfield members to bools. This avoids writes to space_info->full having any effect on writes to space_info->flush, regardless of locking. 
Fixes: 957780eb2788 ("Btrfs: introduce ticketed enospc infrastructure") Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 +++--- fs/btrfs/space-info.c | 22 +++++++++++----------- fs/btrfs/space-info.h | 6 +++--- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5322ef2ae015..8bf501fbcc0b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4215,7 +4215,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->chunk_mutex); } else { /* Proceed with allocation */ - space_info->chunk_alloc = 1; + space_info->chunk_alloc = true; wait_for_alloc = false; spin_unlock(&space_info->lock); } @@ -4264,7 +4264,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) - space_info->full = 1; + space_info->full = true; else goto out; } else { @@ -4274,7 +4274,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; out: - space_info->chunk_alloc = 0; + space_info->chunk_alloc = false; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 97452fb5d29b..85c466c85910 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -192,7 +192,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct btrfs_space_info *found; list_for_each_entry(found, head, list) - found->full = 0; + found->full = false; } /* @@ -372,7 +372,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, space_info->bytes_readonly += block_group->bytes_super; btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - space_info->full = 0; + space_info->full = false; btrfs_try_granting_tickets(info, space_info); 
spin_unlock(&space_info->lock); @@ -1146,7 +1146,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1158,7 +1158,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) flush_space(fs_info, space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1201,7 +1201,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { - space_info->flush = 0; + space_info->flush = false; } } else { flush_state = FLUSH_DELAYED_ITEMS_NR; @@ -1383,7 +1383,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1394,7 +1394,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1411,7 +1411,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); return; } @@ -1428,7 +1428,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) if (maybe_fail_all_tickets(fs_info, 
space_info)) flush_state = 0; else - space_info->flush = 0; + space_info->flush = false; } else { flush_state = 0; } @@ -1444,7 +1444,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) aborted_fs: maybe_fail_all_tickets(fs_info, space_info); - space_info->flush = 0; + space_info->flush = false; spin_unlock(&space_info->lock); } @@ -1825,7 +1825,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ maybe_clamp_preempt(fs_info, space_info); - space_info->flush = 1; + space_info->flush = true; trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 679f22efb407..a846f63585c9 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -142,11 +142,11 @@ struct btrfs_space_info { flushing. The value is >> clamp, so turns out to be a 2^clamp divisor. */ - unsigned int full:1; /* indicates that we cannot allocate any more + bool full; /* indicates that we cannot allocate any more chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + bool chunk_alloc; /* set if we are allocating a chunk */ - unsigned int flush:1; /* set if we are trying to make space */ + bool flush; /* set if we are trying to make space */ unsigned int force_alloc; /* set if we need to force a chunk alloc for this space */ From 892794c02532b78c18bab5647675a230362cab9a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 7 Oct 2025 11:14:37 +0100 Subject: [PATCH 011/147] btrfs: use end_pos variable where needed in btrfs_dirty_folio() We have a couple places doing the computation "pos + write_bytes" when we already have it in the local variable "end_pos". Change them to use the variable instead and make source code smaller. Also make the variable const since it's not supposed to change. This also has a very slight reduction in the module size. 
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1915990 161647 15592 2093229 1ff0ad fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1915974 161647 15592 2093213 1ff09d fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa82def46e39..30986a625bdb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -75,7 +75,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos u64 num_bytes; u64 start_pos; u64 end_of_last_block; - u64 end_pos = pos + write_bytes; + const u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; @@ -86,10 +86,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos extra_bits |= EXTENT_NORESERVE; start_pos = round_down(pos, fs_info->sectorsize); - num_bytes = round_up(write_bytes + pos - start_pos, - fs_info->sectorsize); + num_bytes = round_up(end_pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); - ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes); + ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= end_pos); end_of_last_block = start_pos + num_bytes - 1; From 9b2839451dfc1eccd48972c675bb0e8e9d050d56 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Oct 2025 10:22:03 +1030 Subject: [PATCH 012/147] btrfs: introduce a new shutdown state A new fs state EMERGENCY_SHUTDOWN is introduced, which is btrfs' equivalent of XFS_IOC_GOINGDOWN or EXT4_IOC_SHUTDOWN, after entering emergency shutdown state, all operations will return errors (-EIO), and the filesystem cannot be brought back to a normal state until unmount. 
The new state will reject the following file operations: - read_iter() - write_iter() - mmap() - open() - remap_file_range() - uring_cmd() - splice_read() This requires a small wrapper to do the extra shutdown check, then call the regular filemap_splice_read() function. This should reject most of the file operations on a shutdown btrfs. And for the existing dirty folios, extra shutdown checks are introduced to the following functions: - run_delalloc_nocow() - run_delalloc_compressed() - cow_file_range() So that dirty ranges will still be properly cleaned without being submitted. Finally the shutdown state will also set the fs error, so that no new transaction will be committed, protecting the metadata from any possible further corruption. And when the fs enters shutdown mode for the first time, a critical level kernel message will show up to indicate the incident. That message will be important for end users as rejected delalloc ranges will output error messages, hopefully that shutdown message and the fact that all fs operations are returning errors will prevent end users from getting too confused about the delalloc error messages. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Tested-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 25 ++++++++++++++++++++++++- fs/btrfs/fs.h | 28 ++++++++++++++++++++++++++++ fs/btrfs/inode.c | 16 ++++++++++++++-- fs/btrfs/ioctl.c | 3 +++ fs/btrfs/messages.c | 1 + fs/btrfs/reflink.c | 3 +++ 6 files changed, 73 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30986a625bdb..1e0ff3d7210d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1440,6 +1440,8 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; + if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) + return -EIO; /* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation @@ -2042,6 +2044,8 @@ static int btrfs_file_mmap_prepare(struct vm_area_desc *desc) struct file *filp = desc->file; struct address_space *mapping = filp->f_mapping; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) + return -EIO; if (!mapping->a_ops->read_folio) return -ENOEXEC; @@ -3111,6 +3115,9 @@ static long btrfs_fallocate(struct file *file, int mode, int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + /* Do not allow fallocate in ZONED mode */ if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; @@ -3802,6 +3809,9 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) + return -EIO; + filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); @@ -3814,6 +3824,9 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; + if 
(unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) + return -EIO; + if (iocb->ki_flags & IOCB_DIRECT) { ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || @@ -3824,10 +3837,20 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return filemap_read(iocb, to, ret); } +static ssize_t btrfs_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, - .splice_read = filemap_splice_read, + .splice_read = btrfs_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap_prepare = btrfs_file_mmap_prepare, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 814bbc9417d2..c83fd192a7dc 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -29,6 +29,7 @@ #include "extent-io-tree.h" #include "async-thread.h" #include "block-rsv.h" +#include "messages.h" struct inode; struct super_block; @@ -124,6 +125,12 @@ enum { /* No more delayed iput can be queued. */ BTRFS_FS_STATE_NO_DELAYED_IPUT, + /* + * Emergency shutdown, a step further than transaction aborted by + * rejecting all operations. + */ + BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, + BTRFS_FS_STATE_COUNT }; @@ -1120,6 +1127,27 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) +static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); +} + +static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) +{ + /* + * Here we do not want to use handle_fs_error(), which will mark the fs + * read-only. 
+ * Some call sites like shutdown ioctl will mark the fs shutdown when + * the fs is frozen. But thaw path will handle RO and RW fs + * differently. + * + * So here we only mark the fs error without flipping it RO. + */ + WRITE_ONCE(fs_info->fs_error, -EIO); + if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) + btrfs_crit(fs_info, "emergency shutdown"); +} + /* * We use folio flag owner_2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6131589aba7c..15131873f73d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -864,7 +864,7 @@ static void compress_file_range(struct btrfs_work *work) u64 actual_end; u64 i_size; int ret = 0; - struct folio **folios; + struct folio **folios = NULL; unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; @@ -873,6 +873,9 @@ static void compress_file_range(struct btrfs_work *work) int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; + if (unlikely(btrfs_is_shutdown(fs_info))) + goto cleanup_and_bail_uncompressed; + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* @@ -1288,6 +1291,11 @@ static noinline int cow_file_range(struct btrfs_inode *inode, unsigned long page_ops; int ret = 0; + if (unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto out_unlock; + } + if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; @@ -2006,7 +2014,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + struct btrfs_path *path = NULL; u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() @@ -2036,6 +2044,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + if 
(unlikely(btrfs_is_shutdown(fs_info))) { + ret = -EIO; + goto error; + } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8cb7d5a462ef..803556ec0e18 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5077,6 +5077,9 @@ out_acct: int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) + return -EIO; + switch (cmd->cmd_op) { case BTRFS_IOC_ENCODED_READ: #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index a0cf8effe008..2f853de44473 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -24,6 +24,7 @@ static const char fs_state_chars[] = { [BTRFS_FS_STATE_NO_DATA_CSUMS] = 'C', [BTRFS_FS_STATE_SKIP_META_CSUMS] = 'S', [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', + [BTRFS_FS_STATE_EMERGENCY_SHUTDOWN] = 'E', }; static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5465a5eae9b2..1bbe3bb7e1bb 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -868,6 +868,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, bool same_inode = dst_inode == src_inode; int ret; + if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) + return -EIO; + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; From 6b1ac78dd0f29fe66421c460c12ec15e45af38c3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Oct 2025 10:22:04 +1030 Subject: [PATCH 013/147] btrfs: implement shutdown ioctl The shutdown ioctl should follow the XFS one, which use magic number 'X', and ioctl number 125, with a uint32 as flags. For now btrfs don't distinguish DEFAULT and LOGFLUSH flags (just like f2fs), both will freeze the fs first (implies committing the current transaction), setting the SHUTDOWN flag and finally thaw the fs. 
For NOLOGFLUSH flag, the freeze/thaw part is skipped thus the current transaction is aborted. The new shutdown ioctl is hidden behind experimental features for more testing. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Tested-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 41 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 9 +++++++++ 2 files changed, 50 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 803556ec0e18..127b5d8303a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5223,6 +5223,43 @@ static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *a return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_ioctl_shutdown(struct btrfs_fs_info *fs_info, unsigned long arg) +{ + int ret = 0; + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (u32 __user *)arg)) + return -EFAULT; + + if (flags >= BTRFS_SHUTDOWN_FLAGS_LAST) + return -EINVAL; + + if (btrfs_is_shutdown(fs_info)) + return 0; + + switch (flags) { + case BTRFS_SHUTDOWN_FLAGS_LOGFLUSH: + case BTRFS_SHUTDOWN_FLAGS_DEFAULT: + ret = freeze_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + btrfs_force_shutdown(fs_info); + ret = thaw_super(fs_info->sb, FREEZE_HOLDER_KERNEL, NULL); + if (ret) + return ret; + break; + case BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH: + btrfs_force_shutdown(fs_info); + break; + } + return ret; +} +#endif + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5378,6 +5415,10 @@ long btrfs_ioctl(struct file *file, unsigned int #endif case BTRFS_IOC_SUBVOL_SYNC_WAIT: return btrfs_ioctl_subvol_sync(fs_info, argp); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + case BTRFS_IOC_SHUTDOWN: + return btrfs_ioctl_shutdown(fs_info, arg); +#endif } return -ENOTTY; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 8e710bbb688e..e8fd92789423 100644 --- 
a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -1099,6 +1099,12 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }; +/* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. */ +#define BTRFS_SHUTDOWN_FLAGS_DEFAULT 0x0 +#define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH 0x1 +#define BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH 0x2 +#define BTRFS_SHUTDOWN_FLAGS_LAST 0x3 + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -1220,6 +1226,9 @@ enum btrfs_err_code { #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ struct btrfs_ioctl_subvol_wait) +/* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */ +#define BTRFS_IOC_SHUTDOWN _IOR('X', 125, __u32) + #ifdef __cplusplus } #endif From 803e115657dd145fcf2b6481c6d224ecc270ed0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Oct 2025 10:22:05 +1030 Subject: [PATCH 014/147] btrfs: implement remove_bdev and shutdown super operation callbacks For the ->remove_bdev() callback, btrfs will: - Mark the target device as missing - Go degraded if the fs can afford it - Return error other wise Thus falls back to the shutdown callback For the ->shutdown callback, btrfs will: - Set the SHUTDOWN flag Which will reject all new incoming operations, and make all writeback to fail. The behavior is the same as the NOLOGFLUSH behavior. To support the lookup from bdev to a btrfs_device, btrfs_dev_lookup_args is enhanced to have a new @devt member. If set, we should be able to use that @devt member to uniquely locating a btrfs device. I know the shutdown can be a little overkilled, if one has a RAID1 metadata and RAID0 data, in that case one can still read data with 50% chance to got some good data. But a filesystem returning -EIO for half of the time is not really considered usable. Further it can also be as bad as the only device went missing for a single device btrfs. 
So here we go safe other than sorry when handling missing device. And the remove_bdev callback will be hidden behind experimental features for now, the reasons are: - There are not enough btrfs specific bdev removal test cases The existing test cases are all removing the only device, thus only exercises the ->shutdown() behavior. - Not yet determined what's the expected behavior Although the current auto-degrade behavior is no worse than the old behavior, it may not always be what the end users want. Before there is a concrete interface, better hide the new feature from end users. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Tested-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/super.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 2 ++ fs/btrfs/volumes.h | 5 ++++ 3 files changed, 71 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 430e7419349c..e606e11d3f57 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2430,6 +2430,66 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont return 0; } +#ifdef CONFIG_BTRFS_EXPERIMENTAL +static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_device *device; + struct btrfs_dev_lookup_args lookup_args = { .devt = bdev->bd_dev }; + bool can_rw; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + device = btrfs_find_device(fs_info->fs_devices, &lookup_args); + if (!device) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* Device not found, should not affect the running fs, just give a warning. */ + btrfs_warn(fs_info, "unable to find btrfs device for block device '%pg'", bdev); + return 0; + } + /* + * The to-be-removed device is already missing? + * + * That's weird but no special handling needed and can exit right now. 
+ */ + if (unlikely(test_and_set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_warn(fs_info, "btrfs device id %llu is already missing", device->devid); + return 0; + } + + device->fs_devices->missing_devices++; + if (test_and_clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + list_del_init(&device->dev_alloc_list); + WARN_ON(device->fs_devices->rw_devices < 1); + device->fs_devices->rw_devices--; + } + can_rw = btrfs_check_rw_degradable(fs_info, device); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + /* + * Now device is considered missing, btrfs_device_name() won't give a + * meaningful result anymore, so only output the devid. + */ + if (unlikely(!can_rw)) { + btrfs_crit(fs_info, + "btrfs device id %llu has gone missing, can not maintain read-write", + device->devid); + return -EIO; + } + btrfs_warn(fs_info, + "btrfs device id %llu has gone missing, continue as degraded", + device->devid); + btrfs_set_opt(fs_info->mount_opt, DEGRADED); + return 0; +} + +static void btrfs_shutdown(struct super_block *sb) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_force_shutdown(fs_info); +} +#endif + static const struct super_operations btrfs_super_ops = { .drop_inode = btrfs_drop_inode, .evict_inode = btrfs_evict_inode, @@ -2445,6 +2505,10 @@ static const struct super_operations btrfs_super_ops = { .unfreeze_fs = btrfs_unfreeze, .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, +#ifdef CONFIG_BTRFS_EXPERIMENTAL + .remove_bdev = btrfs_remove_bdev, + .shutdown = btrfs_shutdown, +#endif }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 65b02a93db31..928fc6a061b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6802,6 +6802,8 @@ static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, static bool dev_args_match_device(const 
struct btrfs_dev_lookup_args *args, const struct btrfs_device *device) { + if (args->devt) + return device->devt == args->devt; if (args->missing) { if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && !device->bdev) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2cbf8080eade..adbd9e6c09ff 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -662,6 +662,11 @@ struct btrfs_dev_lookup_args { u64 devid; u8 *uuid; u8 *fsid; + /* + * If devt is specified, all other members will be ignored as it is + * enough to uniquely locate a device. + */ + dev_t devt; bool missing; }; From 18de34daa7c62c830be533aace6b7c271e8e95cf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Oct 2025 16:50:02 +0100 Subject: [PATCH 015/147] btrfs: truncate ordered extent when skipping writeback past i_size While running test case btrfs/192 from fstests with support for large folios (needs CONFIG_BTRFS_EXPERIMENTAL=y) I ended up getting very sporadic btrfs check failures reporting that csum items were missing. Looking into the issue it turned out that btrfs check searches for csum items of a file extent item with a range that spans beyond the i_size of a file and we don't have any, because the kernel's writeback code skips submitting bios for ranges beyond eof. It's not expected however to find a file extent item that crosses the rounded up (by the sector size) i_size value, but there is a short time window where we can end up with a transaction commit leaving this small inconsistency between the i_size and the last file extent item. Example btrfs check output when this happens: $ btrfs check /dev/sdc Opening filesystem to check... Checking filesystem on /dev/sdc UUID: 69642c61-5efb-4367-aa31-cdfd4067f713 [1/8] checking log skipped (none written) [2/8] checking root items [3/8] checking extents [4/8] checking free space tree [5/8] checking fs roots root 5 inode 332 errors 1000, some csum missing ERROR: errors found in fs roots (...) 
Looking at a tree dump of the fs tree (root 5) for inode 332 we have: $ btrfs inspect-internal dump-tree -t 5 /dev/sdc (...) item 28 key (332 INODE_ITEM 0) itemoff 2006 itemsize 160 generation 17 transid 19 size 610969 nbytes 86016 block group 0 mode 100666 links 1 uid 0 gid 0 rdev 0 sequence 11 flags 0x0(none) atime 1759851068.391327881 (2025-10-07 16:31:08) ctime 1759851068.410098267 (2025-10-07 16:31:08) mtime 1759851068.410098267 (2025-10-07 16:31:08) otime 1759851068.391327881 (2025-10-07 16:31:08) item 29 key (332 INODE_REF 340) itemoff 1993 itemsize 13 index 2 namelen 3 name: f1f item 30 key (332 EXTENT_DATA 589824) itemoff 1940 itemsize 53 generation 19 type 1 (regular) extent data disk byte 21745664 nr 65536 extent data offset 0 nr 65536 ram 65536 extent compression 0 (none) (...) We can see that the file extent item for file offset 589824 has a length of 64K and its number of bytes is 64K. Looking at the inode item we see that its i_size is 610969 bytes which falls within the range of that file extent item [589824, 655360[. Looking into the csum tree: $ btrfs inspect-internal dump-tree /dev/sdc (...) item 15 key (EXTENT_CSUM EXTENT_CSUM 21565440) itemoff 991 itemsize 200 range start 21565440 end 21770240 length 204800 item 16 key (EXTENT_CSUM EXTENT_CSUM 1104576512) itemoff 983 itemsize 8 range start 1104576512 end 1104584704 length 8192 (..) We see that the csum item number 15 covers the first 24K of the file extent item - it ends at offset 21770240 and the extent's disk_bytenr is 21745664, so we have: 21770240 - 21745664 = 24K We see that the next csum item (number 16) is completely outside the range, so the remaining 40K of the extent doesn't have csum items in the tree. 
If we round up the i_size to the sector size, we get: round_up(610969, 4096) = 614400 If we subtract from that the file offset for the extent item we get: 614400 - 589824 = 24K So the missing 40K corresponds to the end of the file extent item's range minus the rounded up i_size: 655360 - 614400 = 40K Normally we don't expect a file extent item to span over the rounded up i_size of an inode, since when truncating, doing hole punching and other operations that trim a file extent item, the number of bytes is adjusted. There is however a short time window where the kernel can end up, temporarily, persisting an inode with an i_size that falls in the middle of the last file extent item and the file extent item was not yet trimmed (its number of bytes reduced so that it doesn't cross i_size rounded up by the sector size). The steps (in the kernel) that lead to such scenario are the following: 1) We have inode I as an empty file, no allocated extents, i_size is 0; 2) A buffered write is done for file range [589824, 655360[ (length of 64K) and the i_size is updated to 655360. Note that we got a single large folio for the range (64K); 3) A truncate operation starts that reduces the inode's i_size down to 610969 bytes. 
The truncate sets the inode's new i_size at btrfs_setsize() by calling truncate_setsize() and before calling btrfs_truncate(); 4) At btrfs_truncate() we trigger writeback for the range starting at 610304 (which is the new i_size rounded down to the sector size) and ending at (u64)-1; 5) During the writeback, at extent_write_cache_pages(), we get from the call to filemap_get_folios_tag(), the 64K folio that starts at file offset 589824 since it contains the start offset of the writeback range (610304); 6) At writepage_delalloc() we find the whole range of the folio is dirty and therefore we run delalloc for that 64K range ([589824, 655360[), reserving a 64K extent, creating an ordered extent, etc; 7) At extent_writepage_io() we submit IO only for subrange [589824, 614400[ because the inode's i_size is 610969 bytes (rounded up by sector size is 614400). There, in the while loop we intentionally skip IO beyond i_size to avoid any unnecessary work and just call btrfs_mark_ordered_io_finished() for the range [614400, 655360[ (which has a 40K length); 8) Once the IO finishes we finish the ordered extent by ending up at btrfs_finish_one_ordered(), join transaction N, insert a file extent item in the inode's subvolume tree for file offset 589824 with a number of bytes of 64K, and update the inode's delayed inode item or directly the inode item with a call to btrfs_update_inode_fallback(), which results in storing the new i_size of 610969 bytes; 9) Transaction N is committed either by the transaction kthread or some other task committed it (in response to a sync or fsync for example). At this point we have inode I persisted with an i_size of 610969 bytes and file extent item that starts at file offset 589824 and has a number of bytes of 64K, ending at an offset of 655360 which is beyond the i_size rounded up to the sector size (614400). 
--> So after a crash or power failure here, the btrfs check program reports that error about missing checksum items for this inode, as it tries to lookup for checksums covering the whole range of the extent; 10) Only after transaction N is committed that at btrfs_truncate() the call to btrfs_start_transaction() starts a new transaction, N + 1, instead of joining transaction N. And it's with transaction N + 1 that it calls btrfs_truncate_inode_items() which updates the file extent item at file offset 589824 to reduce its number of bytes from 64K down to 24K, so that the file extent item's range ends at the i_size rounded up to the sector size (614400 bytes). Fix this by truncating the ordered extent at extent_writepage_io() when we skip writeback because the current offset in the folio is beyond i_size. This ensures we don't ever persist a file extent item with a number of bytes beyond the rounded up (by sector size) value of the i_size. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 +++++++++++++++++++-- fs/btrfs/ordered-data.c | 5 +++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3804029978ea..3a57aeb3c622 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1691,13 +1691,13 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; int found_error = 0; const u64 folio_start = folio_pos(folio); + const u64 folio_end = folio_start + folio_size(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; - ASSERT(start >= folio_start && - start + len <= folio_start + folio_size(folio)); + ASSERT(start >= folio_start && start + len <= folio_end); ret = btrfs_writepage_cow_fixup(folio); if (ret == -EAGAIN) { @@ -1724,6 +1724,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode 
*inode, cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { + struct btrfs_ordered_extent *ordered; + unsigned long flags; + + ordered = btrfs_lookup_first_ordered_range(inode, cur, + folio_end - cur); + /* + * We have just run delalloc before getting here, so + * there must be an ordered extent. + */ + ASSERT(ordered != NULL); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); + ordered->truncated_len = min(ordered->truncated_len, + cur - ordered->file_offset); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + btrfs_put_ordered_extent(ordered); + btrfs_mark_ordered_io_finished(inode, folio, cur, start + len - cur, true); /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2829f20d7bb5..8a8aa6ed405b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1098,8 +1098,9 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; + unsigned long flags; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last @@ -1154,7 +1155,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } From 46a23908598f4b8e61483f04ea9f471b2affc58a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Oct 2025 17:04:03 +0100 Subject: [PATCH 016/147] btrfs: use variable for end offset in extent_writepage_io() Instead of repeating the expression "start + len" multiple times, store it in a variable and use it where needed. 
Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3a57aeb3c622..a8f75a4f8985 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1690,6 +1690,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, unsigned long range_bitmap = 0; bool submitted_io = false; int found_error = 0; + const u64 end = start + len; const u64 folio_start = folio_pos(folio); const u64 folio_end = folio_start + folio_size(folio); const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); @@ -1697,7 +1698,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, int bit; int ret = 0; - ASSERT(start >= folio_start && start + len <= folio_end); + ASSERT(start >= folio_start && end <= folio_end); ret = btrfs_writepage_cow_fixup(folio); if (ret == -EAGAIN) { @@ -1713,7 +1714,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, return ret; } - for (cur = start; cur < start + len; cur += fs_info->sectorsize) + for (cur = start; cur < end; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, blocks_per_folio); @@ -1742,7 +1743,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, btrfs_put_ordered_extent(ordered); btrfs_mark_ordered_io_finished(inode, folio, cur, - start + len - cur, true); + end - cur, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1751,8 +1752,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. 
*/ - btrfs_folio_clear_dirty(fs_info, folio, cur, - start + len - cur); + btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur); break; } ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); From 74ca34f79e53657760c3b09abe1bd593b849ca8c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 10 Oct 2025 17:17:10 +0100 Subject: [PATCH 017/147] btrfs: split assertion into two in extent_writepage_io() If the assertion fails we don't get to know which of the two expressions failed and neither the values used in each expression. So split the assertion into two, each for a single expression, so that if any is triggered we see a line number reported in a stack trace that points to which expression failed. Also make the assertions use the verbose mode to print the values involved in the computations. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a8f75a4f8985..c741de164535 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1698,7 +1698,9 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, int bit; int ret = 0; - ASSERT(start >= folio_start && end <= folio_end); + ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start); + ASSERT(end <= folio_end, "start=%llu len=%u folio_start=%llu folio_size=%zu", + start, len, folio_start, folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); if (ret == -EAGAIN) { From 28fe58ce6a20aa674076645bcfc1be126a12ed4b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 12 Oct 2025 10:43:02 +0100 Subject: [PATCH 018/147] btrfs: add unlikely to unexpected error case in extent_writepages() We don't expect to hit errors and log the error message, so add the unlikely annotation to make it clear and to hint the compiler that it may reorganize 
code to be more efficient. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c741de164535..cb680cdeb77d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1874,7 +1874,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; - if (ret < 0) + if (unlikely(ret < 0)) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), From b917a94a4c085a307069790a0527f9492fc70700 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 12 Oct 2025 10:26:40 +0100 Subject: [PATCH 019/147] btrfs: consistently round up or down i_size in btrfs_truncate() We're using different ways to round down the i_size by sector size, one with a bitwise and with a negated mask and another with ALIGN_DOWN(), and using ALIGN() to round up. Replace these uses with the round_down() and round_up() macros which have names that make it clear the direction of the rounding (unlike the ALIGN() macro) and getting rid of the bitwise and, negated mask and local variable for the mask. 
Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 15131873f73d..865a288e0e29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7662,12 +7662,12 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; - u64 mask = fs_info->sectorsize - 1; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(inode, - inode->vfs_inode.i_size & (~mask), + round_down(inode->vfs_inode.i_size, + fs_info->sectorsize), (u64)-1); if (ret) return ret; @@ -7733,7 +7733,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; const u64 new_size = inode->vfs_inode.i_size; - const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); + const u64 lock_start = round_down(new_size, fs_info->sectorsize); control.new_size = new_size; btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); @@ -7743,7 +7743,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) * block of the extent just the way it is. */ btrfs_drop_extent_map_range(inode, - ALIGN(new_size, fs_info->sectorsize), + round_up(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); From 3b7c0c20b72003238ea3e17e60e357513be8edaf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 12 Oct 2025 10:39:08 +0100 Subject: [PATCH 020/147] btrfs: avoid multiple i_size rounding in btrfs_truncate() We have the inode locked so no one can concurrently change its i_size and neither do we change it ourselves, so there's no point in keep rounding it in the while loop and setting it up in the control structure. 
That only causes confusion when reading the code. So move all the i_size setup and rounding out of the loop and assert the inode is locked. Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 865a288e0e29..81cb8a86324d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7656,6 +7656,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, + .new_size = inode->vfs_inode.i_size, }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7663,12 +7664,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret; struct btrfs_trans_handle *trans; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); + const u64 lock_start = round_down(inode->vfs_inode.i_size, fs_info->sectorsize); + const u64 i_size_up = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); + + /* Our inode is locked and the i_size can't be changed concurrently. 
*/ + btrfs_assert_inode_locked(inode); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, - round_down(inode->vfs_inode.i_size, - fs_info->sectorsize), - (u64)-1); + ret = btrfs_wait_ordered_range(inode, lock_start, (u64)-1); if (ret) return ret; } @@ -7732,19 +7735,14 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->vfs_inode.i_size; - const u64 lock_start = round_down(new_size, fs_info->sectorsize); - control.new_size = new_size; btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ - btrfs_drop_extent_map_range(inode, - round_up(new_size, fs_info->sectorsize), - (u64)-1, false); + btrfs_drop_extent_map_range(inode, i_size_up, (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); From f1ae05b8eaf5b2049ef0f6bfff4376f793adeb83 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 12 Oct 2025 17:48:27 +0100 Subject: [PATCH 021/147] btrfs: avoid repeated computations in btrfs_mark_ordered_io_finished() We're computing a few values several times: 1) The current ordered extent's end offset inside the while loop, we have computed it and stored it in the 'entry_end' variable but then we compute it again later as the first argument to the min() macro; 2) The end file offset, open coded 3 times; 3) The current length (stored in variable 'len') computed 2 times, one inside an assertion and the other when assigning to the 'len' variable. So use existing variables and add new ones to prevent repeating these expressions and reduce the source code. We were also subtracting one from the result of min() macro call and then adding 1 back in the next line, making both operations pointless. So just remove the decrement and increment by 1. 
This also reduces very slightly the object code. Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1916576 161679 15592 2093847 1ff317 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1916556 161679 15592 2093827 1ff303 fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8a8aa6ed405b..dfda952dcf7b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -483,16 +483,15 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct btrfs_ordered_extent *entry = NULL; unsigned long flags; u64 cur = file_offset; + const u64 end = file_offset + num_bytes; - trace_btrfs_writepage_end_io_hook(inode, file_offset, - file_offset + num_bytes - 1, - uptodate); + trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate); spin_lock_irqsave(&inode->ordered_tree_lock, flags); - while (cur < file_offset + num_bytes) { + while (cur < end) { u64 entry_end; - u64 end; - u32 len; + u64 this_end; + u64 len; node = ordered_tree_search(inode, cur); /* No ordered extents at all */ @@ -535,10 +534,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, * | * cur */ - end = min(entry->file_offset + entry->num_bytes, - file_offset + num_bytes) - 1; - ASSERT(end + 1 - cur < U32_MAX); - len = end + 1 - cur; + this_end = min(entry_end, end); + len = this_end - cur; + ASSERT(len < U32_MAX); if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); From e3df6408b13a75cf73e543e53453f28261874c6f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 13:57:09 +0100 Subject: [PATCH 022/147] btrfs: remove fs_info argument from btrfs_try_granting_tickets() We don't 
need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 ++-- fs/btrfs/block-rsv.c | 2 +- fs/btrfs/space-info.c | 14 +++++++------- fs/btrfs/space-info.h | 5 ++--- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 8bf501fbcc0b..035b04e7658d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3836,7 +3836,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, * that happens. */ if (num_bytes < ram_bytes) - btrfs_try_granting_tickets(cache->fs_info, space_info); + btrfs_try_granting_tickets(space_info); out: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -3874,7 +3874,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); - btrfs_try_granting_tickets(cache->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 5ad6de738aee..75cd35570a28 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -387,7 +387,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = block_rsv->reserved - block_rsv->size; btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes); block_rsv->reserved = block_rsv->size; - btrfs_try_granting_tickets(fs_info, sinfo); + btrfs_try_granting_tickets(sinfo); } block_rsv->full = (block_rsv->reserved == block_rsv->size); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 85c466c85910..869641e42f2f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -373,7 +373,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, 
btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) space_info->full = false; - btrfs_try_granting_tickets(info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); block_group->space_info = space_info; @@ -523,9 +523,9 @@ static void remove_ticket(struct btrfs_space_info *space_info, * This is for space we already have accounted in space_info->bytes_may_use, so * basically when we're returning space from block_rsv's. */ -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; @@ -1124,7 +1124,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, * the list. */ if (!aborted) - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); } return (tickets_id != space_info->tickets_id); } @@ -1544,7 +1544,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, * ticket in front of a smaller ticket that can now be satisfied with * the available space. */ - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -1572,7 +1572,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, ticket->error = -ENOSPC; remove_ticket(space_info, ticket); - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -2195,5 +2195,5 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) grant: /* Add to any tickets we may have. 
*/ if (len) - btrfs_try_granting_tickets(fs_info, space_info); + btrfs_try_granting_tickets(space_info); } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a846f63585c9..596a1e923ddf 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -283,8 +283,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); -void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); +void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); @@ -295,7 +294,7 @@ static inline void btrfs_space_info_free_bytes_may_use( { spin_lock(&space_info->lock); btrfs_space_info_update_bytes_may_use(space_info, -num_bytes); - btrfs_try_granting_tickets(space_info->fs_info, space_info); + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, From f63b36686b721a21f83e170c247d3910d5a8b800 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 13:58:55 +0100 Subject: [PATCH 023/147] btrfs: remove fs_info argument from priority_reclaim_data_space() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 869641e42f2f..0925083cd38a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1548,10 +1548,11 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void priority_reclaim_data_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + struct btrfs_fs_info *fs_info = space_info->fs_info; + spin_lock(&space_info->lock); /* We could have been granted before we got here. */ @@ -1647,7 +1648,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, ARRAY_SIZE(evict_flush_states)); break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: - priority_reclaim_data_space(fs_info, space_info, ticket); + priority_reclaim_data_space(space_info, ticket); break; default: ASSERT(0); From cf3ae29caf1657a8921396163f69fa36d1c8edac Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 14:01:18 +0100 Subject: [PATCH 024/147] btrfs: remove fs_info argument from priority_reclaim_metadata_space() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0925083cd38a..bd902e38929e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1489,12 +1489,12 @@ static const enum btrfs_flush_state evict_flush_states[] = { RESET_ZONES, }; -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, - const enum btrfs_flush_state *states, - int states_nr) +static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; int flush_state = 0; @@ -1638,12 +1638,12 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, wait_reserve_ticket(space_info, ticket); break; case BTRFS_RESERVE_FLUSH_LIMIT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, priority_flush_states, ARRAY_SIZE(priority_flush_states)); break; case BTRFS_RESERVE_FLUSH_EVICT: - priority_reclaim_metadata_space(fs_info, space_info, ticket, + priority_reclaim_metadata_space(space_info, ticket, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; From 1b809e305574ea59c0fa414cb129d6625d280944 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 14:02:41 +0100 Subject: [PATCH 025/147] btrfs: remove fs_info argument from maybe_fail_all_tickets() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bd902e38929e..8d3459b9f8dd 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1071,7 +1071,6 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, /* * We've exhausted our flushing, start failing tickets. * - * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * * We call this when we've exhausted our flushing ability and haven't made @@ -1084,9 +1083,9 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, * other tickets, or if it stumbles across a ticket that was smaller than the * first ticket. */ -static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; const bool aborted = BTRFS_FS_ERROR(fs_info); @@ -1197,7 +1196,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) if (flush_state > final_state) { commit_cycles++; if (commit_cycles > 2) { - if (maybe_fail_all_tickets(fs_info, space_info)) { + if (maybe_fail_all_tickets(space_info)) { flush_state = FLUSH_DELAYED_ITEMS_NR; commit_cycles--; } else { @@ -1425,7 +1424,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) if (flush_state >= ARRAY_SIZE(data_flush_states)) { if (space_info->full) { - if (maybe_fail_all_tickets(fs_info, space_info)) + if (maybe_fail_all_tickets(space_info)) flush_state = 0; else space_info->flush = false; @@ -1443,7 +1442,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info 
*space_info) return; aborted_fs: - maybe_fail_all_tickets(fs_info, space_info); + maybe_fail_all_tickets(space_info); space_info->flush = false; spin_unlock(&space_info->lock); } From 302b4b69c4eeb7be755ef8a712350c8aa5d6f072 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 17:39:21 +0100 Subject: [PATCH 026/147] btrfs: remove fs_info argument from calc_available_free_space() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8d3459b9f8dd..97e5007c78ed 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -421,10 +421,10 @@ static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) return min_t(u64, data_chunk_size, SZ_1G); } -static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, - enum btrfs_reserve_flush_enum flush) +static u64 calc_available_free_space(const struct btrfs_space_info *space_info, + enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 profile; u64 avail; u64 data_chunk_size; @@ -502,7 +502,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + avail = calc_available_free_space(space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -909,8 +909,7 @@ static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, lockdep_assert_held(&space_info->lock); - avail = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + avail = 
calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = btrfs_space_info_used(space_info, true); /* @@ -992,8 +991,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * much delalloc we need for the background flusher to kick in. */ - thresh = calc_available_free_space(fs_info, space_info, - BTRFS_RESERVE_FLUSH_ALL); + thresh = calc_available_free_space(space_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; if (used < space_info->total_bytes) From 78a77f4da4ba2162ab7f82246ff0eef0236cfe36 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 17:44:34 +0100 Subject: [PATCH 027/147] btrfs: remove fs_info argument from btrfs_can_overcommit() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +-- fs/btrfs/space-info.c | 9 +++------ fs/btrfs/space-info.h | 3 +-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 035b04e7658d..4eceba0ff4e0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1403,8 +1403,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, bool force) * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of * leeway to allow us to mark this block group as read only. 
*/ - if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, - BTRFS_RESERVE_NO_FLUSH)) + if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH)) ret = 0; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 97e5007c78ed..599d6cce9e77 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -490,8 +490,7 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, return avail; } -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, +int btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { u64 avail; @@ -525,7 +524,6 @@ static void remove_ticket(struct btrfs_space_info *space_info, */ void btrfs_try_granting_tickets(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info = space_info->fs_info; struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; @@ -541,8 +539,7 @@ again: /* Check and see if our ticket can be satisfied now. 
*/ if ((used + ticket->bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, ticket->bytes, - flush)) { + btrfs_can_overcommit(space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; @@ -1775,7 +1772,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_can_overcommit(space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 596a1e923ddf..737e874a8c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -284,8 +284,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); -int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info, u64 bytes, +int btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( From e96059c9d7feb36daa4d2062b5a137a0f5c7de9c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 17:53:20 +0100 Subject: [PATCH 028/147] btrfs: remove fs_info argument from btrfs_dump_space_info() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++++---- fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/space-info.c | 18 +++++++++--------- fs/btrfs/space-info.h | 3 +-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 4eceba0ff4e0..d7451400cc22 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1424,7 +1424,7 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false); + btrfs_dump_space_info(cache->space_info, 0, false); } return ret; } @@ -4314,7 +4314,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, bytes, type); - btrfs_dump_space_info(fs_info, info, 0, false); + btrfs_dump_space_info(info, 0, false); } if (left < bytes) { @@ -4459,7 +4459,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) * indicates a real bug if this happens. 
*/ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); /* * If there was a failure to cleanup a log tree, very likely due to an @@ -4470,7 +4470,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, false); + btrfs_dump_space_info(space_info, 0, false); } WARN_ON(space_info->reclaim_size > 0); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc4ca98c3780..d1e75da97f58 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4735,8 +4735,7 @@ again: "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) - btrfs_dump_space_info(fs_info, sinfo, - num_bytes, 1); + btrfs_dump_space_info(sinfo, num_bytes, 1); } } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 599d6cce9e77..2e9417996970 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -591,9 +591,9 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); } -static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *info) +static void __btrfs_dump_space_info(const struct btrfs_space_info *info) { + const struct btrfs_fs_info *fs_info = info->fs_info; const char *flag_str = space_info_flag_to_str(info); lockdep_assert_held(&info->lock); @@ -610,16 +610,16 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, info->bytes_readonly, info->bytes_zone_unusable); } -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool 
dump_block_groups) { + struct btrfs_fs_info *fs_info = info->fs_info; struct btrfs_block_group *cache; u64 total_avail = 0; int index = 0; spin_lock(&info->lock); - __btrfs_dump_space_info(fs_info, info); + __btrfs_dump_space_info(info); dump_global_block_rsv(fs_info); spin_unlock(&info->lock); @@ -1089,7 +1089,7 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); } while (!list_empty(&space_info->tickets) && @@ -1882,7 +1882,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, orig_bytes, false); + btrfs_dump_space_info(space_info, orig_bytes, false); } return ret; } @@ -1913,7 +1913,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, space_info, bytes, false); + btrfs_dump_space_info(space_info, bytes, false); } return ret; } @@ -1926,7 +1926,7 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) btrfs_info(fs_info, "dumping space info:"); list_for_each_entry(space_info, &fs_info->space_info, list) { spin_lock(&space_info->lock); - __btrfs_dump_space_info(fs_info, space_info); + __btrfs_dump_space_info(space_info); spin_unlock(&space_info->lock); } dump_global_block_rsv(fs_info); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 737e874a8c34..a88cf71b3d3a 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -276,8 +276,7 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 __pure btrfs_space_info_used(const struct btrfs_space_info 
*s_info, bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, +void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, From 3ee124653641785acdbaf5b14fa14e8c8810c621 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 17:58:07 +0100 Subject: [PATCH 029/147] btrfs: remove fs_info argument from shrink_delalloc() and flush_space() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2e9417996970..66f99645f9ef 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -667,11 +667,11 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static void shrink_delalloc(struct btrfs_space_info *space_info, u64 to_reclaim, bool wait_ordered, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 ordered_bytes; @@ -798,10 +798,10 @@ skip_async: * and may fail for various reasons. The caller is supposed to examine the * state of @space_info to detect the outcome. 
*/ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - enum btrfs_flush_state state, bool for_preempt) +static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes, + enum btrfs_flush_state state, bool for_preempt) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; int nr; @@ -830,7 +830,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, case FLUSH_DELALLOC_FULL: if (state == FLUSH_DELALLOC_FULL) num_bytes = U64_MAX; - shrink_delalloc(fs_info, space_info, num_bytes, + shrink_delalloc(space_info, num_bytes, state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: @@ -1149,7 +1149,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - flush_space(fs_info, space_info, to_reclaim, flush_state, false); + flush_space(space_info, to_reclaim, flush_state, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = false; @@ -1312,7 +1312,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) to_reclaim >>= 2; if (!to_reclaim) to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1); - flush_space(fs_info, space_info, to_reclaim, flush, true); + flush_space(space_info, to_reclaim, flush, true); cond_resched(); spin_lock(&space_info->lock); } @@ -1385,7 +1385,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) spin_unlock(&space_info->lock); while (!space_info->full) { - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = false; @@ -1401,7 +1401,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) } while (flush_state < 
ARRAY_SIZE(data_flush_states)) { - flush_space(fs_info, space_info, U64_MAX, + flush_space(space_info, U64_MAX, data_flush_states[flush_state], false); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { @@ -1507,8 +1507,7 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, while (flush_state < states_nr) { spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, to_reclaim, states[flush_state], - false); + flush_space(space_info, to_reclaim, states[flush_state], false); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -1545,8 +1544,6 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, static void priority_reclaim_data_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { - struct btrfs_fs_info *fs_info = space_info->fs_info; - spin_lock(&space_info->lock); /* We could have been granted before we got here. */ @@ -1557,7 +1554,7 @@ static void priority_reclaim_data_space(struct btrfs_space_info *space_info, while (!space_info->full) { spin_unlock(&space_info->lock); - flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); + flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); From 4199eb2761344dac7a600ce893967d9314842252 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:01:55 +0100 Subject: [PATCH 030/147] btrfs: remove fs_info argument from btrfs_calc_reclaim_metadata_size() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 66f99645f9ef..68bf62551f20 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -897,8 +897,7 @@ static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes, return; } -static u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space_info) { u64 used; u64 avail; @@ -1138,7 +1137,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) final_state = COMMIT_TRANS; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (!to_reclaim) { space_info->flush = false; spin_unlock(&space_info->lock); @@ -1156,8 +1155,7 @@ static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) spin_unlock(&space_info->lock); return; } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -1493,7 +1491,7 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, int flush_state = 0; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still From d77b22de56776103511a50f92443ba83a70b2b32 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 
2025 18:09:16 +0100 Subject: [PATCH 031/147] btrfs: remove fs_info argument from need_preemptive_reclaim() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 68bf62551f20..2446fe1310f6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -920,9 +920,9 @@ static u64 btrfs_calc_reclaim_metadata_size(const struct btrfs_space_info *space return to_reclaim; } -static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *space_info) +static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv); u64 ordered, delalloc; u64 thresh; @@ -1249,7 +1249,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) trans_rsv = &fs_info->trans_block_rsv; spin_lock(&space_info->lock); - while (need_preemptive_reclaim(fs_info, space_info)) { + while (need_preemptive_reclaim(space_info)) { enum btrfs_flush_state flush; u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; @@ -1834,7 +1834,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && !work_busy(&fs_info->preempt_reclaim_work) && - need_preemptive_reclaim(fs_info, space_info)) { + need_preemptive_reclaim(space_info)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); queue_work(system_dfl_wq, From ddeac2a12b114a5def0a4c23961d2c0938556472 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:10:02 +0100 Subject: [PATCH 
032/147] btrfs: remove fs_info argument from steal_from_global_rsv() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2446fe1310f6..4dc2ae5d79a8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1031,10 +1031,10 @@ static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } -static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static bool steal_from_global_rsv(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; @@ -1096,7 +1096,7 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket)) + if (!aborted && steal_from_global_rsv(space_info, ticket)) return true; if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) @@ -1525,7 +1525,7 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, if (BTRFS_FS_ERROR(fs_info)) { ticket->error = BTRFS_FS_ERROR(fs_info); remove_ticket(space_info, ticket); - } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + } else if (!steal_from_global_rsv(space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } From e182eca6ed2db481f058fc82f9b9977fac466d62 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:13:49 +0100 Subject: [PATCH 
033/147] btrfs: remove fs_info argument from handle_reserve_ticket() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4dc2ae5d79a8..ad7106bd669c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1602,7 +1602,6 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, /* * Do the appropriate flushing and waiting for a ticket. * - * @fs_info: the filesystem * @space_info: space info for the reservation * @ticket: ticket for the reservation * @start_ns: timestamp when the reservation started @@ -1612,8 +1611,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * This does the work of figuring out how to flush for the ticket, waiting for * the reservation, and returning the appropriate error if there is one. */ -static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static int handle_reserve_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket, u64 start_ns, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) @@ -1653,8 +1651,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, * space wasn't reserved at all). 
*/ ASSERT(!(ticket->bytes == 0 && ticket->error)); - trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes, - start_ns, flush, ticket->error); + trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags, + orig_bytes, start_ns, flush, ticket->error); return ret; } @@ -1845,8 +1843,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, if (!ret || !can_ticket(flush)) return ret; - return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, - orig_bytes, flush); + return handle_reserve_ticket(space_info, &ticket, start_ns, orig_bytes, flush); } /* From 5495cbe920abb53ff126345b919529a32fa3979a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:14:39 +0100 Subject: [PATCH 034/147] btrfs: remove fs_info argument from maybe_clamp_preempt() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ad7106bd669c..bf542f154603 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1666,9 +1666,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); } -static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +static inline void maybe_clamp_preempt(struct btrfs_space_info *space_info) { + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); @@ -1811,7 +1811,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * preemptive flushing in order to keep up with * the workload. 
*/ - maybe_clamp_preempt(fs_info, space_info); + maybe_clamp_preempt(space_info); space_info->flush = true; trace_btrfs_trigger_flush(fs_info, From 09d0f285310ab46457a31ab942e9a1157dd70c38 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:19:46 +0100 Subject: [PATCH 035/147] btrfs: fix parameter documentation for btrfs_reserve_data_bytes() We don't have a fs_info argument anymore since commit 5d39fda880be ("btrfs: pass btrfs_space_info to btrfs_reserve_data_bytes()"), it was replaced by a space_info argument. So update the documentation. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bf542f154603..f01d083a444c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1882,7 +1882,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, /* * Try to reserve data bytes for an allocation. * - * @fs_info: the filesystem + * @space_info: the space_info we're allocating for * @bytes: number of bytes we need * @flush: how we are allowed to flush * From 30b87a23196c18df70851ea5b021dd8b8fe1ab8e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:23:27 +0100 Subject: [PATCH 036/147] btrfs: remove fs_info argument from __reserve_bytes() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f01d083a444c..30f1c2d23fc6 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1703,7 +1703,6 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) /* * Try to reserve bytes from the block_rsv's space. * - * @fs_info: the filesystem * @space_info: space info we want to allocate from * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1715,10 +1714,10 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 orig_bytes, +static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct work_struct *async_work; struct reserve_ticket ticket; u64 start_ns = 0; @@ -1868,7 +1867,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, { int ret; - ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); + ret = __reserve_bytes(space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, orig_bytes, 1); @@ -1900,7 +1899,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, space_info, bytes, flush); + ret = __reserve_bytes(space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, 
"space_info:enospc", space_info->flags, bytes, 1); From a1359d06d7878db4ac28d9c5134bc9771e56833d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 13 Oct 2025 18:27:16 +0100 Subject: [PATCH 037/147] btrfs: remove fs_info argument from btrfs_reserve_metadata_bytes() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 12 +++++------- fs/btrfs/delalloc-space.c | 4 ++-- fs/btrfs/delayed-ref.c | 2 +- fs/btrfs/space-info.c | 6 +++--- fs/btrfs/space-info.h | 3 +-- fs/btrfs/transaction.c | 4 ++-- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 75cd35570a28..96cf7a162987 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -218,8 +218,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -259,8 +258,7 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -530,8 +528,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - blocksize, BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, + BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -552,7 +550,7 @@ try_reserve: * one last time to force a reservation 
if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 288e1776c02d..0970799d0aa4 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, - meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(block_rsv->space_info, meta_reserve, + flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f8fc26272f76..e8bc37453336 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -228,7 +228,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush); if (ret) return ret; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 30f1c2d23fc6..9ced89678953 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1848,7 +1848,6 @@ static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, /* * Try to reserve metadata bytes from the block_rsv's space. * - * @fs_info: the filesystem * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation @@ -1860,8 +1859,7 @@ static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, * regain reservations will be made and this will fail if there is not enough * space already. 
*/ -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { @@ -1869,6 +1867,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, ret = __reserve_bytes(space_info, orig_bytes, flush); if (ret == -ENOSPC) { + struct btrfs_fs_info *fs_info = space_info->fs_info; + trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, orig_bytes, 1); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a88cf71b3d3a..2fad2e4c2252 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -278,8 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups); -int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 89ae0c7a610a..6607e354eae5 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -575,7 +575,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. 
*/ - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); /* * If we are an emergency flush, which can steal from the global block @@ -585,7 +585,7 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; - ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + ret = btrfs_reserve_metadata_bytes(si, bytes, flush); } return ret; From 7fc35cc559cb64221a7fb1d2cf48cda8fd31fc9e Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Sat, 4 Oct 2025 22:31:09 +0800 Subject: [PATCH 038/147] btrfs: more trivial BTRFS_PATH_AUTO_FREE conversions Convert more of the trivial pattern for the auto freeing of btrfs_path with goto -> return conversions where applicable. Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/uuid-tree.c | 120 ++++++++++++--------------------- fs/btrfs/verity.c | 29 +++----- fs/btrfs/volumes.c | 153 ++++++++++++++++--------------------------- fs/btrfs/xattr.c | 36 ++++------ 4 files changed, 119 insertions(+), 219 deletions(-) diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 17b5e81123a1..e3a1310fa7d5 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -27,32 +27,26 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, u8 type, u64 subid) { int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *eb; int slot; u32 item_size; unsigned long offset; struct btrfs_key key; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -ENOENT; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -ENOENT; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; btrfs_uuid_to_key(uuid, type, &key); ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 
0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -64,7 +58,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); - goto out; + return ret; } while (item_size) { __le64 data; @@ -78,8 +72,6 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, item_size -= sizeof(data); } -out: - btrfs_free_path(path); return ret; } @@ -89,7 +81,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -100,18 +92,14 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ if (ret != -ENOENT) return ret; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, uuid_root, path, &key, sizeof(subid_le)); @@ -134,15 +122,12 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, key.objectid, key.offset, type); - goto out; + return ret; } - ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 type, @@ -151,7 +136,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 struct btrfs_fs_info 
*fs_info = trans->fs_info; struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; int slot; @@ -161,29 +146,23 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 unsigned long move_src; unsigned long move_len; - if (WARN_ON_ONCE(!uuid_root)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON_ONCE(!uuid_root)) + return -EINVAL; btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for uuid item!", ret); - goto out; - } - if (ret > 0) { - ret = -ENOENT; - goto out; + return ret; } + if (ret > 0) + return -ENOENT; eb = path->nodes[0]; slot = path->slots[0]; @@ -192,8 +171,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 if (!IS_ALIGNED(item_size, sizeof(u64))) { btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); - ret = -ENOENT; - goto out; + return -ENOENT; } while (item_size) { __le64 read_subid; @@ -205,16 +183,12 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 item_size -= sizeof(read_subid); } - if (!item_size) { - ret = -ENOENT; - goto out; - } + if (!item_size) + return -ENOENT; item_size = btrfs_item_size(eb, slot); - if (item_size == sizeof(subid)) { - ret = btrfs_del_item(trans, uuid_root, path); - goto out; - } + if (item_size == sizeof(subid)) + return btrfs_del_item(trans, uuid_root, path); move_dst = offset; move_src = offset + sizeof(subid); @@ -222,9 +196,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, const u8 *uuid, u8 memmove_extent_buffer(eb, move_dst, move_src, move_len); btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); -out: - btrfs_free_path(path); - return 
ret; + return 0; } static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, @@ -293,7 +265,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->uuid_root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret = 0; struct extent_buffer *leaf; int slot; @@ -301,10 +273,8 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) unsigned long offset; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = 0; @@ -312,17 +282,15 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info) again_search_slot: ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); - if (ret) { - if (ret > 0) - ret = 0; - goto out; - } + if (ret < 0) + return ret; + if (ret > 0) + return 0; while (1) { - if (btrfs_fs_closing(fs_info)) { - ret = -EINTR; - goto out; - } + if (btrfs_fs_closing(fs_info)) + return -EINTR; + cond_resched(); leaf = path->nodes[0]; slot = path->slots[0]; @@ -353,7 +321,7 @@ again_search_slot: ret = btrfs_check_uuid_tree_entry(fs_info, uuid, key.type, subid_cpu); if (ret < 0) - goto out; + return ret; if (ret > 0) { btrfs_release_path(path); ret = btrfs_uuid_iter_rem(root, uuid, key.type, @@ -369,7 +337,7 @@ again_search_slot: goto again_search_slot; } if (ret < 0 && ret != -ENOENT) - goto out; + return ret; key.offset++; goto again_search_slot; } @@ -386,8 +354,6 @@ skip: break; } -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 46bd8ca58670..16f5580cba55 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -109,7 +109,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) { struct btrfs_trans_handle *trans; struct btrfs_root *root = inode->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int count = 0; int ret; @@ -121,10 +121,8 @@ static int 
drop_verity_items(struct btrfs_inode *inode, u8 key_type) while (1) { /* 1 for the item being dropped */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); /* * Walk backwards through all the items until we find one that @@ -143,7 +141,7 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) path->slots[0]--; } else if (ret < 0) { btrfs_end_transaction(trans); - goto out; + return ret; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -161,17 +159,14 @@ static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) ret = btrfs_del_items(trans, root, path, path->slots[0], 1); if (ret) { btrfs_end_transaction(trans); - goto out; + return ret; } count++; btrfs_release_path(path); btrfs_end_transaction(trans); } - ret = count; btrfs_end_transaction(trans); -out: - btrfs_free_path(path); - return ret; + return count; } /* @@ -217,7 +212,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, const char *src, u64 len) { struct btrfs_trans_handle *trans; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -233,10 +228,8 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, while (len > 0) { /* 1 for the new item being inserted */ trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); key.objectid = btrfs_ino(inode); key.type = key_type; @@ -267,7 +260,6 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, btrfs_end_transaction(trans); } - btrfs_free_path(path); return ret; } @@ -296,7 +288,7 @@ static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, char *dest, u64 
len, struct folio *dest_folio) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_key key; @@ -404,7 +396,6 @@ static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, } } out: - btrfs_free_path(path); if (!ret) ret = copied; return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 928fc6a061b6..45d89b12025b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1681,7 +1681,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u64 search_start; u64 hole_size; u64 max_hole_start; @@ -1812,7 +1812,6 @@ next: "max_hole_start=%llu max_hole_size=%llu search_end=%llu", max_hole_start, max_hole_size, search_end); out: - btrfs_free_path(path); *start = max_hole_start; if (len) *len = max_hole_size; @@ -1826,7 +1825,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; @@ -1845,7 +1844,7 @@ again: ret = btrfs_previous_item(root, path, key.objectid, BTRFS_DEV_EXTENT_KEY); if (ret) - goto out; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); extent = btrfs_item_ptr(leaf, path->slots[0], @@ -1860,7 +1859,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - goto out; + return ret; } *dev_extent_len = btrfs_dev_extent_length(leaf, extent); @@ -1868,8 +1867,6 @@ again: ret = btrfs_del_item(trans, root, path); if (ret == 0) set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); -out: - btrfs_free_path(path); return 
ret; } @@ -1897,7 +1894,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, int ret; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -1909,13 +1906,12 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (unlikely(ret == 0)) { /* Corruption */ btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } ret = btrfs_previous_item(fs_info->chunk_root, path, @@ -1928,10 +1924,7 @@ static noinline int find_next_devid(struct btrfs_fs_info *fs_info, path->slots[0]); *devid_ret = found_key.offset + 1; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -1942,7 +1935,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; @@ -1961,7 +1954,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, &key, sizeof(*dev_item)); btrfs_trans_release_chunk_metadata(trans); if (ret) - goto out; + return ret; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); @@ -1987,10 +1980,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -2017,7 +2007,7 @@ static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = device->fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -2031,16 +2021,12 @@ static int 
btrfs_rm_dev_item(struct btrfs_trans_handle *trans, btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); btrfs_trans_release_chunk_metadata(trans); - if (ret) { - if (ret > 0) - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; + if (ret < 0) + return ret; - ret = btrfs_del_item(trans, root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, root, path); } /* @@ -2626,7 +2612,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; struct btrfs_device *device; @@ -2648,7 +2634,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) ret = btrfs_search_slot(trans, root, &key, path, 0, 1); btrfs_trans_release_chunk_metadata(trans); if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; next_slot: @@ -2657,7 +2643,7 @@ next_slot: if (ret > 0) break; if (ret < 0) - goto error; + return ret; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); btrfs_release_path(path); @@ -2688,10 +2674,7 @@ next_slot: path->slots[0]++; goto next_slot; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) @@ -2946,7 +2929,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) { int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; @@ -2962,12 +2945,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) - goto out; + return ret; - if 
(ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); @@ -2981,8 +2962,6 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); -out: - btrfs_free_path(path); return ret; } @@ -3035,7 +3014,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; int ret; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -3048,23 +3027,21 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; - else if (unlikely(ret > 0)) { /* Logic error or corruption */ + return ret; + if (unlikely(ret > 0)) { + /* Logic error or corruption */ btrfs_err(fs_info, "failed to lookup chunk %llu when freeing", chunk_offset); btrfs_abort_transaction(trans, -ENOENT); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = btrfs_del_item(trans, root, path); if (unlikely(ret < 0)) { btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } -out: - btrfs_free_path(path); return ret; } @@ -3501,7 +3478,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) { struct btrfs_root *chunk_root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_chunk *chunk; struct btrfs_key key; @@ -3525,7 +3502,7 @@ again: ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { mutex_unlock(&fs_info->reclaim_bgs_lock); - goto 
error; + return ret; } if (unlikely(ret == 0)) { /* @@ -3535,9 +3512,8 @@ again: * offset (one less than the previous one, wrong * alignment and size). */ - ret = -EUCLEAN; mutex_unlock(&fs_info->reclaim_bgs_lock); - goto error; + return -EUCLEAN; } ret = btrfs_previous_item(chunk_root, path, key.objectid, @@ -3545,7 +3521,7 @@ again: if (ret) mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) - goto error; + return ret; if (ret > 0) break; @@ -3579,8 +3555,6 @@ again: } else if (WARN_ON(failed && retried)) { ret = -ENOSPC; } -error: - btrfs_free_path(path); return ret; } @@ -4709,7 +4683,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) struct btrfs_balance_control *bctl; struct btrfs_balance_item *item; struct btrfs_disk_balance_args disk_bargs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4724,17 +4698,14 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret > 0) { /* ret = -ENOENT; */ - ret = 0; - goto out; + return 0; } bctl = kzalloc(sizeof(*bctl), GFP_NOFS); - if (!bctl) { - ret = -ENOMEM; - goto out; - } + if (!bctl) + return -ENOMEM; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); @@ -4771,8 +4742,6 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) fs_info->balance_ctl = bctl; spin_unlock(&fs_info->balance_lock); mutex_unlock(&fs_info->balance_mutex); -out: - btrfs_free_path(path); return ret; } @@ -7452,7 +7421,7 @@ static void readahead_tree_node_children(struct extent_buffer *node) int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; struct btrfs_key found_key; @@ -7569,8 +7538,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info 
*fs_info) ret = 0; error: mutex_unlock(&uuid_mutex); - - btrfs_free_path(path); return ret; } @@ -7670,7 +7637,7 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int ret = 0; path = btrfs_alloc_path(); @@ -7692,8 +7659,6 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) } out: mutex_unlock(&fs_devices->device_list_mutex); - - btrfs_free_path(path); return ret; } @@ -7702,7 +7667,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_stats_item *ptr; @@ -7721,7 +7686,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "error %d while searching for dev_stats item for device %s", ret, btrfs_dev_name(device)); - goto out; + return ret; } if (ret == 0 && @@ -7732,7 +7697,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "delete too small dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); - goto out; + return ret; } ret = 1; } @@ -7746,7 +7711,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, btrfs_warn(fs_info, "insert dev_stats item for device %s failed %d", btrfs_dev_name(device), ret); - goto out; + return ret; } } @@ -7755,8 +7720,6 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); -out: - btrfs_free_path(path); return ret; } @@ -8046,7 +8009,7 @@ out: */ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = 
fs_info->dev_root; struct btrfs_key key; u64 prev_devid = 0; @@ -8077,17 +8040,15 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) path->reada = READA_FORWARD; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) - goto out; + return ret; /* No dev extents at all? Not good */ - if (unlikely(ret > 0)) { - ret = -EUCLEAN; - goto out; - } + if (unlikely(ret > 0)) + return -EUCLEAN; } while (1) { struct extent_buffer *leaf = path->nodes[0]; @@ -8113,20 +8074,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", devid, physical_offset, prev_dev_ext_end); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = verify_one_dev_extent(fs_info, chunk_offset, devid, physical_offset, physical_len); if (ret < 0) - goto out; + return ret; prev_devid = devid; prev_dev_ext_end = physical_offset + physical_len; ret = btrfs_next_item(root, path); if (ret < 0) - goto out; + return ret; if (ret > 0) { ret = 0; break; @@ -8134,10 +8094,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) } /* Ensure all chunks have corresponding dev extents */ - ret = verify_chunk_dev_extent_mapping(fs_info); -out: - btrfs_free_path(path); - return ret; + return verify_chunk_dev_extent_mapping(fs_info); } /* diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 79fb1614bd0c..3d27eb1e2f74 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -29,9 +29,8 @@ int btrfs_getxattr(const struct inode *inode, const char *name, { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; - int ret = 0; unsigned long data_ptr; path = btrfs_alloc_path(); @@ -41,26 +40,19 @@ int btrfs_getxattr(const struct inode *inode, 
const char *name, /* lookup the xattr by name */ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(BTRFS_I(inode)), name, strlen(name), 0); - if (!di) { - ret = -ENODATA; - goto out; - } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } + if (!di) + return -ENODATA; + if (IS_ERR(di)) + return PTR_ERR(di); leaf = path->nodes[0]; /* if size is 0, that means we want the size of the attr */ - if (!size) { - ret = btrfs_dir_data_len(leaf, di); - goto out; - } + if (!size) + return btrfs_dir_data_len(leaf, di); /* now get the data out of our dir_item */ - if (btrfs_dir_data_len(leaf, di) > size) { - ret = -ERANGE; - goto out; - } + if (btrfs_dir_data_len(leaf, di) > size) + return -ERANGE; /* * The way things are packed into the leaf is like this @@ -73,11 +65,7 @@ int btrfs_getxattr(const struct inode *inode, const char *name, btrfs_dir_name_len(leaf, di)); read_extent_buffer(leaf, buffer, data_ptr, btrfs_dir_data_len(leaf, di)); - ret = btrfs_dir_data_len(leaf, di); - -out: - btrfs_free_path(path); - return ret; + return btrfs_dir_data_len(leaf, di); } int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, @@ -278,7 +266,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -354,8 +342,6 @@ next: else ret = total_size; - btrfs_free_path(path); - return ret; } From 771af6ff72e0ed0eb8bf97e5ae4fa5094e0c5d1d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 14 Oct 2025 19:18:17 +0100 Subject: [PATCH 039/147] btrfs: remove fs_info argument from btrfs_sysfs_add_space_info_type() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- fs/btrfs/sysfs.c | 5 ++--- fs/btrfs/sysfs.h | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9ced89678953..69237f5d6078 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -274,7 +274,7 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag sub_group->parent = parent; sub_group->subgroup_id = id; - ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + ret = btrfs_sysfs_add_space_info_type(sub_group); if (ret) { kfree(sub_group); parent->sub_group[index] = NULL; @@ -308,7 +308,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) return ret; } - ret = btrfs_sysfs_add_space_info_type(info, space_info); + ret = btrfs_sysfs_add_space_info_type(space_info); if (ret) return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81f52c1f55ce..d66681ce2b3d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1981,13 +1981,12 @@ static const char *alloc_name(struct btrfs_space_info *space_info) * Create a sysfs entry for a space info type at path * /sys/fs/btrfs/UUID/allocation/TYPE */ -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info) { int ret; ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - fs_info->space_info_kobj, "%s", + space_info->fs_info->space_info_kobj, "%s", alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 0f94ae923210..05498e5346c3 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -37,8 +37,7 @@ void __cold btrfs_exit_sysfs(void); int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); void btrfs_sysfs_remove_mounted(struct btrfs_fs_info 
*fs_info); void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); -int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info); +int btrfs_sysfs_add_space_info_type(struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); void btrfs_sysfs_update_devid(struct btrfs_device *device); From a232ff90d14657c8637c6e94b606bb5d700a2ecb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 14 Oct 2025 19:20:31 +0100 Subject: [PATCH 040/147] btrfs: remove fs_info argument from btrfs_zoned_activate_one_bg() We don't need it since we can grab fs_info from the given space_info. So remove the fs_info argument. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 ++-- fs/btrfs/zoned.c | 5 ++--- fs/btrfs/zoned.h | 6 ++---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d7451400cc22..ec1e4fc0cd51 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3071,7 +3071,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + ret = btrfs_zoned_activate_one_bg(space_info, true); if (ret < 0) goto out; @@ -4339,7 +4339,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * We have a new chunk. We also need to activate it for * zoned filesystem. 
*/ - ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + ret = btrfs_zoned_activate_one_bg(info, true); if (ret < 0) return; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d1db7fa1fe58..9b8c9894a1de 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2750,10 +2750,9 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return ret < 0 ? ret : 1; } -int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool do_finish) +int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish) { + struct btrfs_fs_info *fs_info = space_info->fs_info; struct btrfs_block_group *bg; int index; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 17c5656580dd..d64f7c9255fa 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -94,8 +94,7 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); -int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, bool do_finish); +int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish); void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); #else /* CONFIG_BLK_DEV_ZONED */ @@ -262,8 +261,7 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return 1; } -static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, +static inline int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish) { /* Consider all the block groups are active */ From 225e747ea57781198b44cb65373d076865c51a7a Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 15 Oct 2025 15:05:21 +0800 Subject: [PATCH 041/147] btrfs: 
remove redundant refcount check in btrfs_put_transaction() Eric Dumazet removed the redundant refcount check for sk_refcnt, I noticed a similar issue in btrfs_put_transaction(). refcount_dec_and_test() already checks for a zero refcount and complains, making the preceding WARN_ON redundant. This is a leftover from the atomic_t times. Signed-off-by: Xuanqiang Luo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6607e354eae5..907f2d047b44 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -138,7 +138,6 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs)); From 95de4b097e25225d4deb5a33a4bfc27bb441f2d8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Oct 2025 12:40:06 +0100 Subject: [PATCH 042/147] btrfs: add macros to facilitate printing of keys There's a lot of places where we need to print a key, and it's tiresome to type the format specifier, typically "(%llu %u %llu)", as well as passing 3 arguments to a prink family function (key->objectid, key->type, key->offset). So add a couple macros for this just like we have for csum values in btrfs_inode.h (CSUM_FMT and CSUM_FMT_VALUE). This also ensures that we consistently print a key in the same format, always as "(%llu %llu %llu)", which is the most common format we use, but we have a few variations such as "[%llu %llu %llu]" for no good reason. This patch introduces the macros while the next one makes use of it. 
This is to ease backports of future patches, since then we can backport this patch which is simple and short and then backport those future patches, as the next patch in the series that makes use of these new macros is quite large and may have some dependencies. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index c83fd192a7dc..522152904b8f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -74,6 +74,9 @@ struct btrfs_space_info; #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); +#define BTRFS_KEY_FMT "(%llu %u %llu)" +#define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset + /* * Number of metadata items necessary for an unlink operation: * From af1e800c0244a04f5eb0993745c23d974f262628 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 15 Oct 2025 13:16:26 +0100 Subject: [PATCH 043/147] btrfs: use the key format macros when printing keys Change all locations that print a key to use the new macros to print them in order to ensure a consistent style and avoid repetitive code. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 11 +++++------ fs/btrfs/ctree.c | 17 +++++++---------- fs/btrfs/extent-tree.c | 14 +++++++------- fs/btrfs/inode.c | 4 ++-- fs/btrfs/print-tree.c | 14 ++++++-------- fs/btrfs/qgroup.c | 6 ++---- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/send.c | 10 ++++------ fs/btrfs/tree-checker.c | 21 +++++++++------------ fs/btrfs/tree-log.c | 38 ++++++++++++++++++-------------------- 11 files changed, 64 insertions(+), 79 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2ab550a1e715..e050d0938dc4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -666,10 +666,9 @@ static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); btrfs_debug(ctx->fs_info, - "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", - ref->root_id, level, ref->count, ret, - ref->key_for_search.objectid, ref->key_for_search.type, - ref->key_for_search.offset); +"search slot in root %llu (level %d, ref count %d) returned %d for key " BTRFS_KEY_FMT, + ref->root_id, level, ref->count, ret, + BTRFS_KEY_FMT_VALUE(&ref->key_for_search)); if (ret < 0) goto out; @@ -3323,9 +3322,9 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, eb = path->nodes[level]; if (btrfs_node_blockptr(eb, path->slots[level]) != cur->bytenr) { btrfs_err(fs_info, -"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)", +"couldn't find block (%llu) (level %d) in tree (%llu) with key " BTRFS_KEY_FMT, cur->bytenr, level - 1, btrfs_root_id(root), - tree_key->objectid, tree_key->type, tree_key->offset); + BTRFS_KEY_FMT_VALUE(tree_key)); btrfs_put_root(root); ret = -ENOENT; goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 561658aca018..3be1b66aea35 100644 --- a/fs/btrfs/ctree.c +++ 
b/fs/btrfs/ctree.c @@ -2599,12 +2599,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2613,12 +2612,11 @@ void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, - "slot %u key (%llu %u %llu) new key (%llu %u %llu)", + "slot %u key " BTRFS_KEY_FMT " new key " BTRFS_KEY_FMT, slot, btrfs_disk_key_objectid(&disk_key), btrfs_disk_key_type(&disk_key), btrfs_disk_key_offset(&disk_key), - new_key->objectid, new_key->type, - new_key->offset); + BTRFS_KEY_FMT_VALUE(new_key)); BUG(); } } @@ -2677,10 +2675,9 @@ static bool check_sibling_keys(const struct extent_buffer *left, btrfs_crit(left->fs_info, "right extent buffer:"); btrfs_print_tree(right, false); btrfs_crit(left->fs_info, -"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", - left_last.objectid, left_last.type, - left_last.offset, right_first.objectid, - right_first.type, right_first.offset); +"bad key order, sibling blocks, left last " BTRFS_KEY_FMT " right first " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&left_last), + BTRFS_KEY_FMT_VALUE(&right_first)); return true; } return false; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d1e75da97f58..ae2c3dc9957e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -164,8 +164,8 @@ search_again: if (unlikely(num_refs == 0)) { ret = -EUCLEAN; btrfs_err(fs_info, - "unexpected zero reference count for extent item (%llu %u %llu)", - key.objectid, key.type, key.offset); + 
"unexpected zero reference count for extent item " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, ret); return ret; } @@ -597,8 +597,8 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, num_refs = btrfs_shared_data_ref_count(leaf, ref2); } else { btrfs_err(trans->fs_info, - "unrecognized backref key (%llu %u %llu)", - key.objectid, key.type, key.offset); + "unrecognized backref key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key)); btrfs_abort_transaction(trans, -EUCLEAN); return -EUCLEAN; } @@ -3326,9 +3326,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (iref) { if (unlikely(path->slots[0] != extent_slot)) { abort_and_dump(trans, path, -"invalid iref, extent item key (%llu %u %llu) slot %u doesn't have wanted iref", - key.objectid, key.type, - key.offset, path->slots[0]); +"invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref", + BTRFS_KEY_FMT_VALUE(&key), + path->slots[0]); ret = -EUCLEAN; goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81cb8a86324d..41b1d7819b86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5644,9 +5644,9 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, location->type != BTRFS_ROOT_ITEM_KEY)) { ret = -EUCLEAN; btrfs_warn(root->fs_info, -"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location " BTRFS_KEY_FMT ")", __func__, fname.disk_name.name, btrfs_ino(dir), - location->objectid, location->type, location->offset); + BTRFS_KEY_FMT_VALUE(location)); } if (!ret) *type = btrfs_dir_ftype(path->nodes[0], di); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index d16f2960d55d..f189bf09ce6a 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -131,7 +131,7 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type struct 
btrfs_tree_block_info *info; info = (struct btrfs_tree_block_info *)(ei + 1); btrfs_tree_block_key(eb, info, &key); - pr_info("\t\ttree block key (%llu %u %llu) level %d\n", + pr_info("\t\ttree block key " BTRFS_KEY_FMT " level %d\n", btrfs_disk_key_objectid(&key), key.type, btrfs_disk_key_offset(&key), btrfs_tree_block_level(eb, info)); @@ -277,9 +277,8 @@ static void print_dir_item(const struct extent_buffer *eb, int i) struct btrfs_key location; btrfs_dir_item_key_to_cpu(eb, di, &location); - pr_info("\t\tlocation key (%llu %u %llu) type %d\n", - location.objectid, location.type, location.offset, - btrfs_dir_ftype(eb, di)); + pr_info("\t\tlocation key " BTRFS_KEY_FMT " type %d\n", + BTRFS_KEY_FMT_VALUE(&location), btrfs_dir_ftype(eb, di)); pr_info("\t\ttransid %llu data_len %u name_len %u\n", btrfs_dir_transid(eb, di), data_len, name_len); di = (struct btrfs_dir_item *)((char *)di + len); @@ -598,10 +597,9 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow) print_eb_refs_lock(c); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); - pr_info("\tkey %d (%llu %u %llu) block %llu gen %llu\n", - i, key.objectid, key.type, key.offset, - btrfs_node_blockptr(c, i), - btrfs_node_ptr_generation(c, i)); + pr_info("\tkey %d " BTRFS_KEY_FMT " block %llu gen %llu\n", + i, BTRFS_KEY_FMT_VALUE(&key), btrfs_node_blockptr(c, i), + btrfs_node_ptr_generation(c, i)); } if (!follow) return; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 31ad8580322a..6e3871ba1845 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3712,10 +3712,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, path, 1, 0); btrfs_debug(fs_info, - "current progress key (%llu %u %llu), search_slot ret %d", - fs_info->qgroup_rescan_progress.objectid, - fs_info->qgroup_rescan_progress.type, - fs_info->qgroup_rescan_progress.offset, ret); + "current progress key " BTRFS_KEY_FMT ", search_slot ret %d", + BTRFS_KEY_FMT_VALUE(&fs_info->qgroup_rescan_progress), 
ret); if (ret) { /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 748290758459..96539e8b7b4b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -615,8 +615,8 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); btrfs_err(fs_info, - "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", - objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); + "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT, + objectid, BTRFS_KEY_FMT_VALUE(&cpu_key)); ret = -EUCLEAN; goto fail; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index d07eab70f759..6a7e297ab0a7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -147,8 +147,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root if (unlikely(ret > 0)) { btrfs_crit(fs_info, - "unable to find root key (%llu %u %llu) in tree %llu", - key->objectid, key->type, key->offset, btrfs_root_id(root)); + "unable to find root key " BTRFS_KEY_FMT " in tree %llu", + BTRFS_KEY_FMT_VALUE(key), btrfs_root_id(root)); ret = -EUCLEAN; btrfs_abort_transaction(trans, ret); return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 96a030d28e09..caeaa50f2f44 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1053,10 +1053,8 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path, } if (unlikely(start < p->buf)) { btrfs_err(root->fs_info, - "send: path ref buffer underflow for key (%llu %u %llu)", - found_key->objectid, - found_key->type, - found_key->offset); + "send: path ref buffer underflow for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(found_key)); ret = -EINVAL; goto out; } @@ -7274,8 +7272,8 @@ static int search_key_again(const struct send_ctx *sctx, if (unlikely(ret > 0)) { btrfs_print_tree(path->nodes[path->lowest_level], false); btrfs_err(root->fs_info, -"send: key (%llu %u 
%llu) not found in %s root %llu, lowest_level %d, slot %d", - key->objectid, key->type, key->offset, +"send: key " BTRFS_KEY_FMT" not found in %s root %llu, lowest_level %d, slot %d", + BTRFS_KEY_FMT_VALUE(key), (root == sctx->parent_root ? "parent" : "send"), btrfs_root_id(root), path->lowest_level, path->slots[path->lowest_level]); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index c10b4c242acf..5684750ca7a6 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1618,10 +1618,9 @@ static int check_extent_item(struct extent_buffer *leaf, if (unlikely(prev_end > key->objectid)) { extent_err(leaf, slot, - "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]", - prev_key->objectid, prev_key->type, - prev_key->offset, key->objectid, key->type, - key->offset); + "previous extent " BTRFS_KEY_FMT " overlaps current extent " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(prev_key), + BTRFS_KEY_FMT_VALUE(key)); return -EUCLEAN; } } @@ -2060,10 +2059,9 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) /* Make sure the keys are in the right order */ if (unlikely(btrfs_comp_cpu_keys(&prev_key, &key) >= 0)) { generic_err(leaf, slot, - "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", - prev_key.objectid, prev_key.type, - prev_key.offset, key.objectid, key.type, - key.offset); + "bad key order, prev " BTRFS_KEY_FMT " current " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&prev_key), + BTRFS_KEY_FMT_VALUE(&key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } @@ -2181,10 +2179,9 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) if (unlikely(btrfs_comp_cpu_keys(&key, &next_key) >= 0)) { generic_err(node, slot, - "bad key order, current (%llu %u %llu) next (%llu %u %llu)", - key.objectid, key.type, key.offset, - next_key.objectid, next_key.type, - next_key.offset); + "bad key order, current " BTRFS_KEY_FMT " next " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&key), + 
BTRFS_KEY_FMT_VALUE(&next_key)); return BTRFS_TREE_BLOCK_BAD_KEY_ORDER; } } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7e5fe3adff2..030d0fef97bd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -198,9 +198,9 @@ static void do_abort_log_replay(struct walk_control *wc, const char *function, if (wc->log_leaf) { btrfs_crit(fs_info, - "log tree (for root %llu) leaf currently being processed (slot %d key %llu %u %llu):", +"log tree (for root %llu) leaf currently being processed (slot %d key " BTRFS_KEY_FMT "):", btrfs_root_id(wc->root), wc->log_slot, - wc->log_key.objectid, wc->log_key.type, wc->log_key.offset); + BTRFS_KEY_FMT_VALUE(&wc->log_key)); btrfs_print_leaf(wc->log_leaf); } @@ -510,9 +510,9 @@ static int overwrite_item(struct walk_control *wc) ret = btrfs_search_slot(NULL, root, &wc->log_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); return ret; } @@ -618,9 +618,8 @@ insert: btrfs_extend_item(trans, wc->subvol_path, item_size - found_size); } else if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item for key (%llu %u %llu)", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset); + "failed to insert item for key " BTRFS_KEY_FMT, + BTRFS_KEY_FMT_VALUE(&wc->log_key)); return ret; } dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); @@ -829,9 +828,9 @@ static noinline int replay_one_extent(struct walk_control *wc) &wc->log_key, sizeof(*item)); if (ret) { btrfs_abort_log_replay(wc, ret, - "failed to insert item with key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to insert item with key " BTRFS_KEY_FMT " root %llu", + 
BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } dest_offset = btrfs_item_ptr_offset(wc->subvol_path->nodes[0], @@ -1348,9 +1347,9 @@ again: ret = btrfs_search_slot(NULL, root, &search_key, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - search_key.objectid, search_key.type, - search_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&search_key), + btrfs_root_id(root)); return ret; } else if (ret == 0) { /* @@ -1483,9 +1482,9 @@ again: } if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search subvolume tree for key (%llu %u %llu) root %llu", - wc->log_key.objectid, wc->log_key.type, - wc->log_key.offset, btrfs_root_id(root)); + "failed to search subvolume tree for key " BTRFS_KEY_FMT " root %llu", + BTRFS_KEY_FMT_VALUE(&wc->log_key), + btrfs_root_id(root)); goto out; } @@ -2700,10 +2699,9 @@ static noinline int replay_dir_deletes(struct walk_control *wc, wc->subvol_path, 0, 0); if (ret < 0) { btrfs_abort_log_replay(wc, ret, - "failed to search root %llu for key (%llu %u %llu)", + "failed to search root %llu for key " BTRFS_KEY_FMT, btrfs_root_id(root), - dir_key.objectid, dir_key.type, - dir_key.offset); + BTRFS_KEY_FMT_VALUE(&dir_key)); goto out; } From ca428e9b49c77b0bfc6ebbc8536ed854463b26e2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 16 Oct 2025 11:22:57 +0100 Subject: [PATCH 044/147] btrfs: remove pointless data_end assignment in btrfs_extend_item() There's no point in setting 'data_end' to 'old_data' as we don't use it afterwards. So remove the redundant assignment which was never needed and added when the function was first added in commit 6567e837df07 ("Btrfs: early work to file_write in big extents"). 
Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3be1b66aea35..f6a9b6bbf78b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4136,7 +4136,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, memmove_leaf_data(leaf, data_end - data_size, data_end, old_data - data_end); - data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); btrfs_mark_buffer_dirty(trans, leaf); From 988f693a46d83dc832005a1403ae0471eb1f8964 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 21 Oct 2025 14:21:48 +1030 Subject: [PATCH 045/147] btrfs: subpage: simplify the PAGECACHE_TAG_TOWRITE handling In function btrfs_subpage_set_writeback() we need to keep the PAGECACHE_TAG_TOWRITE tag if the folio is still dirty. This is a needed quirk to support async extents, as a subpage range can almost suddenly go writeback, without touching other subpage ranges in the same folio. However we can simplify the handling by replacing the open-coded tag clearing with passing the @keep_write flag depending on whether the folio is dirty. Since we're holding the subpage lock already, no one is able to change the dirty/writeback flag, thus it's safe to check the folio dirty before calling __folio_start_writeback(). 
Reviewed-by: Filipe Manana Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 0a4a1ee81e63..80cd27d3267f 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -440,6 +440,7 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; + bool keep_write; spin_lock_irqsave(&bfs->lock, flags); bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); @@ -450,18 +451,9 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, * assume writeback is complete, and exit too early — violating sync * ordering guarantees. */ + keep_write = folio_test_dirty(folio); if (!folio_test_writeback(folio)) - __folio_start_writeback(folio, true); - if (!folio_test_dirty(folio)) { - struct address_space *mapping = folio_mapping(folio); - XA_STATE(xas, &mapping->i_pages, folio->index); - unsigned long xa_flags; - - xas_lock_irqsave(&xas, xa_flags); - xas_load(&xas); - xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); - xas_unlock_irqrestore(&xas, xa_flags); - } + __folio_start_writeback(folio, keep_write); spin_unlock_irqrestore(&bfs->lock, flags); } From 063171a4f0fa25fe47331b4fee3f705484f1c690 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 16:41:05 +0100 Subject: [PATCH 046/147] btrfs: return real error when failing tickets in maybe_fail_all_tickets() In case we had a transaction abort we set a ticket's error to -EIO, but we have the real error that caused the transaction to be aborted returned by the macro BTRFS_FS_ERROR(). So use that real error instead of -EIO. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 69237f5d6078..8b1cf7f6c223 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1082,7 +1082,7 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) struct btrfs_fs_info *fs_info = space_info->fs_info; struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; - const bool aborted = BTRFS_FS_ERROR(fs_info); + const int abort_error = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); @@ -1096,16 +1096,16 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (!aborted && steal_from_global_rsv(space_info, ticket)) + if (!abort_error && steal_from_global_rsv(space_info, ticket)) return true; - if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + if (!abort_error && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); remove_ticket(space_info, ticket); - if (aborted) - ticket->error = -EIO; + if (abort_error) + ticket->error = abort_error; else ticket->error = -ENOSPC; wake_up(&ticket->wait); @@ -1116,7 +1116,7 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) * here to see if we can make progress with the next ticket in * the list. 
*/ - if (!aborted) + if (!abort_error) btrfs_try_granting_tickets(space_info); } return (tickets_id != space_info->tickets_id); From 60532c2136ea205c5db0a622e1a51420c8530d0f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 14:57:27 +0100 Subject: [PATCH 047/147] btrfs: avoid recomputing used space in btrfs_try_granting_tickets() In every iteration of the loop we call btrfs_space_info_used() which sums a bunch of fields from a space_info object. This implies doing a function call besides the sum, and we are holding the space_info's spinlock while we do this, so we want to keep the critical section as short as possible since that spinlock is used in all the code for space reservation and flushing (therefore it's heavily used). So call btrfs_space_info_used() only once, before entering the loop, and then update the used value as we remove tickets. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8b1cf7f6c223..c0bad6914bb7 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -526,6 +526,7 @@ void btrfs_try_granting_tickets(struct btrfs_space_info *space_info) { struct list_head *head; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + u64 used = btrfs_space_info_used(space_info, true); lockdep_assert_held(&space_info->lock); @@ -533,18 +534,20 @@ again: while (!list_empty(head)) { struct reserve_ticket *ticket; - u64 used = btrfs_space_info_used(space_info, true); + u64 used_after; ticket = list_first_entry(head, struct reserve_ticket, list); + used_after = used + ticket->bytes; /* Check and see if our ticket can be satisfied now. 
*/ - if ((used + ticket->bytes <= space_info->total_bytes) || + if (used_after <= space_info->total_bytes || btrfs_can_overcommit(space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; space_info->tickets_id++; wake_up(&ticket->wait); + used = used_after; } else { break; } From 563ef2befb55a75ba13b66d9714d50b848de8aae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 15:12:55 +0100 Subject: [PATCH 048/147] btrfs: make btrfs_can_overcommit() return bool instead of int It's a boolean function, so switch its return type to bool. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 10 ++++------ fs/btrfs/space-info.h | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c0bad6914bb7..0fdf60f05228 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -490,22 +490,20 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, return avail; } -int btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush) +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { u64 avail; u64 used; /* Don't overcommit when in mixed mode */ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; + return false; used = btrfs_space_info_used(space_info, true); avail = calc_available_free_space(space_info, flush); - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; + return (used + bytes < space_info->total_bytes + avail); } static void remove_ticket(struct btrfs_space_info *space_info, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 2fad2e4c2252..d97b0799649f 100644 --- a/fs/btrfs/space-info.h +++ 
b/fs/btrfs/space-info.h @@ -282,8 +282,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_space_info *space_info); -int btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush); +bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_space_info *space_info, From a5f8f64aa3377b470945252f926e2cbb5a931c11 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 15:47:59 +0100 Subject: [PATCH 049/147] btrfs: avoid used space computation when trying to grant tickets In btrfs_try_granting_tickets(), we call btrfs_can_overcommit() and that calls btrfs_space_info_used(). But we already keep track, in the 'used' local variable, of the used space in the space_info, so we are just repeating the same computation and doing an extra function call while we are holding the space_info's spinlock, which is heavily used by the space reservation and flushing code. So add a local variant of btrfs_can_overcommit() that takes in the used space as an argument and therefore does not call btrfs_space_info_used(), and use it in btrfs_try_granting_tickets(). 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0fdf60f05228..f5ff51680f41 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -490,10 +490,29 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, return avail; } +static inline bool check_can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + const u64 avail = calc_available_free_space(space_info, flush); + + return (space_info_used_bytes + bytes < space_info->total_bytes + avail); +} + +static inline bool can_overcommit(const struct btrfs_space_info *space_info, + u64 space_info_used_bytes, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return false; + + return check_can_overcommit(space_info, space_info_used_bytes, bytes, flush); +} + bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - u64 avail; u64 used; /* Don't overcommit when in mixed mode */ @@ -501,9 +520,8 @@ bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, return false; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(space_info, flush); - return (used + bytes < space_info->total_bytes + avail); + return check_can_overcommit(space_info, used, bytes, flush); } static void remove_ticket(struct btrfs_space_info *space_info, @@ -539,7 +557,7 @@ again: /* Check and see if our ticket can be satisfied now. 
*/ if (used_after <= space_info->total_bytes || - btrfs_can_overcommit(space_info, ticket->bytes, flush)) { + can_overcommit(space_info, used, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); remove_ticket(space_info, ticket); ticket->bytes = 0; From 0ce6300feca082a866a58fa1f4f9af47a450c41c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 16:10:04 +0100 Subject: [PATCH 050/147] btrfs: avoid used space computation when reserving space In __reserve_bytes() we have 3 repeated calls to btrfs_space_info_used(), one early on as soon as take the space_info's spinlock, another one when we call btrfs_can_overcommit(), which calls btrfs_space_info_used() again, and a final one when we are reserving for a flush emergency. During all these calls we are holding the space_info's spinlock, which is heavily used by the space reservation and flushing code, so it's desirable to make the critical sections as short as possible. So make this more efficient by: 1) Instead of calling btrfs_can_overcommit() call the new variant can_overcommit() which takes the space_info's used space as an argument and pass the value we already computed and have in the 'used' variable; 2) Instead of calling btrfs_space_info_used() with its second argument as false when we are doing a flush emergency, decrement the space_info's bytes_may_use counter from the 'used' variable, as the difference between passing true or false as the second argument to btrfs_space_info_used() is whether or not to include the space_info's bytes_may_use counter in the computation. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f5ff51680f41..6c2769044b55 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1783,7 +1783,7 @@ static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - btrfs_can_overcommit(space_info, orig_bytes, flush))) { + can_overcommit(space_info, used, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; } @@ -1794,7 +1794,7 @@ static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, * left to allocate for the block. */ if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { - used = btrfs_space_info_used(space_info, false); + used -= space_info->bytes_may_use; if (used + orig_bytes <= space_info->total_bytes) { btrfs_space_info_update_bytes_may_use(space_info, orig_bytes); ret = 0; From 6f4779faa0c19c3a6ed0d52cb0f068ae891d7bb9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 23 Oct 2025 13:01:34 +0100 Subject: [PATCH 051/147] btrfs: inline btrfs_space_info_used() The function is simple enough to be inlined and in fact doing it even reduces the object code. In x86_64 with gcc 14.2.0-19 from Debian the results were the following: Before this change $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1919410 161703 15592 2096705 1ffe41 fs/btrfs/btrfs.ko After this change $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1918991 161675 15592 2096258 1ffc82 fs/btrfs/btrfs.ko Also remove the ASSERT() that checks the space_info argument is not NULL, as it's odd to be there since it can never be NULL and in case that ever happens during development, a stack trace from a NULL pointer dereference will be obvious. 
It was originally added when btrfs_space_info_used() was introduced in commit 4136135b080f ("Btrfs: use helper to get used bytes of space_info"). Also add a lockdep assertion to check the space_info's lock is being held by the calling task. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 10 ---------- fs/btrfs/space-info.h | 13 +++++++++++-- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6c2769044b55..53677ecb8c15 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -172,16 +172,6 @@ * thing with or without extra unallocated space. */ -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - s_info->bytes_zone_unusable + - (may_use_included ? s_info->bytes_may_use : 0); -} - /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d97b0799649f..7e16d4c116c8 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -266,6 +266,17 @@ DECLARE_SPACE_INFO_UPDATE(bytes_may_use, "space_info"); DECLARE_SPACE_INFO_UPDATE(bytes_pinned, "pinned"); DECLARE_SPACE_INFO_UPDATE(bytes_zone_unusable, "zone_unusable"); +static inline u64 btrfs_space_info_used(const struct btrfs_space_info *s_info, + bool may_use_included) +{ + lockdep_assert_held(&s_info->lock); + + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + s_info->bytes_zone_unusable + + (may_use_included ? 
s_info->bytes_may_use : 0); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group); @@ -273,8 +284,6 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); -u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info, - bool may_use_included); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, bool dump_block_groups); From 49f204be223b8bae5dd3d99f86c1ea649ce58aab Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 16:32:47 +0100 Subject: [PATCH 052/147] btrfs: bail out earlier from need_preemptive_reclaim() if we have tickets Instead of doing some calculations and then return false if it turns out we have queued tickets, check first if we have tickets and return false immediately if we have tickets, without wasting time on doing those computations. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 53677ecb8c15..bd206fc300e7 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -937,10 +937,17 @@ static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) u64 thresh; u64 used; - thresh = mult_perc(space_info->total_bytes, 90); - lockdep_assert_held(&space_info->lock); + /* + * We have tickets queued, bail so we don't compete with the async + * flushers. + */ + if (space_info->reclaim_size) + return false; + + thresh = mult_perc(space_info->total_bytes, 90); + /* If we're just plain full then async reclaim just slows us down. 
*/ if ((space_info->bytes_used + space_info->bytes_reserved + global_rsv_size) >= thresh) @@ -960,13 +967,6 @@ static bool need_preemptive_reclaim(const struct btrfs_space_info *space_info) if (used - global_rsv_size <= SZ_128M) return false; - /* - * We have tickets queued, bail so we don't compete with the async - * flushers. - */ - if (space_info->reclaim_size) - return false; - /* * If we have over half of the free space occupied by reservations or * pinned then we want to start flushing. From 8ab2b8bdbecaaf1b01adc5cfc13534a04917515d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 16:54:12 +0100 Subject: [PATCH 053/147] btrfs: increment loop count outside critical section during metadata reclaim In btrfs_preempt_reclaim_metadata_space() there's no need to increment the local variable that tracks the number of iterations of the while loop while inside the critical section delimited by the space_info's spinlock. That spinlock is heavily used by space reservation and flushing code, so it's desirable to have its critical sections as short as possible. So move the loop count increment outside the critical section. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bd206fc300e7..2dd9d4e5c2c2 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1264,8 +1264,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) u64 to_reclaim, block_rsv_size; const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); - loops++; - /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1311,6 +1309,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) spin_unlock(&space_info->lock); + loops++; + /* * We don't want to reclaim everything, just a portion, so scale * down the to_reclaim by 1/4. If it takes us down to 0, From 4ddb077378aa84d0872fdfce85e7a82fd805ee86 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:07:22 +0100 Subject: [PATCH 054/147] btrfs: shorten critical section in btrfs_preempt_reclaim_metadata_space() We are doing a lot of small calculations and assignments while holding the space_info's spinlock, which is a heavily used lock for space reservation and flushing. There's no point in holding the lock for so long when all we want is to call need_preemptive_reclaim() and get a consistent value for a couple of counters from the space_info. Instead, grab the counters into local variables, release the lock and then use the local variables. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dd9d4e5c2c2..9a072009eec8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1263,7 +1263,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) u64 delalloc_size = 0; u64 to_reclaim, block_rsv_size; const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv); + const u64 bytes_may_use = space_info->bytes_may_use; + const u64 bytes_pinned = space_info->bytes_pinned; + spin_unlock(&space_info->lock); /* * We don't have a precise counter for the metadata being * reserved for delalloc, so we'll approximate it by subtracting @@ -1275,8 +1278,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv) + btrfs_block_rsv_reserved(trans_rsv); - if (block_rsv_size < space_info->bytes_may_use) - delalloc_size = space_info->bytes_may_use - block_rsv_size; + if (block_rsv_size < bytes_may_use) + delalloc_size = bytes_may_use - block_rsv_size; /* * We don't want to include the global_rsv in our calculation, @@ -1293,10 +1296,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) if (delalloc_size > block_rsv_size) { to_reclaim = delalloc_size; flush = FLUSH_DELALLOC; - } else if (space_info->bytes_pinned > + } else if (bytes_pinned > (btrfs_block_rsv_reserved(delayed_block_rsv) + btrfs_block_rsv_reserved(delayed_refs_rsv))) { - to_reclaim = space_info->bytes_pinned; + to_reclaim = bytes_pinned; flush = COMMIT_TRANS; } else if (btrfs_block_rsv_reserved(delayed_block_rsv) > btrfs_block_rsv_reserved(delayed_refs_rsv)) { @@ -1307,8 +1310,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) flush = 
FLUSH_DELAYED_REFS_NR; } - spin_unlock(&space_info->lock); - loops++; /* From afbc047ab0db1470c1d5ff82788a8a94431dc7e9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:14:11 +0100 Subject: [PATCH 055/147] btrfs: avoid unnecessary reclaim calculation in priority_reclaim_metadata_space() If the given ticket was already served (its ->bytes is 0), then we wasted time calculating the metadata reclaim size. So calculate it only after we checked the ticket was not yet served. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9a072009eec8..b03c015d5d51 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1501,7 +1501,6 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, int flush_state = 0; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still @@ -1513,6 +1512,8 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, return; } + to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); + while (flush_state < states_nr) { spin_unlock(&space_info->lock); flush_space(space_info, to_reclaim, states[flush_state], false); From f18a203a1b316f4cb2a2bd38ed79fc9182a1ddab Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:26:58 +0100 Subject: [PATCH 056/147] btrfs: assert space_info is locked in steal_from_global_rsv() The caller is supposed to have locked the space_info, so assert that. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index b03c015d5d51..a2af55178c69 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1047,6 +1047,8 @@ static bool steal_from_global_rsv(struct btrfs_space_info *space_info, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; u64 min_bytes; + lockdep_assert_held(&space_info->lock); + if (!ticket->steal) return false; From 5ca7725ddfc5b7a1e5b87ba3cb489b3cd052faab Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:30:38 +0100 Subject: [PATCH 057/147] btrfs: assign booleans to global reserve's full field We have a couple places that are assigning 0 and 1 to the full field of the global reserve. This is harmless since 0 is converted to false and 1 converted to true, but for better readability, replace these with true and false since the field is of type bool. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a2af55178c69..62e1ba7f09c0 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1067,7 +1067,7 @@ static bool steal_from_global_rsv(struct btrfs_space_info *space_info, wake_up(&ticket->wait); space_info->tickets_id++; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); return true; @@ -2186,7 +2186,7 @@ void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len) global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(space_info, to_add); if (global_rsv->reserved >= global_rsv->size) - global_rsv->full = 1; + global_rsv->full = true; len -= to_add; } spin_unlock(&global_rsv->lock); From 189db2510569c0f1cc7eefb583c48a36d373cae3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:34:36 +0100 Subject: [PATCH 058/147] btrfs: process ticket outside global reserve critical section In steal_from_global_rsv() there's no need to process the ticket inside the critical section of the global reserve. Move the ticket processing to happen after the critical section. This helps reduce contention on the global reserve's spinlock. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 62e1ba7f09c0..957477e5f01f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1062,13 +1062,14 @@ static bool steal_from_global_rsv(struct btrfs_space_info *space_info, return false; } global_rsv->reserved -= ticket->bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = false; + spin_unlock(&global_rsv->lock); + remove_ticket(space_info, ticket); ticket->bytes = 0; wake_up(&ticket->wait); space_info->tickets_id++; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); return true; } From b70c32f10a049a6e7c7c718d6ce69554af1e9b3c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 17 Oct 2025 17:58:23 +0100 Subject: [PATCH 059/147] btrfs: remove double underscore prefix from __reserve_bytes() The use of a double underscore prefix is discouraged and we have no justification at all for it since there's no reserved_bytes() counterpart. So remove the prefix.
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 957477e5f01f..edeb46f1aa33 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -67,7 +67,7 @@ * Assume we are unable to simply make the reservation because we do not have * enough space * - * -> __reserve_bytes + * -> reserve_bytes * create a reserve_ticket with ->bytes set to our reservation, add it to * the tail of space_info->tickets, kick async flush thread * @@ -1728,8 +1728,8 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct btrfs_fs_info *fs_info = space_info->fs_info; struct work_struct *async_work; @@ -1879,7 +1879,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_space_info *space_info, { int ret; - ret = __reserve_bytes(space_info, orig_bytes, flush); + ret = reserve_bytes(space_info, orig_bytes, flush); if (ret == -ENOSPC) { struct btrfs_fs_info *fs_info = space_info->fs_info; @@ -1913,7 +1913,7 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(space_info, bytes, flush); + ret = reserve_bytes(space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", space_info->flags, bytes, 1); From f7a32dd2a616c333cff2d6fb7e3d854ec8d3ae41 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 12:39:52 +0100 
Subject: [PATCH 060/147] btrfs: reduce space_info critical section in btrfs_chunk_alloc() There's no need to update local variables while holding the space_info's spinlock, since the update isn't using anything from the space_info. So move these updates outside the critical section to shorten it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ec1e4fc0cd51..ebd4c514c2c8 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4191,11 +4191,11 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, should_alloc = should_alloc_chunk(fs_info, space_info, force); if (space_info->full) { /* No more free physical space */ + spin_unlock(&space_info->lock); if (should_alloc) ret = -ENOSPC; else ret = 0; - spin_unlock(&space_info->lock); return ret; } else if (!should_alloc) { spin_unlock(&space_info->lock); @@ -4207,16 +4207,16 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, * recheck if we should continue with our allocation * attempt. */ + spin_unlock(&space_info->lock); wait_for_alloc = true; force = CHUNK_ALLOC_NO_FORCE; - spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { /* Proceed with allocation */ space_info->chunk_alloc = true; - wait_for_alloc = false; spin_unlock(&space_info->lock); + wait_for_alloc = false; } cond_resched(); From 8b6fa164ab59f9e3f24e627fe09a0234783e7a8b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 12:47:26 +0100 Subject: [PATCH 061/147] btrfs: reduce block group critical section in btrfs_free_reserved_bytes() There's no need to update the space_info fields (bytes_reserved, max_extent_size, bytes_readonly, bytes_zone_unusable) while holding the block group's spinlock. 
So move those updates to happen after we unlock the block group (and while holding the space_info locked of course), so that all we do under the block group's critical section is to update the block group itself. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ebd4c514c2c8..856bda9c99d9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3858,21 +3858,24 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; + bool bg_ro; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (cache->ro) - space_info->bytes_readonly += num_bytes; - else if (btrfs_is_zoned(cache->fs_info)) - space_info->bytes_zone_unusable += num_bytes; + bg_ro = cache->ro; cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->max_extent_size = 0; - if (is_delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + if (bg_ro) + space_info->bytes_readonly += num_bytes; + else if (btrfs_is_zoned(cache->fs_info)) + space_info->bytes_zone_unusable += num_bytes; + + space_info->bytes_reserved -= num_bytes; + space_info->max_extent_size = 0; + btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } From a270cb420c06ae7b52f385e139577209c705e5e0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 12:57:34 +0100 Subject: [PATCH 062/147] btrfs: reduce block group critical section in btrfs_add_reserved_bytes() We are doing some things inside the block group's critical section that are relevant only to the space_info: updating the space_info counters bytes_reserved and bytes_may_use as well as trying to grant tickets (calling btrfs_try_granting_tickets()), and this later can 
take some time. So move all those updates to outside the block group's critical section and still inside the space_info's critical section. Like this we keep the block group's critical section only for block group updates and can help reduce contention on a block group's lock. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 856bda9c99d9..b964eacc1610 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3813,30 +3813,38 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, spin_lock(&cache->lock); if (cache->ro) { ret = -EAGAIN; - goto out; + goto out_error; } if (btrfs_block_group_should_use_size_class(cache)) { size_class = btrfs_calc_block_group_size_class(num_bytes); ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class); if (ret) - goto out; + goto out_error; } + cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - trace_btrfs_space_reservation(cache->fs_info, "space_info", - space_info->flags, num_bytes, 1); - btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, "space_info", + space_info->flags, num_bytes, 1); + spin_unlock(&cache->lock); + + space_info->bytes_reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes); + /* * Compression can use less space than we reserved, so wake tickets if * that happens. 
*/ if (num_bytes < ram_bytes) btrfs_try_granting_tickets(space_info); -out: + spin_unlock(&space_info->lock); + + return 0; + +out_error: spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; From c0d0b13d27f8cf9d5bf5adae52df8ec781fbb983 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 13:17:23 +0100 Subject: [PATCH 063/147] btrfs: reduce block group critical section in do_trimming() There's no need to update the bytes_reserved and bytes_readonly fields of the space_info while holding the block group's spinlock. We are only making the critical section longer than necessary. So move the space_info updates outside of the block group's critical section. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ab873bd67192..6ccb492eae8e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3656,7 +3656,7 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; - int update = 0; + bool bg_ro; const u64 end = start + bytes; const u64 reserved_end = reserved_start + reserved_bytes; enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; @@ -3664,12 +3664,14 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (!block_group->ro) { + bg_ro = block_group->ro; + if (!bg_ro) { block_group->reserved += reserved_bytes; + spin_unlock(&block_group->lock); space_info->bytes_reserved += reserved_bytes; - update = 1; + } else { + spin_unlock(&block_group->lock); } - spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, 
start, bytes, &trimmed); @@ -3690,14 +3692,16 @@ static int do_trimming(struct btrfs_block_group *block_group, list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); - if (update) { + if (!bg_ro) { spin_lock(&space_info->lock); spin_lock(&block_group->lock); - if (block_group->ro) - space_info->bytes_readonly += reserved_bytes; + bg_ro = block_group->ro; block_group->reserved -= reserved_bytes; - space_info->bytes_reserved -= reserved_bytes; spin_unlock(&block_group->lock); + + space_info->bytes_reserved -= reserved_bytes; + if (bg_ro) + space_info->bytes_readonly += reserved_bytes; spin_unlock(&space_info->lock); } From 585416766d2711d0bcc328f54bff392f5e865ffa Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 13:37:32 +0100 Subject: [PATCH 064/147] btrfs: reduce block group critical section in pin_down_extent() There's no need to update the bytes_reserved and bytes_may_use fields of the space_info while holding the block group's spinlock. We are only making the critical section longer than necessary. So move the space_info updates outside of the block group's critical section. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ae2c3dc9957e..70b77fe21b9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2594,15 +2594,15 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *cache, u64 bytenr, u64 num_bytes, int reserved) { + const u64 reserved_bytes = (reserved ? 
num_bytes : 0); + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); - if (reserved) { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; - } + cache->reserved -= reserved_bytes; spin_unlock(&cache->lock); + cache->space_info->bytes_reserved -= reserved_bytes; + btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); spin_unlock(&cache->space_info->lock); btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, From ec8022cd2656935bdf0be13110c1a27dfe154aaf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 13:40:56 +0100 Subject: [PATCH 065/147] btrfs: use local variable for space_info in pin_down_extent() Instead of dereferencing the block group multiple times to access its space_info, use a local variable to shorten the code horizontal wise and make it easier to read. Also, while at it, also rename the block group argument from 'cache' to 'bg', as the cache name is confusing and it's from the old days where the block group structure was named as 'btrfs_block_group_cache'. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 70b77fe21b9f..4be20949f0ba 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2591,19 +2591,20 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info) } static int pin_down_extent(struct btrfs_trans_handle *trans, - struct btrfs_block_group *cache, + struct btrfs_block_group *bg, u64 bytenr, u64 num_bytes, int reserved) { + struct btrfs_space_info *space_info = bg->space_info; const u64 reserved_bytes = (reserved ? 
num_bytes : 0); - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += num_bytes; - cache->reserved -= reserved_bytes; - spin_unlock(&cache->lock); - cache->space_info->bytes_reserved -= reserved_bytes; - btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes); - spin_unlock(&cache->space_info->lock); + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + bg->pinned += num_bytes; + bg->reserved -= reserved_bytes; + spin_unlock(&bg->lock); + space_info->bytes_reserved -= reserved_bytes; + btrfs_space_info_update_bytes_pinned(space_info, num_bytes); + spin_unlock(&space_info->lock); btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); From 8dcb8e4b110d86aaae2c485622423b6f63a65408 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 13:48:33 +0100 Subject: [PATCH 066/147] btrfs: remove 'reserved' argument from btrfs_pin_extent() All callers pass a value of 1 (true) to it, so remove it. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 15 +++++++-------- fs/btrfs/extent-tree.h | 3 +-- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4be20949f0ba..21420dc26a50 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1764,7 +1764,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (TRANS_ABORTED(trans)) { if (insert_reserved) { - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); free_head_ref_squota_rsv(trans->fs_info, href); } return 0; @@ -1783,7 +1783,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, else BUG(); if (ret && insert_reserved) - btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); if (ret < 0) btrfs_err(trans->fs_info, "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", @@ -1890,7 +1890,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); if (head->must_insert_reserved) { - btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1); + btrfs_pin_extent(trans, head->bytenr, head->num_bytes); if (head->is_data) { struct btrfs_root *csum_root; @@ -2611,15 +2611,14 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, return 0; } -int btrfs_pin_extent(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int reserved) +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(trans, cache, bytenr, num_bytes, reserved); + pin_down_extent(trans, cache, bytenr, num_bytes, 1); btrfs_put_block_group(cache); return 0; @@ -3538,7 
+3537,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if (ref->ref_root == BTRFS_TREE_LOG_OBJECTID) { - btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes, 1); + btrfs_pin_extent(trans, ref->bytenr, ref->num_bytes); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { ret = btrfs_add_delayed_tree_ref(trans, ref, NULL); @@ -5022,7 +5021,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, offset, ins, 1, root_objectid); if (ret) - btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + btrfs_pin_extent(trans, ins->objectid, ins->offset); ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index e970ac42a871..e573509c5a71 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -110,8 +110,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags, u64 *owner_root); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); From 4cb0abc1cf4f46f9b910ce19e79f326c1f16cecb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 13:52:11 +0100 Subject: [PATCH 067/147] btrfs: change 'reserved' argument from pin_down_extent() to bool It's used as a boolean, so convert it from int type to bool type. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 21420dc26a50..36963b4a6303 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2592,7 +2592,7 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info) static int pin_down_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg, - u64 bytenr, u64 num_bytes, int reserved) + u64 bytenr, u64 num_bytes, bool reserved) { struct btrfs_space_info *space_info = bg->space_info; const u64 reserved_bytes = (reserved ? num_bytes : 0); @@ -2618,7 +2618,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes cache = btrfs_lookup_block_group(trans->fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(trans, cache, bytenr, num_bytes, 1); + pin_down_extent(trans, cache, bytenr, num_bytes, true); btrfs_put_block_group(cache); return 0; @@ -2642,7 +2642,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, eb->start, eb->len, 0); + pin_down_extent(trans, cache, eb->start, eb->len, false); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, eb->start, eb->len); @@ -3483,7 +3483,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, bg = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, bg, buf->start, buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -3507,7 +3507,7 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) || btrfs_is_zoned(fs_info)) { - pin_down_extent(trans, bg, buf->start, 
buf->len, 1); + pin_down_extent(trans, bg, buf->start, buf->len, true); btrfs_put_block_group(bg); goto out; } @@ -4775,7 +4775,7 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, return -ENOSPC; } - ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, true); btrfs_put_block_group(cache); return ret; } From 36574363b75c6adf4642dc5f33b2a33870c8da3c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 15:53:01 +0100 Subject: [PATCH 068/147] btrfs: reduce block group critical section in unpin_extent_range() There's no need to update the bytes_pinned, bytes_readonly and max_extent_size fields of the space_info while inside the critical section delimited by the block group's lock. So move that out of the block group's critical section, but still inside the space_info's critical section. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36963b4a6303..d839d8d32412 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2747,13 +2747,12 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; - bool readonly; int ret = 0; while (start <= end) { u64 len; + bool readonly; - readonly = false; if (!cache || start >= cache->start + cache->length) { if (cache) @@ -2797,20 +2796,21 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); + readonly = cache->ro; cache->pinned -= len; + spin_unlock(&cache->lock); + btrfs_space_info_update_bytes_pinned(space_info, -len); space_info->max_extent_size = 0; - if (cache->ro) { + if (readonly) { space_info->bytes_readonly += len;
} else if (btrfs_is_zoned(fs_info)) { /* Need reset before reusing in a zoned block group */ btrfs_space_info_update_bytes_zone_unusable(space_info, len); - readonly = true; - } - spin_unlock(&cache->lock); - if (!readonly && return_free_space) + } else if (return_free_space) { btrfs_return_free_space(space_info, len); + } spin_unlock(&space_info->lock); } From 8b6e1f5dcef97c8336a011c52384c0eb39691a43 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 16:08:50 +0100 Subject: [PATCH 069/147] btrfs: remove pointless label and goto from unpin_extent_range() There's no need to have an 'out' label and jump there in case we can not find a block group. We can simply return directly since there are no resources to release, removing the need for the label and the 'ret' variable. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d839d8d32412..f981ff72fb98 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2747,7 +2747,6 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, struct btrfs_free_cluster *cluster = NULL; u64 total_unpinned = 0; u64 empty_cluster = 0; - int ret = 0; while (start <= end) { u64 len; @@ -2761,8 +2760,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, cache = btrfs_lookup_block_group(fs_info, start); if (unlikely(cache == NULL)) { /* Logic error, something removed the block group. 
*/ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } cluster = fetch_cluster_info(fs_info, @@ -2816,8 +2814,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (cache) btrfs_put_block_group(cache); -out: - return ret; + + return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) From cdf8a566eeef0c28a082dcdfb5d91e964029d6c3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 23 Oct 2025 13:24:22 +0100 Subject: [PATCH 070/147] btrfs: add data_race() in btrfs_account_ro_block_groups_free_space() Surround the intentional empty list check with the data_race() annotation so that tools like KCSAN don't report a data race. The race is intentional as it's harmless and we want to avoid lock contention of the space_info since its lock is heavily used (space reservation, space flushing, extent allocation and deallocation, etc). Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index edeb46f1aa33..be58f702cc61 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1948,7 +1948,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) int factor; /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) + if (data_race(list_empty(&sinfo->ro_bgs))) return 0; spin_lock(&sinfo->lock); From 50a51b53782af2f9eabe77b1f0d5a3b339ee4531 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 20 Oct 2025 22:59:03 +0100 Subject: [PATCH 071/147] btrfs: move ticket wakeup and finalization to remove_ticket() Instead of repeating the wakeup and setup of the ->bytes or ->error field, move those steps to remove_ticket() to avoid duplication. This is also needed for the next patch in the series, so that we avoid duplicating more logic. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 40 ++++++++++++++++++---------------------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index be58f702cc61..86cd87c5884a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -515,13 +515,20 @@ bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, } static void remove_ticket(struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) + struct reserve_ticket *ticket, int error) { if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); ASSERT(space_info->reclaim_size >= ticket->bytes); space_info->reclaim_size -= ticket->bytes; } + + if (error) + ticket->error = error; + else + ticket->bytes = 0; + + wake_up(&ticket->wait); } /* @@ -549,10 +556,8 @@ again: if (used_after <= space_info->total_bytes || can_overcommit(space_info, used, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes); - remove_ticket(space_info, ticket); - ticket->bytes = 0; + remove_ticket(space_info, ticket, 0); space_info->tickets_id++; - wake_up(&ticket->wait); used = used_after; } else { break; @@ -1066,9 +1071,7 @@ static bool steal_from_global_rsv(struct btrfs_space_info *space_info, global_rsv->full = false; spin_unlock(&global_rsv->lock); - remove_ticket(space_info, ticket); - ticket->bytes = 0; - wake_up(&ticket->wait); + remove_ticket(space_info, ticket, 0); space_info->tickets_id++; return true; @@ -1115,12 +1118,10 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); - remove_ticket(space_info, ticket); if (abort_error) - ticket->error = abort_error; + remove_ticket(space_info, ticket, abort_error); else - ticket->error = -ENOSPC; - wake_up(&ticket->wait); + remove_ticket(space_info, ticket, -ENOSPC); 
/* * We're just throwing tickets away, so more flushing may not @@ -1536,13 +1537,10 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ - if (BTRFS_FS_ERROR(fs_info)) { - ticket->error = BTRFS_FS_ERROR(fs_info); - remove_ticket(space_info, ticket); - } else if (!steal_from_global_rsv(space_info, ticket)) { - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); - } + if (BTRFS_FS_ERROR(fs_info)) + remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info)); + else if (!steal_from_global_rsv(space_info, ticket)) + remove_ticket(space_info, ticket, -ENOSPC); /* * We must run try_granting_tickets here because we could be a large @@ -1574,8 +1572,7 @@ static void priority_reclaim_data_space(struct btrfs_space_info *space_info, } } - ticket->error = -ENOSPC; - remove_ticket(space_info, ticket); + remove_ticket(space_info, ticket, -ENOSPC); btrfs_try_granting_tickets(space_info); spin_unlock(&space_info->lock); } @@ -1599,8 +1596,7 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * despite getting an error, resulting in a space leak * (bytes_may_use counter of our space_info). */ - remove_ticket(space_info, ticket); - ticket->error = -EINTR; + remove_ticket(space_info, ticket, -EINTR); break; } spin_unlock(&space_info->lock); From f912f0af13aebfd5634ba68c1a077e9a59fca47a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 21 Oct 2025 16:35:19 +0100 Subject: [PATCH 072/147] btrfs: avoid space_info locking when checking if tickets are served When checking if a ticket was served, we take the space_info's spinlock. If the ticket was served (its ->bytes is 0) or had an error (its ->error it not 0) then we just unlock the space_info and return. 
This however causes contention on the space_info's spinlock, which is heavily used (space reservation, space flushing, allocating and deallocating an extent from a block group (btrfs_update_block_group()), etc). Instead of using the space_info's spinlock to check if a ticket was served, use a per ticket spinlock which isn't used by anyone other than the task that created the ticket (stack allocated) and the task that serves the ticket (a reclaim task or any task deallocating space that ends up at btrfs_try_granting_tickets()). After applying this patch and all previous patches from the same patchset (many attempt to reduce space_info critical sections), lockstat showed some improvements for a fs_mark test regarding the space_info's spinlock 'lock'. The lockstat results: Before patchset: con-bounces: 13733858 contentions: 15902322 waittime-total: 264902529.72 acq-bounces: 28161791 acquisitions: 38679282 After patchset: con-bounces: 12032220 contentions: 13598034 waittime-total: 221806127.28 acq-bounces: 24717947 acquisitions: 34103281 Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 67 ++++++++++++++++++++++++++----------------- fs/btrfs/space-info.h | 1 + 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 86cd87c5884a..50704e38d133 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -517,18 +517,27 @@ bool btrfs_can_overcommit(const struct btrfs_space_info *space_info, u64 bytes, static void remove_ticket(struct btrfs_space_info *space_info, struct reserve_ticket *ticket, int error) { + lockdep_assert_held(&space_info->lock); + if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); ASSERT(space_info->reclaim_size >= ticket->bytes); space_info->reclaim_size -= ticket->bytes; } - if (error) + spin_lock(&ticket->lock); + /* + * If we are called from a task waiting on the ticket, it may happen + * that before it 
sets an error on the ticket, a reclaim task was able + * to satisfy the ticket. In that case ignore the error. + */ + if (error && ticket->bytes > 0) ticket->error = error; else ticket->bytes = 0; wake_up(&ticket->wait); + spin_unlock(&ticket->lock); } /* @@ -1495,6 +1504,17 @@ static const enum btrfs_flush_state evict_flush_states[] = { RESET_ZONES, }; +static bool is_ticket_served(struct reserve_ticket *ticket) +{ + bool ret; + + spin_lock(&ticket->lock); + ret = (ticket->bytes == 0); + spin_unlock(&ticket->lock); + + return ret; +} + static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket, const enum btrfs_flush_state *states, @@ -1504,31 +1524,27 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, u64 to_reclaim; int flush_state = 0; - spin_lock(&space_info->lock); /* * This is the priority reclaim path, so to_reclaim could be >0 still * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. 
*/ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(space_info); + spin_unlock(&space_info->lock); while (flush_state < states_nr) { - spin_unlock(&space_info->lock); flush_space(space_info, to_reclaim, states[flush_state], false); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + flush_state++; } + spin_lock(&space_info->lock); /* * Attempt to steal from the global rsv if we can, except if the fs was * turned into error mode due to a transaction abort when flushing space @@ -1554,22 +1570,17 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, static void priority_reclaim_data_space(struct btrfs_space_info *space_info, struct reserve_ticket *ticket) { - spin_lock(&space_info->lock); - /* We could have been granted before we got here. */ - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); while (!space_info->full) { spin_unlock(&space_info->lock); flush_space(space_info, U64_MAX, ALLOC_CHUNK_FORCE, false); - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); + if (is_ticket_served(ticket)) return; - } + spin_lock(&space_info->lock); } remove_ticket(space_info, ticket, -ENOSPC); @@ -1582,11 +1593,13 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, { DEFINE_WAIT(wait); - int ret = 0; - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); while (ticket->bytes > 0 && ticket->error == 0) { + int ret; + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + spin_unlock(&ticket->lock); if (ret) { /* * Delete us from the list. 
After we unlock the space @@ -1596,17 +1609,18 @@ static void wait_reserve_ticket(struct btrfs_space_info *space_info, * despite getting an error, resulting in a space leak * (bytes_may_use counter of our space_info). */ + spin_lock(&space_info->lock); remove_ticket(space_info, ticket, -EINTR); - break; + spin_unlock(&space_info->lock); + return; } - spin_unlock(&space_info->lock); schedule(); finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); + spin_lock(&ticket->lock); } - spin_unlock(&space_info->lock); + spin_unlock(&ticket->lock); } /* @@ -1804,6 +1818,7 @@ static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, ticket.error = 0; space_info->reclaim_size += ticket.bytes; init_waitqueue_head(&ticket.wait); + spin_lock_init(&ticket.lock); ticket.steal = can_steal(flush); if (trace_btrfs_reserve_ticket_enabled()) start_ns = ktime_get_ns(); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 7e16d4c116c8..a4c2a3c8b388 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -230,6 +230,7 @@ struct reserve_ticket { bool steal; struct list_head list; wait_queue_head_t wait; + spinlock_t lock; }; static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) From 38e03b820e00196018a7ad2523a3c45653b4927d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Oct 2025 19:15:00 +0100 Subject: [PATCH 073/147] btrfs: annotate as unlikely fs aborted checks in space flushing code It's not expected to have the fs in an aborted state, so surround the abortion checks with unlikely to make it clear it's unexpected and to hint the compiler to generate better code. Also at maybe_fail_all_tickets() untangle all repeated checks for the abortion into a single if-then-else. This makes things more readable and makes the compiler generate less code. On x86_64 with gcc 14.2.0-19 from Debian I got the following object size differences. 
Before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 2021606 179704 25088 2226398 21f8de fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 2021458 179704 25088 2226250 21f84a fs/btrfs/btrfs.ko Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 50704e38d133..c3e1831a48a3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1119,27 +1119,26 @@ static bool maybe_fail_all_tickets(struct btrfs_space_info *space_info) tickets_id == space_info->tickets_id) { ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - - if (!abort_error && steal_from_global_rsv(space_info, ticket)) - return true; - - if (!abort_error && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_info(fs_info, "failing ticket with %llu bytes", - ticket->bytes); - - if (abort_error) + if (unlikely(abort_error)) { remove_ticket(space_info, ticket, abort_error); - else + } else { + if (steal_from_global_rsv(space_info, ticket)) + return true; + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_info(fs_info, "failing ticket with %llu bytes", + ticket->bytes); + remove_ticket(space_info, ticket, -ENOSPC); - /* - * We're just throwing tickets away, so more flushing may not - * trip over btrfs_try_granting_tickets, so we need to call it - * here to see if we can make progress with the next ticket in - * the list. - */ - if (!abort_error) + /* + * We're just throwing tickets away, so more flushing may + * not trip over btrfs_try_granting_tickets, so we need + * to call it here to see if we can make progress with + * the next ticket in the list. 
+ */ btrfs_try_granting_tickets(space_info); + } } return (tickets_id != space_info->tickets_id); } @@ -1415,7 +1414,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) } /* Something happened, fail everything and bail. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); @@ -1449,7 +1448,7 @@ static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) } /* Something happened, fail everything and bail. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) goto aborted_fs; } @@ -1553,7 +1552,7 @@ static void priority_reclaim_metadata_space(struct btrfs_space_info *space_info, * just to have caller fail immediately instead of later when trying to * modify the fs, making it easier to debug -ENOSPC problems. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) remove_ticket(space_info, ticket, BTRFS_FS_ERROR(fs_info)); else if (!steal_from_global_rsv(space_info, ticket)) remove_ticket(space_info, ticket, -ENOSPC); From 02a7e90797be89ff4f6bdf1d1fbab26964b0c13a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 19 Oct 2025 11:15:26 +1030 Subject: [PATCH 074/147] btrfs: scrub: add cancel/pause/removed bg checks for raid56 parity stripes For raid56, data and parity stripes are handled differently. For data stripes they are handled just like regular RAID1/RAID10 stripes, going through the regular scrub_simple_mirror(). But for parity stripes we have to read out all involved data stripes and do any needed verification and repair, then scrub the parity stripe. This process will take a much longer time than a regular stripe, but unlike scrub_simple_mirror(), we do not check if we should cancel/pause or the block group is already removed. 
Aligned the behavior of scrub_raid56_parity_stripe() to scrub_simple_mirror(), by adding: - Cancel check - Pause check - Removed block group check Since those checks are the same from the scrub_simple_mirror(), also update the comments of scrub_simple_mirror() by: - Remove too obvious comments We do not need extra comments on what we're checking, it's really too obvious. - Remove a stale comment about pausing Now the scrub is always queuing all involved stripes, and submit them in one go, there is no more submission part during pausing. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e760e76df3f0..00e42a7f52af 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2091,6 +2091,20 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) + return -ECANCELED; + + if (atomic_read(&fs_info->scrub_pause_req)) + scrub_blocked_if_needed(fs_info); + + spin_lock(&bg->lock); + if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { + spin_unlock(&bg->lock); + return 0; + } + spin_unlock(&bg->lock); + /* * For data stripe search, we cannot reuse the same extent/csum paths, * as the data stripe bytenr may be smaller than previous extent. Thus @@ -2263,18 +2277,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; - /* Canceled? */ if (atomic_read(&fs_info->scrub_cancel_req) || atomic_read(&sctx->cancel_req)) { ret = -ECANCELED; break; } - /* Paused? */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* Push queued extents */ + + if (atomic_read(&fs_info->scrub_pause_req)) scrub_blocked_if_needed(fs_info); - } - /* Block group removed? 
*/ + spin_lock(&bg->lock); if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &bg->runtime_flags)) { spin_unlock(&bg->lock); From c7b478504b2e5a8e428eac4c16925d52c8deb6bd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 19 Oct 2025 11:15:27 +1030 Subject: [PATCH 075/147] btrfs: scrub: cancel the run if the process or fs is being frozen It's a known bug that btrfs scrub/dev-replace can prevent the system from suspending. There are at least two factors involved: - Holding super_block::s_writers for the whole scrub/dev-replace duration We hold that percpu rw semaphore through mnt_want_write_file() for the whole scrub/dev-replace duration. That will prevent the fs being frozen, which can be initiated by either the user (e.g. fsfreeze) or power management suspend/hibernate. - Stuck in the kernel space for a long time During suspend all user processes (and some kernel threads) will be frozen. But if a user space process has fallen into kernel (scrub ioctl) and does not return for a long time, it will make process freezing time out. Unfortunately scrub/dev-replace is a long running ioctl, and it will prevent the btrfs process from returning to the user space, thus make PM suspend/hibernate time out. Address them in one go: - Introduce a new helper should_cancel_scrub() Which includes the existing cancel request and new fs/process freezing checks. Here we have to check both fs and process freezing for PM suspend/hibernate. PM can be configured to freeze filesystems before processes. (The current default is not to freeze filesystems, but planned to freeze the filesystems as the new default.) Checking only fs freezing will fail PM without fs freezing, as the process freezing will time out. Checking only process freezing will fail PM with fs freezing since the fs freezing happens before process freezing. And the return value will indicate the reason, -ECANCELED for the explicitly canceled runs, and -EINTR for fs freeze or PM reasons.
- Cancel the run if should_cancel_scrub() is true Unfortunately canceling is the only feasible solution here, pausing is not possible as we will still stay in the kernel space thus will still prevent the process from being frozen. This will cause a user impacting behavior change: Dev-replace can be interrupted by PM, and there is no way to resume but start from the beginning again. This means dev-replace may fail on newer kernels, and end users will need extra steps like using systemd-inhibit to prevent suspend/hibernate, to get back the old uninterrupted behavior. This behavior change will need extra documentation updates and communication with projects involving scrub/dev-replace including btrfs-progs. Reviewed-by: Filipe Manana Link: https://lore.kernel.org/linux-btrfs/d93b2a2d-6ad9-4c49-809f-11d769a6f30a@app.fastmail.com/ Reported-by: Chris Murphy Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 50 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 00e42a7f52af..9738caa355c4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2069,6 +2069,44 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * return 0; } +/* + * Return 0 if we should not cancel the scrub. + * Return <0 if we need to cancel the scrub, returned value will + * indicate the reason: + * - -ECANCELED - Being explicitly canceled through ioctl. + * - -EINTR - Being interrupted by fs/process freezing. + */ +static int should_cancel_scrub(const struct scrub_ctx *sctx) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) + return -ECANCELED; + + /* + * The user (e.g. fsfreeze command) or power management (PM) + * suspend/hibernate can freeze the fs. And PM suspend/hibernate will + * also freeze all user processes. 
+ * + * A user process can only be frozen when it is in user space, thus we + * have to cancel the run so that the process can return to the user + * space. + * + * Furthermore we have to check both filesystem and process freezing, + * as PM can be configured to freeze the filesystems before processes. + * + * If we only check fs freezing, then suspend without fs freezing + * will timeout, as the process is still in kernel space. + * + * If we only check process freezing, then suspend with fs freezing + * will timeout, as the running scrub will prevent the fs from being frozen. + */ + if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || freezing(current)) + return -EINTR; + return 0; +} + static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, @@ -2091,9 +2129,9 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) - return -ECANCELED; + ret = should_cancel_scrub(sctx); + if (ret < 0) + return ret; if (atomic_read(&fs_info->scrub_pause_req)) scrub_blocked_if_needed(fs_info); @@ -2277,11 +2315,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; + ret = should_cancel_scrub(sctx); + if (ret < 0) break; - } if (atomic_read(&fs_info->scrub_pause_req)) scrub_blocked_if_needed(fs_info); From 937f99c736135e530895eff028503cb057eb75f6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 19 Oct 2025 11:15:28 +1030 Subject: [PATCH 076/147] btrfs: scrub: cancel the run if there is a pending signal Unlike relocation, scrub never checks pending signals, and even for relocation is only explicitly checking for fatal signal (SIGKILL), not for regular ones. 
Thankfully relocation can still be interrupted by regular signals by the usage of wait_on_bit(), which is called with TASK_INTERRUPTIBLE. Do the same for scrub/dev-replace, so that regular signals can also cancel the scrub/replace run, and more importantly handle v2 cgroup freezing which is based on signal handling code inside the kernel, and freezing() function will not return true for v2 cgroup freezing. This will address the problem that systemd slice freezing will timeout on long running scrub/dev-replace. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9738caa355c4..5959511288c4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2074,7 +2074,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * * Return <0 if we need to cancel the scrub, returned value will * indicate the reason: * - -ECANCELED - Being explicitly canceled through ioctl. - * - -EINTR - Being interrupted by fs/process freezing. + * - -EINTR - Being interrupted by signal or fs/process freezing. */ static int should_cancel_scrub(const struct scrub_ctx *sctx) { @@ -2102,7 +2102,8 @@ static int should_cancel_scrub(const struct scrub_ctx *sctx) * If we only check process freezing, then suspend with fs freezing * will timeout, as the running scrub will prevent the fs from being frozen. 
*/ - if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || freezing(current)) + if (fs_info->sb->s_writers.frozen > SB_UNFROZEN || + freezing(current) || signal_pending(current)) return -EINTR; return 0; } From 285c3ab28eed282af70aba02d7708dea245bfc4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 24 Oct 2025 12:21:40 +0200 Subject: [PATCH 077/147] btrfs: declare free_ipath() via DEFINE_FREE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The free_ipath() function was being used as a cleanup function everywhere. Declare it via DEFINE_FREE() so we can use this function with the __free() helper. The name has also been adjusted so it's closer to the type's name. Signed-off-by: Miquel Sabaté Solà Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 10 +--------- fs/btrfs/backref.h | 7 ++++++- fs/btrfs/inode.c | 4 +--- fs/btrfs/ioctl.c | 3 +-- fs/btrfs/scrub.c | 4 +--- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e050d0938dc4..eff2d388a706 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2785,7 +2785,7 @@ struct btrfs_data_container *init_data_container(u32 total_bytes) * allocates space to return multiple file system paths for an inode. * total_bytes to allocate are passed, note that space usable for actual path * information will be total_bytes - sizeof(struct inode_fs_paths). - * the returned pointer must be freed with free_ipath() in the end. + * the returned pointer must be freed with __free_inode_fs_paths() in the end. 
*/ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path) @@ -2810,14 +2810,6 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, return ifp; } -void free_ipath(struct inode_fs_paths *ipath) -{ - if (!ipath) - return; - kvfree(ipath->fspath); - kfree(ipath); -} - struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 25d51c246070..1d009b0f4c69 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -241,7 +241,12 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, struct btrfs_data_container *init_data_container(u32 total_bytes); struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, struct btrfs_path *path); -void free_ipath(struct inode_fs_paths *ipath); + +DEFINE_FREE(inode_fs_paths, struct inode_fs_paths *, + if (_T) { + kvfree(_T->fspath); + kfree(_T); + }) int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 41b1d7819b86..7958e6c4f6b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -130,7 +130,7 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; @@ -195,7 +195,6 @@ static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, } btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -203,7 +202,6 @@ err: "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, 
root, inum, offset, ret); - free_ipath(ipath); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 127b5d8303a8..875d286bec86 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3298,7 +3298,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) u64 rel_ptr; int size; struct btrfs_ioctl_ino_path_args *ipa = NULL; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_path *path; if (!capable(CAP_DAC_READ_SEARCH)) @@ -3346,7 +3346,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) out: btrfs_free_path(path); - free_ipath(ipath); kfree(ipa); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5959511288c4..33c9cb91f0a9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -505,7 +505,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; struct btrfs_fs_info *fs_info = swarn->dev->fs_info; - struct inode_fs_paths *ipath = NULL; + struct inode_fs_paths *ipath __free(inode_fs_paths) = NULL; struct btrfs_root *local_root; struct btrfs_key key; @@ -569,7 +569,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, (char *)(unsigned long)ipath->fspath->val[i]); btrfs_put_root(local_root); - free_ipath(ipath); return 0; err: @@ -580,7 +579,6 @@ err: swarn->physical, root, inum, offset, ret); - free_ipath(ipath); return 0; } From d00cbce0a7d5de5fc31bf60abd59b44d36806b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 24 Oct 2025 12:21:41 +0200 Subject: [PATCH 078/147] btrfs: define the AUTO_KFREE/AUTO_KVFREE helper macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are two simple macros which ensure that a pointer is initialized to NULL and with the proper cleanup attribute for it. 
Signed-off-by: Miquel Sabaté Solà Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/misc.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 60f9b000d644..a82032c66ccd 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -13,6 +13,13 @@ #include #include +/* + * Convenience macros to define a pointer with the __free(kfree) and + * __free(kvfree) cleanup attributes and initialized to NULL. + */ +#define AUTO_KFREE(name) *name __free(kfree) = NULL +#define AUTO_KVFREE(name) *name __free(kvfree) = NULL + /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. */ From 7ab5d01d58a766807e137cbe8c90cb2e591e2f7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 24 Oct 2025 12:21:42 +0200 Subject: [PATCH 079/147] btrfs: apply the AUTO_K(V)FREE macros throughout the code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the AUTO_KFREE and AUTO_KVFREE macros wherever it makes sense. Since this macro is expected to improve code readability, it has been avoided in places where the lifetime of objects wasn't easy to follow and a cleanup attribute would've made things worse; or when the cleanup section of a function involved many other things and thus there was no readability impact anyways. This change has also not been applied in extremely short functions where readability was clearly not an issue. 
Signed-off-by: Miquel Sabaté Solà Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/acl.c | 25 +++++++--------- fs/btrfs/delayed-inode.c | 15 ++++------ fs/btrfs/extent-tree.c | 17 +++++------ fs/btrfs/ioctl.c | 41 ++++++++++--------------- fs/btrfs/qgroup.c | 3 +- fs/btrfs/raid-stripe-tree.c | 14 +++------ fs/btrfs/reflink.c | 7 ++--- fs/btrfs/relocation.c | 34 ++++++++------------- fs/btrfs/send.c | 50 ++++++++++++------------------- fs/btrfs/super.c | 3 +- fs/btrfs/tests/extent-io-tests.c | 3 +- fs/btrfs/tests/extent-map-tests.c | 6 ++-- fs/btrfs/tree-log.c | 43 ++++++++++---------------- fs/btrfs/volumes.c | 28 +++++------------ fs/btrfs/zoned.c | 3 +- 15 files changed, 106 insertions(+), 186 deletions(-) diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index e0ba00d64ea0..c336e2ab7f8a 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -14,12 +14,13 @@ #include "ctree.h" #include "xattr.h" #include "acl.h" +#include "misc.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { int size; const char *name; - char *value = NULL; + char AUTO_KFREE(value); struct posix_acl *acl; if (rcu) @@ -49,7 +50,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) acl = NULL; else acl = ERR_PTR(size); - kfree(value); return acl; } @@ -59,7 +59,7 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, { int ret, size = 0; const char *name; - char *value = NULL; + char AUTO_KFREE(value); switch (type) { case ACL_TYPE_ACCESS: @@ -85,28 +85,23 @@ int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, nofs_flag = memalloc_nofs_save(); value = kmalloc(size, GFP_KERNEL); memalloc_nofs_restore(nofs_flag); - if (!value) { - ret = -ENOMEM; - goto out; - } + if (!value) + return -ENOMEM; ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (ret < 0) - goto out; + return ret; } if (trans) ret = btrfs_setxattr(trans, inode, name, value, size, 0); else ret = 
btrfs_setxattr_trans(inode, name, value, size, 0); + if (ret < 0) + return ret; -out: - kfree(value); - - if (!ret) - set_cached_acl(inode, type, acl); - - return ret; + set_cached_acl(inode, type, acl); + return 0; } int btrfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3df7b9d7fbe8..e77a597580c5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -668,7 +668,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_key first_key; const u32 first_data_size = first_item->data_len; int total_size; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); int ret; bool continuous_keys_only = false; @@ -740,10 +740,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ins_data = kmalloc_array(batch.nr, sizeof(u32) + sizeof(struct btrfs_key), GFP_NOFS); - if (!ins_data) { - ret = -ENOMEM; - goto out; - } + if (!ins_data) + return -ENOMEM; ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); batch.keys = ins_keys; @@ -759,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) - goto out; + return ret; list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; @@ -814,9 +812,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - kfree(ins_data); - return ret; + + return 0; } static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f981ff72fb98..d7e5be81ea81 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6058,7 +6058,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc struct btrfs_trans_handle *trans; struct btrfs_root *tree_root = fs_info->tree_root; 
struct btrfs_root_item *root_item = &root->root_item; - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); struct btrfs_key key; const u64 rootid = btrfs_root_id(root); int ret = 0; @@ -6076,9 +6076,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, bool update_ref, bool for_reloc wc = kzalloc(sizeof(*wc), GFP_NOFS); if (!wc) { - btrfs_free_path(path); ret = -ENOMEM; - goto out; + goto out_free; } /* @@ -6288,7 +6287,6 @@ out_end_trans: btrfs_end_transaction_throttle(trans); out_free: - kfree(wc); btrfs_free_path(path); out: if (!ret && root_dropped) { @@ -6331,7 +6329,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; BTRFS_PATH_AUTO_FREE(path); - struct walk_control *wc; + struct walk_control AUTO_KFREE(wc); int level; int parent_level; int ret = 0; @@ -6370,18 +6368,17 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, while (1) { ret = walk_down_tree(trans, root, path, wc); if (ret < 0) - break; + return ret; ret = walk_up_tree(trans, root, path, wc, parent_level); if (ret) { - if (ret > 0) - ret = 0; + if (ret < 0) + return ret; break; } } - kfree(wc); - return ret; + return 0; } /* diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 875d286bec86..83a168613ee9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -503,7 +503,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap, struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -527,20 +527,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto out_root_item; + return ret; /* * Don't create subvolume whose level is not zero. 
Or qgroup will be * screwed up since it assumes subvolume qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) { - ret = -ENOSPC; - goto out_root_item; - } + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; ret = get_anon_bdev(&anon_dev); if (ret < 0) - goto out_root_item; + return ret; new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir); if (!new_inode_args.inode) { @@ -692,8 +690,7 @@ out_inode: out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); -out_root_item: - kfree(root_item); + return ret; } @@ -2956,7 +2953,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; - struct btrfs_ioctl_space_info *dest_orig; + struct btrfs_ioctl_space_info AUTO_KFREE(dest_orig); struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; static const u64 types[] = { @@ -3077,9 +3074,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, (arg + sizeof(struct btrfs_ioctl_space_args)); if (copy_to_user(user_dest, dest_orig, alloc_size)) - ret = -EFAULT; + return -EFAULT; - kfree(dest_orig); out: if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args))) ret = -EFAULT; @@ -3610,7 +3606,7 @@ static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_balance_args *bargs; + struct btrfs_ioctl_balance_args AUTO_KFREE(bargs); int ret = 0; if (!capable(CAP_SYS_ADMIN)) @@ -3632,8 +3628,6 @@ static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; - - kfree(bargs); out: mutex_unlock(&fs_info->balance_mutex); return ret; @@ -4227,7 +4221,7 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, u64 safe_set, u64 safe_clear) { const char *type = 
btrfs_feature_set_name(set); - char *names; + const char AUTO_KFREE(names); u64 disallowed, unsupported; u64 set_mask = flags & change_mask; u64 clear_mask = ~flags & change_mask; @@ -4235,12 +4229,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, unsupported = set_mask & ~supported_flags; if (unsupported) { names = btrfs_printable_features(set, unsupported); - if (names) { + if (names) btrfs_warn(fs_info, "this kernel does not support the %s feature bit%s", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "this kernel does not support %s bits 0x%llx", type, unsupported); @@ -4250,12 +4243,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = set_mask & ~safe_set; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't set the %s feature bit%s while mounted", names, strchr(names, ',') ? "s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't set %s bits 0x%llx while mounted", type, disallowed); @@ -4265,12 +4257,11 @@ static int check_feature_bits(const struct btrfs_fs_info *fs_info, disallowed = clear_mask & ~safe_clear; if (disallowed) { names = btrfs_printable_features(set, disallowed); - if (names) { + if (names) btrfs_warn(fs_info, "can't clear the %s feature bit%s while mounted", names, strchr(names, ',') ? 
"s" : ""); - kfree(names); - } else + else btrfs_warn(fs_info, "can't clear %s bits 0x%llx while mounted", type, disallowed); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 6e3871ba1845..877a65e1794f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4794,7 +4794,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; - struct btrfs_qgroup_swapped_block *block; + struct btrfs_qgroup_swapped_block AUTO_KFREE(block); struct extent_buffer *reloc_eb = NULL; struct rb_node *node; bool swapped = false; @@ -4851,7 +4851,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); free_out: - kfree(block); free_extent_buffer(reloc_eb); out: if (ret < 0) { diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index cc6f6095cc9f..f5c616115254 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -19,7 +19,7 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, u64 newlen, u64 frontpad) { struct btrfs_root *stripe_root = trans->fs_info->stripe_root; - struct btrfs_stripe_extent *extent, *newitem; + struct btrfs_stripe_extent *extent, AUTO_KFREE(newitem); struct extent_buffer *leaf; int slot; size_t item_size; @@ -53,14 +53,10 @@ static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, stripe_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); - ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); - -out: - kfree(newitem); - return ret; + return btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size); } int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) 
@@ -299,7 +295,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, struct btrfs_key stripe_key; struct btrfs_root *stripe_root = fs_info->stripe_root; const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); - struct btrfs_stripe_extent *stripe_extent; + struct btrfs_stripe_extent AUTO_KFREE(stripe_extent); const size_t item_size = struct_size(stripe_extent, strides, num_stripes); int ret; @@ -336,8 +332,6 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); } - kfree(stripe_extent); - return ret; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 1bbe3bb7e1bb..775a32a7953a 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -343,7 +343,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_trans_handle *trans; - char *buf = NULL; + char AUTO_KVFREE(buf); struct btrfs_key key; u32 nritems; int slot; @@ -358,10 +358,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; path = btrfs_alloc_path(); - if (!path) { - kvfree(buf); + if (!path) return ret; - } path->reada = READA_FORWARD; /* Clone data */ @@ -611,7 +609,6 @@ process_slot: } out: - kvfree(buf); clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); return ret; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 96539e8b7b4b..739fca944296 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -511,7 +511,7 @@ static void __del_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; - struct mapping_node *node = NULL; + struct mapping_node AUTO_KFREE(node); struct reloc_control *rc = fs_info->reloc_ctl; bool put_ref = false; @@ -544,7 +544,6 @@ static void __del_reloc_root(struct btrfs_root *root) spin_unlock(&fs_info->trans_lock); if (put_ref) btrfs_put_root(root); - kfree(node); } /* @@ -586,10 +585,9 @@ 
static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; - struct btrfs_root_item *root_item; + struct btrfs_root_item AUTO_KFREE(root_item); struct btrfs_key root_key; int ret = 0; - bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); if (!root_item) @@ -617,15 +615,14 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, btrfs_err(fs_info, "cannot relocate partially dropped subvolume %llu, drop progress key " BTRFS_KEY_FMT, objectid, BTRFS_KEY_FMT_VALUE(&cpu_key)); - ret = -EUCLEAN; - goto fail; + return ERR_PTR(-EUCLEAN); } /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); /* * Set the last_snapshot field to the generation of the commit @@ -648,14 +645,13 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); if (ret) - goto fail; + return ERR_PTR(ret); } /* * We have changed references at this point, we must abort the - * transaction if anything fails. + * transaction if anything fails (i.e. 'goto abort'). 
*/ - must_abort = true; memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); @@ -675,9 +671,7 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); if (ret) - goto fail; - - kfree(root_item); + goto abort; reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); if (IS_ERR(reloc_root)) { @@ -687,11 +681,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); btrfs_set_root_last_trans(reloc_root, trans->transid); return reloc_root; -fail: - kfree(root_item); + abort: - if (must_abort) - btrfs_abort_transaction(trans, ret); + btrfs_abort_transaction(trans, ret); return ERR_PTR(ret); } @@ -2947,7 +2939,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) const struct file_extent_cluster *cluster = &rc->cluster; u64 offset = BTRFS_I(inode)->reloc_block_group_start; u64 cur_file_offset = cluster->start - offset; - struct file_ra_state *ra; + struct file_ra_state AUTO_KFREE(ra); int cluster_nr = 0; int ret = 0; @@ -2960,13 +2952,13 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) ret = prealloc_file_extent_cluster(rc); if (ret) - goto out; + return ret; file_ra_state_init(ra, inode->i_mapping); ret = setup_relocation_extent_mapping(rc); if (ret) - goto out; + return ret; while (cur_file_offset < cluster->end - offset) { ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset); @@ -2975,8 +2967,6 @@ static int relocate_file_extent_cluster(struct reloc_control *rc) } if (ret == 0) WARN_ON(cluster_nr != cluster->nr); -out: - kfree(ra); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index caeaa50f2f44..9312d74400a3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2458,7 +2458,7 @@ static int send_subvol_begin(struct send_ctx *sctx) struct btrfs_key key; struct btrfs_root_ref 
*ref; struct extent_buffer *leaf; - char *name = NULL; + char AUTO_KFREE(name); int namelen; path = btrfs_alloc_path(); @@ -2476,18 +2476,15 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root, &key, path, 1, 0); if (ret < 0) - goto out; - if (ret) { - ret = -ENOENT; - goto out; - } + return ret; + if (ret) + return -ENOENT; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.type != BTRFS_ROOT_BACKREF_KEY || key.objectid != btrfs_root_id(send_root)) { - ret = -ENOENT; - goto out; + return -ENOENT; } ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); namelen = btrfs_root_ref_name_len(leaf, ref); @@ -2497,11 +2494,11 @@ static int send_subvol_begin(struct send_ctx *sctx) if (parent_root) { ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT); if (ret < 0) - goto out; + return ret; } else { ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL); if (ret < 0) - goto out; + return ret; } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); @@ -2529,8 +2526,6 @@ static int send_subvol_begin(struct send_ctx *sctx) ret = send_cmd(sctx); tlv_put_failure: -out: - kfree(name); return ret; } @@ -4077,7 +4072,7 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) */ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) { - char *name; + char AUTO_KFREE(name); int ret; name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); @@ -4087,17 +4082,16 @@ static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) fs_path_reset(ref->full_path); ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); if (ret < 0) - goto out; + return ret; ret = fs_path_add(ref->full_path, name, ref->name_len); if (ret < 0) - goto out; + return ret; /* Update the reference's base name pointer. 
*/ set_ref_path(ref, ref->full_path); -out: - kfree(name); - return ret; + + return 0; } static int rbtree_check_dir_ref_comp(const void *k, const struct rb_node *node) @@ -5006,8 +5000,8 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, { int ret; struct send_ctx *sctx = ctx; - char *found_data = NULL; - int found_data_len = 0; + char AUTO_KFREE(found_data); + int found_data_len = 0; ret = find_xattr(sctx->parent_root, sctx->right_path, sctx->cmp_key, name, name_len, &found_data, @@ -5025,7 +5019,6 @@ static int __process_changed_new_xattr(int num, struct btrfs_key *di_key, } } - kfree(found_data); return ret; } @@ -5762,7 +5755,7 @@ static int send_capabilities(struct send_ctx *sctx) struct btrfs_dir_item *di; struct extent_buffer *leaf; unsigned long data_ptr; - char *buf = NULL; + char AUTO_KFREE(buf); int buf_len; int ret = 0; @@ -5774,28 +5767,23 @@ static int send_capabilities(struct send_ctx *sctx) XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); if (!di) { /* There is no xattr for this inode */ - goto out; + return 0; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) { - ret = -ENOMEM; - goto out; - } + if (!buf) + return -ENOMEM; data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); -out: - kfree(buf); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e606e11d3f57..7e4cfae63bcc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1614,7 +1614,7 @@ static inline void btrfs_descending_sort_devices( static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 *free_bytes) { - struct btrfs_device_info *devices_info; + struct btrfs_device_info AUTO_KFREE(devices_info); struct btrfs_fs_devices *fs_devices = 
fs_info->fs_devices; struct btrfs_device *device; u64 type; @@ -1712,7 +1712,6 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, nr_devices--; } - kfree(devices_info); *free_bytes = avail_space; return 0; } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index b19328d077d3..a0187d6163df 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -505,7 +505,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { struct btrfs_fs_info *fs_info; - unsigned long *bitmap = NULL; + unsigned long AUTO_KFREE(bitmap); struct extent_buffer *eb = NULL; int ret; @@ -551,7 +551,6 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); - kfree(bitmap); btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 42af6c737c6e..0b9f25dd1a68 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -1013,7 +1013,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { struct btrfs_chunk_map *map; - u64 *logical = NULL; + u64 AUTO_KFREE(logical); int i, out_ndaddrs, out_stripe_len; int ret; @@ -1046,7 +1046,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, if (ret) { test_err("error adding chunk map to mapping tree"); btrfs_free_chunk_map(map); - goto out_free; + return ret; } ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), @@ -1079,8 +1079,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: btrfs_remove_chunk_map(fs_info, map); -out_free: - kfree(logical); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 030d0fef97bd..bec93a0a6756 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4015,7 +4015,7 @@ 
static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int count) { struct btrfs_root *log = inode->root->log_root; - char *ins_data = NULL; + char AUTO_KFREE(ins_data); struct btrfs_item_batch batch; struct extent_buffer *dst; unsigned long src_offset; @@ -4060,7 +4060,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst = dst_path->nodes[0]; /* @@ -4092,8 +4092,6 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, if (btrfs_get_first_dir_index_to_log(inode) == 0) btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); -out: - kfree(ins_data); return ret; } @@ -4760,7 +4758,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_key *ins_keys; u32 *ins_sizes; struct btrfs_item_batch batch; - char *ins_data; + char AUTO_KFREE(ins_data); int dst_index; const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM); const u64 i_size = i_size_read(&inode->vfs_inode); @@ -4888,7 +4886,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, disk_bytenr + extent_num_bytes - 1, &ordered_sums, false); if (ret < 0) - goto out; + return ret; ret = 0; list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) { @@ -4898,7 +4896,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, kfree(sums); } if (ret) - goto out; + return ret; add_to_batch: ins_sizes[dst_index] = btrfs_item_size(src, src_slot); @@ -4912,11 +4910,11 @@ add_to_batch: * so we don't need to do anything. 
*/ if (batch.nr == 0) - goto out; + return 0; ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) - goto out; + return ret; dst_index = 0; for (int i = 0; i < nr; i++) { @@ -4969,8 +4967,6 @@ copy_item: } btrfs_release_path(dst_path); -out: - kfree(ins_data); return ret; } @@ -5689,9 +5685,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, struct btrfs_inode *inode, u64 *other_ino, u64 *other_parent) { - int ret; BTRFS_PATH_AUTO_FREE(search_path); - char *name = NULL; + char AUTO_KFREE(name); u32 name_len = 0; u32 item_size = btrfs_item_size(eb, slot); u32 cur_offset = 0; @@ -5734,10 +5729,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, char *new_name; new_name = krealloc(name, this_name_len, GFP_NOFS); - if (!new_name) { - ret = -ENOMEM; - goto out; - } + if (!new_name) + return -ENOMEM; name_len = this_name_len; name = new_name; } @@ -5755,28 +5748,24 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, di, &di_key); if (di_key.type == BTRFS_INODE_ITEM_KEY) { if (di_key.objectid != key->objectid) { - ret = 1; *other_ino = di_key.objectid; *other_parent = parent; + return 1; } else { - ret = 0; + return 0; } } else { - ret = -EAGAIN; + return -EAGAIN; } - goto out; } else if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; + return PTR_ERR(di); } btrfs_release_path(search_path); cur_offset += this_len; } - ret = 0; -out: - kfree(name); - return ret; + + return 0; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 45d89b12025b..75a34ed95c74 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -739,7 +739,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; struct path new = { .mnt = NULL, .dentry = NULL }; - char *old_path = NULL; + char AUTO_KFREE(old_path); bool is_same = false; int ret; @@ -765,7 +765,6 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path) 
if (path_equal(&old, &new)) is_same = true; out: - kfree(old_path); path_put(&old); path_put(&new); return is_same; @@ -4384,7 +4383,7 @@ static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) { u32 size_buf = 1024; char tmp_buf[192] = {'\0'}; - char *buf; + char AUTO_KFREE(buf); char *bp; u32 size_bp = size_buf; int ret; @@ -4432,8 +4431,6 @@ out_overflow: btrfs_info(fs_info, "balance: %s %s", (bctl->flags & BTRFS_BALANCE_RESUME) ? "resume" : "start", buf); - - kfree(buf); } /* @@ -5562,9 +5559,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct btrfs_device_info *devices_info = NULL; + struct btrfs_device_info AUTO_KFREE(devices_info); struct alloc_chunk_ctl ctl; - struct btrfs_block_group *block_group; int ret; lockdep_assert_held(&info->chunk_mutex); @@ -5597,22 +5593,14 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); ret = gather_device_info(fs_devices, &ctl, devices_info); - if (ret < 0) { - block_group = ERR_PTR(ret); - goto out; - } + if (ret < 0) + return ERR_PTR(ret); ret = decide_stripe_size(fs_devices, &ctl, devices_info); - if (ret < 0) { - block_group = ERR_PTR(ret); - goto out; - } + if (ret < 0) + return ERR_PTR(ret); - block_group = create_chunk(trans, &ctl, devices_info); - -out: - kfree(devices_info); - return block_group; + return create_chunk(trans, &ctl, devices_info); } /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9b8c9894a1de..a8abe8e1576e 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1628,7 +1628,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; - struct zone_info *zone_info = NULL; + struct zone_info AUTO_KFREE(zone_info); int ret; int i; unsigned long *active = NULL; @@ -1782,7 
+1782,6 @@ out: cache->physical_map = NULL; } bitmap_free(active); - kfree(zone_info); return ret; } From 252877a8701530fde861a4f27710c1e718e97caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Fri, 24 Oct 2025 12:21:43 +0200 Subject: [PATCH 080/147] btrfs: add ASSERTs on prealloc in qgroup functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prealloc variable in these functions is always initialized to NULL. Whenever we allocate memory for it, if it fails then NULL is preserved, otherwise we delegate the ownership of the pointer to add_qgroup_rb() and set it right after to NULL. Since in any case the pointer ends up being NULL at the end of its usage, we can safely remove calls to kfree() for it, while adding an ASSERT as an extra check. Signed-off-by: Miquel Sabaté Solà Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 877a65e1794f..1956e4bf2302 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1263,7 +1263,14 @@ out: btrfs_end_transaction(trans); else if (trans) ret = btrfs_end_transaction(trans); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); + return ret; } @@ -1695,7 +1702,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); - kfree(prealloc); + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. 
+ */ + ASSERT(prealloc == NULL); return ret; } @@ -3303,7 +3315,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; - struct btrfs_qgroup *prealloc; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_qgroup_list **qlist_prealloc = NULL; bool free_inherit = false; bool need_rescan = false; @@ -3544,7 +3556,14 @@ out: } if (free_inherit) kfree(inherit); - kfree(prealloc); + + /* + * At this point we either failed at allocating prealloc, or we + * succeeded and passed the ownership to it to add_qgroup_rb(). In any + * case, this needs to be NULL or there is something wrong. + */ + ASSERT(prealloc == NULL); + return ret; } From 51070655e7d5749f9515e7a6ca1d5f49d1a76d81 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 11:19:59 +0200 Subject: [PATCH 081/147] btrfs: zoned: show statistics for zoned filesystems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide statistics for zoned filesystems. These statistics include, the number of active block-groups, how many of them are reclaimable or unused, if the filesystem needs to be reclaimed, the currently assigned relocation and treelog block-groups if they're present and a list of active zones. 
Example: active block-groups: 4   reclaimable: 0   unused: 2   need reclaim: false data relocation block-group: 4294967296 active zones:   start: 1610612736, wp: 344064 used: 16384, reserved: 0, unusable: 327680   start: 1879048192, wp: 34963456 used: 131072, reserved: 0, unusable: 34832384   start: 4026531840, wp: 0 used: 0, reserved: 0, unusable: 0   start: 4294967296, wp: 0 used: 0, reserved: 0, unusable: 0 Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d66681ce2b3d..1f64c132b387 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "messages.h" #include "ctree.h" @@ -25,6 +26,7 @@ #include "misc.h" #include "fs.h" #include "accessors.h" +#include "zoned.h" /* * Structure name Path @@ -1187,6 +1189,56 @@ static ssize_t btrfs_commit_stats_store(struct kobject *kobj, } BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); +static ssize_t btrfs_zoned_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + struct btrfs_block_group *bg; + size_t ret = 0; + + + if (!btrfs_is_zoned(fs_info)) + return ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active block-groups: %zu\n", + list_count_nodes(&fs_info->zone_active_bgs)); + spin_unlock(&fs_info->zone_active_bgs_lock); + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + ret += sysfs_emit_at(buf, ret, "\treclaimable: %zu\n", + list_count_nodes(&fs_info->reclaim_bgs)); + ret += sysfs_emit_at(buf, ret, "\tunused: %zu\n", + list_count_nodes(&fs_info->unused_bgs)); + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + + ret += sysfs_emit_at(buf, ret, 
"\tneed reclaim: %s\n", + str_true_false(btrfs_zoned_should_reclaim(fs_info))); + + if (fs_info->data_reloc_bg) + ret += sysfs_emit_at(buf, ret, + "data relocation block-group: %llu\n", + fs_info->data_reloc_bg); + if (fs_info->treelog_bg) + ret += sysfs_emit_at(buf, ret, + "tree-log block-group: %llu\n", + fs_info->treelog_bg); + + spin_lock(&fs_info->zone_active_bgs_lock); + ret += sysfs_emit_at(buf, ret, "active zones:\n"); + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { + ret += sysfs_emit_at(buf, ret, + "\tstart: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu\n", + bg->start, bg->alloc_offset, bg->used, + bg->reserved, bg->zone_unusable); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + return ret; +} +BTRFS_ATTR(, zoned_stats, btrfs_zoned_stats_show); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1599,6 +1651,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), BTRFS_ATTR_PTR(, temp_fsid), + BTRFS_ATTR_PTR(, zoned_stats), #ifdef CONFIG_BTRFS_EXPERIMENTAL BTRFS_ATTR_PTR(, offload_csum), #endif From c913649c1b0260a8a992773aa6a49189059f65a5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 Oct 2025 08:55:09 +0100 Subject: [PATCH 082/147] btrfs: replace const_ilog2() with ilog2() const_ilog2() was a workaround of some sparse issue, which has never appeared in the C functions. Replace it with ilog2(). 
Signed-off-by: Andy Shevchenko Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 5 ++--- fs/btrfs/zoned.c | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index adbd9e6c09ff..34b854c1a303 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -45,7 +45,7 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN_SHIFT (16) #define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) -static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); +static_assert(ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); /* Used by sanity check for btrfs_raid_types. */ #define const_ffs(n) (__builtin_ctzll(n) + 1) @@ -58,8 +58,7 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); */ static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); -static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > - ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); +static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); /* ilog2() can handle both constants and variables */ #define BTRFS_BG_FLAG_TO_INDEX(profile) \ diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index a8abe8e1576e..9b2af6210867 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -37,8 +37,8 @@ #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) -#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) -#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) +#define BTRFS_SB_LOG_FIRST_SHIFT ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT ilog2(BTRFS_SB_LOG_SECOND_OFFSET) /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 From afc04c8b1bb5552e6f7e05b4fe02ebc451fe66ff Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 27 Oct 2025 18:58:47 +1030 Subject: [PATCH 083/147] btrfs: replace BTRFS_MAX_BIO_SECTORS 
with BIO_MAX_VECS It's impossible to have a btrfs bio with more than BIO_MAX_VECS vectors anyway. And there is only one location utilizing that macro, just replace it with BIO_MAX_VECS. Both have the same value. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.h | 7 ------- fs/btrfs/direct-io.c | 2 +- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 00883aea55d7..3cc0fe23898f 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -18,13 +18,6 @@ struct btrfs_inode; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); /* diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 802d4dbe5b38..db0191567b8d 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -385,7 +385,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to allocate a contiguous array for the checksums. */ if (!write) - len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); + len = min_t(u64, len, fs_info->sectorsize * BIO_MAX_VECS); lockstart = start; lockend = start + len - 1; From c5667f9c8eb90293dfa4e52c65eb89fe39f5652d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 28 Oct 2025 10:06:36 +1030 Subject: [PATCH 084/147] btrfs: headers cleanup to remove unnecessary local includes [BUG] When I tried to remove btrfs_bio::fs_info and use btrfs_bio::inode to grab the fs_info, the header "btrfs_inode.h" is needed to access the full btrfs_inode structure. Then btrfs will fail to compile. [CAUSE] There is a recursive including chain: "bio.h" -> "btrfs_inode.h" -> "extent_map.h" -> "compression.h" -> "bio.h" That recursive including is causing problems for btrfs. 
[ENHANCEMENT] To reduce the risk of recursive including: - Remove unnecessary local includes from btrfs headers Either the included header is pulled in by other headers, or is completely unnecessary. - Remove btrfs local includes if the header only requires a pointer In that case let the implementing C file to pull the required header. This is especially important for headers like "btrfs_inode.h" which pulls in a lot of other btrfs headers, thus it's a mine field of recursive including. - Remove unnecessary temporary structure definition Either if we have included the header defining the structure, or completely unused. Now including "btrfs_inode.h" inside "bio.h" is completely fine, although "btrfs_inode.h" still includes "extent_map.h", but that header only includes "fs.h", no more chain back to "bio.h". Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 1 + fs/btrfs/btrfs_inode.h | 8 ++++---- fs/btrfs/compression.h | 3 --- fs/btrfs/ctree.h | 2 -- fs/btrfs/defrag.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/direct-io.c | 2 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/disk-io.h | 3 ++- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.h | 1 - fs/btrfs/extent_map.h | 3 +-- fs/btrfs/file-item.h | 2 +- fs/btrfs/inode.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/subpage.h | 1 - fs/btrfs/transaction.c | 2 ++ fs/btrfs/transaction.h | 4 ---- fs/btrfs/tree-log.c | 1 + fs/btrfs/tree-log.h | 3 +-- fs/btrfs/zoned.h | 1 - 21 files changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 99b3ced12805..78721412951c 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include #include #include +#include "fs.h" #include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index af373d50a901..a66ca5531b5c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -18,20 +18,20 @@ #include #include #include 
+#include "ctree.h" #include "block-rsv.h" #include "extent_map.h" -#include "extent_io.h" #include "extent-io-tree.h" -#include "ordered-data.h" -#include "delayed-inode.h" -struct extent_state; struct posix_acl; struct iov_iter; struct writeback_control; struct btrfs_root; struct btrfs_fs_info; struct btrfs_trans_handle; +struct btrfs_bio; +struct btrfs_file_extent; +struct btrfs_delayed_node; /* * Since we search a directory based on f_pos (struct dir_context::pos) we have diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index eba188a9e3bb..c6812d5fcab7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -14,14 +14,11 @@ #include #include "bio.h" #include "fs.h" -#include "messages.h" struct address_space; -struct page; struct inode; struct btrfs_inode; struct btrfs_ordered_extent; -struct btrfs_bio; /* * We want to make sure that amount of RAM required to uncompress an extent is diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe70b593c7cd..16dd11c48531 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,9 +17,7 @@ #include #include #include "locking.h" -#include "fs.h" #include "accessors.h" -#include "extent-io-tree.h" struct extent_buffer; struct btrfs_block_rsv; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 7b277934f66f..a4cc1bc63562 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "defrag.h" #include "file-item.h" #include "super.h" +#include "compression.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 69863e398e22..77e1bcb2a74b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "accessors.h" #include "dir-item.h" +#include "delayed-inode.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index db0191567b8d..f225cc3fd3a1 100644 --- a/fs/btrfs/direct-io.c +++ 
b/fs/btrfs/direct-io.c @@ -10,6 +10,8 @@ #include "fs.h" #include "transaction.h" #include "volumes.h" +#include "bio.h" +#include "ordered-data.h" struct btrfs_dio_data { ssize_t submitted; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0aa7e5d1b05f..46b715f3447b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "delayed-inode.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 57920f2c6fe4..5320da83d0cf 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -9,7 +9,8 @@ #include #include #include "ctree.h" -#include "fs.h" +#include "bio.h" +#include "ordered-data.h" struct block_device; struct super_block; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d7e5be81ea81..86004b8daa96 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "orphan.h" #include "tree-checker.h" #include "raid-stripe-tree.h" +#include "delayed-inode.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5fcbfe44218c..02ebb2f238af 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -12,7 +12,6 @@ #include #include #include -#include "compression.h" #include "messages.h" #include "ulist.h" #include "misc.h" diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index d4b81ee4d97b..6f685f3c9327 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -8,8 +8,7 @@ #include #include #include -#include "misc.h" -#include "compression.h" +#include "fs.h" struct btrfs_inode; struct btrfs_fs_info; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 63216c43676d..0d59e830018a 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -7,7 +7,7 @@ #include #include #include "ctree.h" -#include "accessors.h" +#include "ordered-data.h" struct extent_map; 
struct btrfs_file_extent_item; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7958e6c4f6b3..3c8bcdcf525e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ #include "backref.h" #include "raid-stripe-tree.h" #include "fiemap.h" +#include "delayed-inode.h" #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) #define COW_FILE_RANGE_NO_INLINE (1UL << 1) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c3e1831a48a3..4ae6928fdca4 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -15,6 +15,7 @@ #include "accessors.h" #include "extent-tree.h" #include "zoned.h" +#include "delayed-inode.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index ad0552db7c7d..d81a0ade559f 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -7,7 +7,6 @@ #include #include #include "btrfs_inode.h" -#include "fs.h" struct address_space; struct folio; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 907f2d047b44..03c62fd1a091 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,8 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "ordered-data.h" +#include "delayed-inode.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 9f7c777af635..18ef069197e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,10 +14,6 @@ #include #include "btrfs_inode.h" #include "delayed-ref.h" -#include "extent-io-tree.h" -#include "block-rsv.h" -#include "messages.h" -#include "misc.h" struct dentry; struct inode; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bec93a0a6756..c9eb35fa5b20 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "orphan.h" #include "print-tree.h" #include "tree-checker.h" +#include "delayed-inode.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 
dc313e6bb2fa..4f149d7d4fde 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -8,8 +8,7 @@ #include #include -#include "messages.h" -#include "ctree.h" +#include #include "transaction.h" struct inode; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index d64f7c9255fa..5cefdeb08b7b 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -15,7 +15,6 @@ #include "disk-io.h" #include "block-group.h" #include "btrfs_inode.h" -#include "fs.h" struct block_device; struct extent_buffer; From 81cea6cd7041ebd42281e0517f856d88527d3326 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 29 Oct 2025 08:35:33 +1030 Subject: [PATCH 085/147] btrfs: remove btrfs_bio::fs_info by extracting it from btrfs_bio::inode Currently there is only one caller which doesn't populate btrfs_bio::inode, and that's scrub. The idea is scrub doesn't want any automatic csum verification nor read-repair, as everything will be handled by scrub itself. However that behavior is really no different than metadata inode, thus we can reuse btree_inode as btrfs_bio::inode for scrub. The only exception is in btrfs_submit_chunk() where if a bbio is from scrub or data reloc inode, we set rst_search_commit_root to true. This means we still need a way to distinguish scrub from metadata, but that can be done by a new flag inside btrfs_bio. Now btrfs_bio::inode is a mandatory parameter, we can extract fs_info from that inode thus can remove btrfs_bio::fs_info to save 8 bytes from btrfs_bio structure. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 53 ++++++++++++++++++++++-------------------- fs/btrfs/bio.h | 18 +++++++++----- fs/btrfs/compression.c | 6 ++--- fs/btrfs/compression.h | 3 ++- fs/btrfs/direct-io.c | 4 +--- fs/btrfs/extent_io.c | 22 +++++++----------- fs/btrfs/inode.c | 7 ++---- fs/btrfs/scrub.c | 51 ++++++++++++++++++++++------------------ fs/btrfs/zoned.c | 4 ++-- 9 files changed, 87 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 21df48e6c4fa..b85b6b21b545 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -41,13 +41,17 @@ static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio) * Initialize a btrfs_bio structure. This skips the embedded bio itself as it * is already initialized by the block layer. */ -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { + /* @inode parameter is mandatory. */ + ASSERT(inode); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->fs_info = fs_info; + bbio->inode = inode; bbio->end_io = end_io; bbio->private = private; + bbio->file_offset = file_offset; atomic_set(&bbio->pending_ios, 1); WRITE_ONCE(bbio->status, BLK_STS_OK); } @@ -60,7 +64,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, * a mempool. 
*/ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private) { struct btrfs_bio *bbio; @@ -68,7 +72,7 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, end_io, private); + btrfs_bio_init(bbio, inode, file_offset, end_io, private); return bbio; } @@ -85,9 +89,7 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return ERR_CAST(bio); bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); - bbio->inode = orig_bbio->inode; - bbio->file_offset = orig_bbio->file_offset; + btrfs_bio_init(bbio, orig_bbio->inode, orig_bbio->file_offset, NULL, orig_bbio); orig_bbio->file_offset += map_length; if (bbio_has_ordered_extent(bbio)) { refcount_inc(&orig_bbio->ordered->refs); @@ -244,9 +246,8 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); repair_bbio = btrfs_bio(repair_bio); - btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); - repair_bbio->inode = failed_bbio->inode; - repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, + NULL, fbio); mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); @@ -332,7 +333,7 @@ static void btrfs_simple_end_io(struct bio *bio) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_device *dev = bio->bi_private; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; btrfs_bio_counter_dec(fs_info); @@ -581,10 +582,11 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) static bool 
should_async_write(struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; bool auto_csum_mode = true; #ifdef CONFIG_BTRFS_EXPERIMENTAL - struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) @@ -594,7 +596,7 @@ static bool should_async_write(struct btrfs_bio *bbio) #endif /* Submit synchronously if the checksum implementation is fast. */ - if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags)) + if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return false; /* @@ -605,7 +607,7 @@ static bool should_async_write(struct btrfs_bio *bbio) return false; /* Zoned devices require I/O to be submitted in order. */ - if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info)) + if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(fs_info)) return false; return true; @@ -620,7 +622,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, struct btrfs_io_context *bioc, struct btrfs_io_stripe *smap, int mirror_num) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); @@ -639,11 +641,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; unsigned int nr_segs; int sector_offset; - map_length = min(map_length, bbio->fs_info->max_zone_append_size); - sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + map_length = min(map_length, fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &fs_info->limits, &nr_segs, map_length); if (sector_offset) { /* @@ -651,7 +654,7 @@ static 
u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) * sectorsize and thus cause unaligned I/Os. Fix that by * always rounding down to the nearest boundary. */ - return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); + return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, fs_info->sectorsize); } return map_length; } @@ -659,7 +662,7 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -670,7 +673,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t status; int ret; - if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) + if (bbio->is_scrub || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; else smap.rst_search_commit_root = false; @@ -734,7 +737,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. 
*/ - if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(inode->root)) { if (should_async_write(bbio) && @@ -782,7 +785,7 @@ end_bbio: static void assert_bbio_alignment(struct btrfs_bio *bbio) { #ifdef CONFIG_BTRFS_ASSERT - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio_vec bvec; struct bvec_iter iter; const u32 blocksize = fs_info->sectorsize; @@ -885,16 +888,16 @@ out_counter_dec: */ void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bbio->bio.bi_iter.bi_size; struct btrfs_io_stripe smap = { 0 }; int ret; - ASSERT(fs_info); ASSERT(mirror_num > 0); ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); - ASSERT(!bbio->inode); + ASSERT(!is_data_inode(bbio->inode)); + ASSERT(bbio->is_scrub); btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 3cc0fe23898f..5d20f959e12d 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -27,7 +27,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); struct btrfs_bio { /* * Inode and offset into it that this I/O operates on. - * Only set for data I/O. + * + * If the inode is a data one, csum verification and read-repair + * will be done automatically. + * If the inode is a metadata one, everything is handled by the caller. */ struct btrfs_inode *inode; u64 file_offset; @@ -69,14 +72,17 @@ struct btrfs_bio { atomic_t pending_ios; struct work_struct end_io_work; - /* File system that this I/O operates on. */ - struct btrfs_fs_info *fs_info; - /* Save the first error status of split bio. 
*/ blk_status_t status; /* Use the commit root to look up csums (data read bio only). */ bool csum_search_commit_root; + + /* + * Since scrub will reuse btree inode, we need this flag to distinguish + * scrub bios. + */ + bool is_scrub; /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. @@ -92,10 +98,10 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) int __init btrfs_bioset_init(void); void __cold btrfs_bioset_exit(void); -void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, +void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode, u64 file_offset, btrfs_bio_end_io_t end_io, void *private); void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index bacad18357b3..8c3899832a1a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -67,9 +67,7 @@ static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, GFP_NOFS, &btrfs_compressed_bioset)); - btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); - bbio->inode = inode; - bbio->file_offset = start; + btrfs_bio_init(bbio, inode, start, end_io, NULL); return to_compressed_bio(bbio); } @@ -354,7 +352,7 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { - struct btrfs_fs_info *fs_info = cb->bbio.fs_info; + struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index c6812d5fcab7..062ebd9c2d32 100644 --- 
a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -14,6 +14,7 @@ #include #include "bio.h" #include "fs.h" +#include "btrfs_inode.h" struct address_space; struct inode; @@ -74,7 +75,7 @@ struct compressed_bio { static inline struct btrfs_fs_info *cb_to_fs_info(const struct compressed_bio *cb) { - return cb->bbio.fs_info; + return cb->bbio.inode->root->fs_info; } /* @range_end must be exclusive. */ diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index f225cc3fd3a1..962fccceffd6 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -715,10 +715,8 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; - btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, + btrfs_bio_init(bbio, BTRFS_I(iter->inode), file_offset, btrfs_dio_end_io, bio->bi_private); - bbio->inode = BTRFS_I(iter->inode); - bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cb680cdeb77d..b25a2b45047e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -517,7 +517,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le */ static void end_bbio_data_write(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; @@ -573,7 +573,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) */ static void end_bbio_data_read(struct btrfs_bio *bbio) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; @@ -738,12 +738,10 @@ static void alloc_new_bio(struct btrfs_inode *inode, struct btrfs_fs_info 
*fs_info = inode->root->fs_info; struct btrfs_bio *bbio; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, - bio_ctrl->end_io_func, NULL); + bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, inode, + file_offset, bio_ctrl->end_io_func, NULL); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->bio.bi_write_hint = inode->vfs_inode.i_write_hint; - bbio->inode = inode; - bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->next_file_offset = file_offset; @@ -2224,12 +2222,11 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, end_bbio_meta_write, eb); + BTRFS_I(fs_info->btree_inode), eb->start, + end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - bbio->file_offset = eb->start; for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 range_start = max_t(u64, eb->start, folio_pos(folio)); @@ -3844,6 +3841,7 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { + struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_bio *bbio; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) @@ -3877,11 +3875,9 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, refcount_inc(&eb->refs); bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, - REQ_OP_READ | REQ_META, eb->fs_info, - end_bbio_meta_read, eb); + REQ_OP_READ | REQ_META, BTRFS_I(fs_info->btree_inode), + eb->start, end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; - bbio->inode = BTRFS_I(eb->fs_info->btree_inode); - 
bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3c8bcdcf525e..a2e8d52a2a87 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9404,7 +9404,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 disk_bytenr, u64 disk_io_size, struct page **pages, void *uring_ctx) { - struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private *priv, sync_priv; struct completion sync_reads; unsigned long i = 0; @@ -9429,10 +9428,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, priv->status = 0; priv->uring_ctx = uring_ctx; - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; do { size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); @@ -9441,10 +9439,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); - bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, + bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, inode, 0, btrfs_encoded_read_endio, priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; - bbio->inode = inode; continue; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 33c9cb91f0a9..3dbb02dbfffb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -927,10 +927,11 @@ static int calc_next_mirror(int mirror, int num_copies) static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, int sector_nr) { + struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); int ret; - ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + ret 
= bio_add_page(&bbio->bio, virt_to_page(kaddr), fs_info->sectorsize, offset_in_page(kaddr)); /* * Caller should ensure the bbio has enough size. @@ -940,7 +941,21 @@ static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *st * to create the minimal amount of bio vectors, for fs block size < page * size cases. */ - ASSERT(ret == bbio->fs_info->sectorsize); + ASSERT(ret == fs_info->sectorsize); +} + +static struct btrfs_bio *alloc_scrub_bbio(struct btrfs_fs_info *fs_info, + unsigned int nr_vecs, blk_opf_t opf, + u64 logical, + btrfs_bio_end_io_t end_io, void *private) +{ + struct btrfs_bio *bbio; + + bbio = btrfs_bio_alloc(nr_vecs, opf, BTRFS_I(fs_info->btree_inode), + logical, end_io, private); + bbio->is_scrub = true; + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + return bbio; } static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, @@ -966,12 +981,10 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_repair_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + stripe->logical + (i << fs_info->sectorsize_bits), + scrub_repair_read_endio, stripe); scrub_bio_add_sector(bbio, stripe, i); } @@ -1350,13 +1363,10 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); bbio = NULL; } - if (!bbio) { - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, - fs_info, scrub_write_endio, stripe); - bbio->bio.bi_iter.bi_sector = (stripe->logical + - (sector_nr << fs_info->sectorsize_bits)) >> - SECTOR_SHIFT; - } + if (!bbio) + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_WRITE, + stripe->logical + (sector_nr << fs_info->sectorsize_bits), + 
scrub_write_endio, stripe); scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) @@ -1847,9 +1857,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) continue; } - bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, - fs_info, scrub_read_endio, stripe); - bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, stripe->nr_sectors, REQ_OP_READ, + logical, scrub_read_endio, stripe); } scrub_bio_add_sector(bbio, stripe, i); @@ -1886,10 +1895,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, return; } - bbio = btrfs_bio_alloc(BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, fs_info, - scrub_read_endio, stripe); - - bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; + bbio = alloc_scrub_bbio(fs_info, BTRFS_STRIPE_LEN >> min_folio_shift, REQ_OP_READ, + stripe->logical, scrub_read_endio, stripe); /* Read the whole range inside the chunk boundary. */ for (unsigned int cur = 0; cur < nr_sectors; cur++) scrub_bio_add_sector(bbio, stripe, cur); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 9b2af6210867..41a4a7d50bd3 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1808,14 +1808,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); struct btrfs_inode *inode = bbio->inode; - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; bool ret = false; if (!btrfs_is_zoned(fs_info)) return false; - if (!inode || !is_data_inode(inode)) + if (!is_data_inode(inode)) return false; if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) From 4591c3ef751d861d7dd95ff4d2aadb1b5e95854e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 23 Oct 2025 15:19:16 +1030 Subject: [PATCH 086/147] btrfs: make sure all btrfs_bio::end_io are called in task context [BACKGROUND] Btrfs has a lot of different bi_end_io functions, to handle different raid profiles. 
But they introduced a lot of different contexts for btrfs_bio::end_io() calls: - Simple read bios Run in task context, backed by either endio_meta_workers or endio_workers. - Simple write bios Run in IRQ context. - RAID56 write or rebuild bios Run in task context, backed by rmw_workers. - Mirrored write bios Run in irq context. This is inconsistent, and contributes to the number of workqueues used in btrfs. [ENHANCEMENT] Make all the above bios call their btrfs_bio::end_io() in task context, backed by either endio_meta_workers for metadata, or endio_workers for data. For simple write bios, merge the handling into simple_end_io_work(). For mirrored write bios, it will be a little more complex, since both the original or the cloned bios can run the final btrfs_bio::end_io(). Here we make sure the cloned bios are using btrfs_bioset, to reuse the end_io_work, and run both original and cloned work inside the workqueue. Add extra ASSERT()s to make sure btrfs_bio_end_io() is running in task context. This not only unifies the context for btrfs_bio::end_io() functions, but also opens a new door for further btrfs_bio::end_io() related cleanups. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 64 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index b85b6b21b545..52b8893f26f1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -102,6 +102,9 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { + /* Make sure we're already in task context. 
*/ + ASSERT(in_task()); + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -318,15 +321,20 @@ static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_i return fs_info->endio_workers; } -static void btrfs_end_bio_work(struct work_struct *work) +static void simple_end_io_work(struct work_struct *work) { struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; - /* Metadata reads are checked and repaired by the submitter. */ - if (is_data_bbio(bbio)) - btrfs_check_read_bio(bbio, bbio->bio.bi_private); - else - btrfs_bio_end_io(bbio, bbio->bio.bi_status); + if (bio_op(bio) == REQ_OP_READ) { + /* Metadata reads are checked and repaired by the submitter. */ + if (is_data_bbio(bbio)) + return btrfs_check_read_bio(bbio, bbio->bio.bi_private); + return btrfs_bio_end_io(bbio, bbio->bio.bi_status); + } + if (bio_is_zone_append(bio) && !bio->bi_status) + btrfs_record_physical_zoned(bbio); + btrfs_bio_end_io(bbio, bbio->bio.bi_status); } static void btrfs_simple_end_io(struct bio *bio) @@ -340,14 +348,8 @@ static void btrfs_simple_end_io(struct bio *bio) if (bio->bi_status) btrfs_log_dev_io_error(bio, dev); - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - if (bio_is_zone_append(bio) && !bio->bi_status) - btrfs_record_physical_zoned(bbio); - btrfs_bio_end_io(bbio, bbio->bio.bi_status); - } + INIT_WORK(&bbio->end_io_work, simple_end_io_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); } static void btrfs_raid56_end_io(struct bio *bio) @@ -355,6 +357,9 @@ static void btrfs_raid56_end_io(struct bio *bio) struct btrfs_io_context *bioc = bio->bi_private; struct btrfs_bio *bbio = btrfs_bio(bio); + /* RAID56 endio is always handled in workqueue. 
*/ + ASSERT(in_task()); + btrfs_bio_counter_dec(bioc->fs_info); bbio->mirror_num = bioc->mirror_num; if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) @@ -365,11 +370,12 @@ static void btrfs_raid56_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_orig_write_end_io(struct bio *bio) +static void orig_write_end_io_work(struct work_struct *work) { + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); btrfs_bio_counter_dec(bioc->fs_info); @@ -394,8 +400,18 @@ static void btrfs_orig_write_end_io(struct bio *bio) btrfs_put_bioc(bioc); } -static void btrfs_clone_write_end_io(struct bio *bio) +static void btrfs_orig_write_end_io(struct bio *bio) { + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, orig_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + +static void clone_write_end_io_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + struct bio *bio = &bbio->bio; struct btrfs_io_stripe *stripe = bio->bi_private; if (bio->bi_status) { @@ -410,6 +426,14 @@ static void btrfs_clone_write_end_io(struct bio *bio) bio_put(bio); } +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_bio *bbio = btrfs_bio(bio); + + INIT_WORK(&bbio->end_io_work, clone_write_end_io_work); + queue_work(btrfs_end_io_wq(bbio->inode->root->fs_info, bio), &bbio->end_io_work); +} + static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) { if (!dev || !dev->bdev || @@ -456,6 +480,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) { struct bio *orig_bio = bioc->orig_bio, *bio; + struct btrfs_bio 
*orig_bbio = btrfs_bio(orig_bio); ASSERT(bio_op(orig_bio) != REQ_OP_READ); @@ -464,8 +489,11 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio = orig_bio; bio->bi_end_io = btrfs_orig_write_end_io; } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + /* We need to use endio_work to run end_io in task context. */ + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &btrfs_bioset); bio_inc_remaining(orig_bio); + btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, + orig_bbio->file_offset, NULL, NULL); bio->bi_end_io = btrfs_clone_write_end_io; } From 4bbdce84175db7ff0dfaa82e960c7488c6cb0bcf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 23 Oct 2025 18:32:34 +1030 Subject: [PATCH 087/147] btrfs: remove btrfs_fs_info::compressed_write_workers The reason why end_bbio_compressed_write() queues a work item into the compressed_write_workers wq is the end_compressed_writeback() call, as it will grab all the involved folios and clear the writeback flags, which may sleep. However now we always run btrfs_bio::end_io() in task context, there is no need to queue the work anymore. Just remove btrfs_fs_info::compressed_write_workers and compressed_bio::write_end_work. There is a comment about the work items queued into compressed_write_workers; now change it to flush the endio wq instead, which is responsible for handling all data endio functions. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 27 ++++++++------------------- fs/btrfs/compression.h | 7 ++----- fs/btrfs/disk-io.c | 9 ++------- fs/btrfs/fs.h | 1 - 4 files changed, 12 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8c3899832a1a..cdb5c891b0ac 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -319,22 +319,6 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) /* the inode may be gone now */ } -static void btrfs_finish_compressed_write_work(struct work_struct *work) -{ - struct compressed_bio *cb = - container_of(work, struct compressed_bio, write_end_work); - - btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, - cb->bbio.bio.bi_status == BLK_STS_OK); - - if (cb->writeback) - end_compressed_writeback(cb); - /* Note, our inode could be gone now */ - - btrfs_free_compressed_folios(cb); - bio_put(&cb->bbio.bio); -} - /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -345,9 +329,15 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) static void end_bbio_compressed_write(struct btrfs_bio *bbio) { struct compressed_bio *cb = to_compressed_bio(bbio); - struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; - queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) + end_compressed_writeback(cb); + /* Note, our inode could be gone now. 
*/ + btrfs_free_compressed_folios(cb); + bio_put(&cb->bbio.bio); } static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) @@ -400,7 +390,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->compressed_folios = compressed_folios; cb->compressed_len = ordered->disk_num_bytes; cb->writeback = writeback; - INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_folios = nr_folios; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 062ebd9c2d32..40aa49fed18c 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -63,11 +63,8 @@ struct compressed_bio { /* Whether this is a write for writeback. */ bool writeback; - union { - /* For reads, this is the bio we are copying the data into */ - struct btrfs_bio *orig_bbio; - struct work_struct write_end_work; - }; + /* For reads, this is the bio we are copying the data into. */ + struct btrfs_bio *orig_bbio; /* Must be last. 
*/ struct btrfs_bio bbio; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 46b715f3447b..6a1fa3b08b3f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1774,8 +1774,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) destroy_workqueue(fs_info->endio_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); - if (fs_info->compressed_write_workers) - destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -1987,8 +1985,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); - fs_info->compressed_write_workers = - alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2004,7 +2000,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -4291,7 +4286,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* * When finishing a compressed write bio we schedule a work queue item - * to finish an ordered extent - btrfs_finish_compressed_write_work() + * to finish an ordered extent - end_bbio_compressed_write() * calls btrfs_finish_ordered_extent() which in turns does a call to * btrfs_queue_ordered_fn(), and that queues the ordered extent * completion either in the endio_write_workers work queue or in the @@ -4299,7 +4294,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * below, 
so before we flush them we must flush this queue for the * workers of compressed writes. */ - flush_workqueue(fs_info->compressed_write_workers); + flush_workqueue(fs_info->endio_workers); /* * After we parked the cleaner kthread, ordered extents may have diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 522152904b8f..e7cd4490736f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -654,7 +654,6 @@ struct btrfs_fs_info { struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; From 39bc80216a3656d54d65cdda994f406aeb27c3da Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Oct 2025 08:32:41 +1030 Subject: [PATCH 088/147] btrfs: relax btrfs_inode::ordered_tree_lock IRQ locking context We used IRQ version of spinlock for ordered_tree_lock, as btrfs_finish_ordered_extent() can be called in end_bbio_data_write() which was in IRQ context. However since we're moving all the btrfs_bio::end_io() calls into task context, there is no more need to support IRQ context thus we can relax to regular spin_lock()/spin_unlock() for btrfs_inode::ordered_tree_lock. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++-- fs/btrfs/inode.c | 4 +-- fs/btrfs/ordered-data.c | 57 ++++++++++++++++++----------------------- fs/btrfs/tree-log.c | 4 +-- 4 files changed, 31 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b25a2b45047e..2d32dfc34ae3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1726,7 +1726,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, if (cur >= i_size) { struct btrfs_ordered_extent *ordered; - unsigned long flags; ordered = btrfs_lookup_first_ordered_range(inode, cur, folio_end - cur); @@ -1735,11 +1734,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, * there must be an ordered extent. */ ASSERT(ordered != NULL); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); btrfs_put_ordered_extent(ordered); btrfs_mark_ordered_io_finished(inode, folio, cur, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2e8d52a2a87..8737914e8552 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7589,11 +7589,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all diff --git a/fs/btrfs/ordered-data.c 
b/fs/btrfs/ordered-data.c index dfda952dcf7b..a421f7db9eec 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -237,14 +237,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. */ refcount_inc(&entry->refs); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = tree_insert(&inode->ordered_tree, entry->file_offset, &entry->rb_node); if (unlikely(node)) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -328,9 +328,9 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, { struct btrfs_inode *inode = entry->inode; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } void btrfs_mark_ordered_extent_error(struct btrfs_ordered_extent *ordered) @@ -417,15 +417,14 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, bool uptodate) { struct btrfs_inode *inode = ordered->inode; - unsigned long flags; bool ret; trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); ret = can_finish_ordered_extent(ordered, folio, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); /* * If this is a COW write it means we created new extent maps for the @@ -481,13 +480,12 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; u64 cur = file_offset; const u64 end = file_offset + num_bytes; 
trace_btrfs_writepage_end_io_hook(inode, file_offset, end - 1, uptodate); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); while (cur < end) { u64 entry_end; u64 this_end; @@ -539,13 +537,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, ASSERT(len < U32_MAX); if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); } cur += len; } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -571,10 +569,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; bool finished = false; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); if (cached && *cached) { entry = *cached; goto have_entry; @@ -611,7 +608,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return finished; } @@ -676,7 +673,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - spin_lock_irq(&btrfs_inode->ordered_tree_lock); + spin_lock(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); @@ -684,7 +681,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&btrfs_inode->ordered_tree_lock); + 
spin_unlock(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -969,9 +966,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -984,7 +980,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -997,7 +993,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) { node = ordered_tree_search(inode, file_offset + len); @@ -1024,7 +1020,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1039,7 +1035,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, btrfs_assert_inode_locked(inode); - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; @@ -1053,7 +1049,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } /* @@ -1066,7 +1062,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) 
struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -1075,7 +1071,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1096,9 +1092,8 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - unsigned long flags; - spin_lock_irqsave(&inode->ordered_tree_lock, flags); + spin_lock(&inode->ordered_tree_lock); node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last @@ -1153,7 +1148,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); + spin_unlock(&inode->ordered_tree_lock); return entry; } @@ -1285,9 +1280,7 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( /* * Take the root's ordered_extent_lock to avoid a race with * btrfs_wait_ordered_extents() when updating the disk_bytenr and - * disk_num_bytes fields of the ordered extent below. And we disable - * IRQs because the inode's ordered_tree_lock is used in IRQ context - * elsewhere. + * disk_num_bytes fields of the ordered extent below. 
* * There's no concern about a previous caller of * btrfs_wait_ordered_extents() getting the trimmed ordered extent diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c9eb35fa5b20..8e41fb906c6e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5406,12 +5406,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree_lock); + spin_lock(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree_lock); + spin_unlock(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } From dd57c78aec398717a2fa6488d87b1a6cd43c7d0d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Oct 2025 15:08:34 +1030 Subject: [PATCH 089/147] btrfs: introduce btrfs_bio::async_csum [ENHANCEMENT] Btrfs currently calculates data checksums then submits the bio. But after commit 968f19c5b1b7 ("btrfs: always fallback to buffered write if the inode requires checksum"), any writes with data checksum will fallback to buffered IO, meaning the content will not change during writeback. This means we're safe to calculate the data checksum and submit the bio in parallel, and only need the following new behavior: - Wait for the csum generation to finish before calling btrfs_bio::end_io() Otherwise this can lead to a use-after-free for the csum generation worker. - Save the current bi_iter for csum_one_bio() As the submission part can advance btrfs_bio::bio.bi_iter, if not saved, csum_one_bio() may get an empty bi_iter and not generate any checksums. Unfortunately this means we have to increase the size of btrfs_bio by 16 bytes, but this is still acceptable. As usual, such a new feature is hidden behind the experimental flag. 
[THEORETICAL ANALYSIS] Consider the following theoretical hardware performance, which should be more or less close to modern mainstream hardware: Memory bandwidth: 50GiB/s CRC32C bandwidth: 45GiB/s SSD bandwidth: 8GiB/s Then write bandwidth with data checksum before the patch is: 1 / ( 1 / 50 + 1 / 45 + 1 / 8) = 5.98 GiB/s After the patch, the bandwidth is: 1 / ( 1 / 50 + max( 1 / 45 , 1 / 8)) = 6.90 GiB/s The difference is a 15.32% improvement. [REAL WORLD BENCHMARK] I'm using a Zen5 (HX 370) as the host, the VM has 4GiB memory, 10 vCPUs, the storage is backed by a PCIe gen3 x4 NVMe. The test is a direct IO write, with 1MiB block size, write 7GiB data into a btrfs mount with data checksum. Thus the direct write will fall back to a buffered one: Vanilla Datasum: 1619.97 MiB/s Patched Datasum: 1792.26 MiB/s Diff +10.6 % In my case, the bottleneck is the storage, thus the improvement is not reaching the theoretical one, but still some observable improvement. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 21 +++++++++++---- fs/btrfs/bio.h | 7 +++++ fs/btrfs/file-item.c | 64 +++++++++++++++++++++++++++++++------------- fs/btrfs/file-item.h | 2 +- 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 52b8893f26f1..1286c1ac1940 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -105,6 +105,9 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Make sure we're already in task context. 
*/ ASSERT(in_task()); + if (bbio->async_csum) + wait_for_completion(&bbio->csum_done); + bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; @@ -538,7 +541,11 @@ static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); - return btrfs_csum_one_bio(bbio); +#ifdef CONFIG_BTRFS_EXPERIMENTAL + return btrfs_csum_one_bio(bbio, true); +#else + return btrfs_csum_one_bio(bbio, false); +#endif } /* @@ -617,10 +624,14 @@ static bool should_async_write(struct btrfs_bio *bbio) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF) - return false; - - auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO); + if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) + return true; + /* + * Write bios will calculate checksum and submit bio at the same time. + * Unless explicitly required don't offload serial csum calculate and bio + * submit into a workqueue. + */ + return false; #endif /* Submit synchronously if the checksum implementation is fast. */ diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 5d20f959e12d..deaeea3becf4 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -56,6 +56,9 @@ struct btrfs_bio { struct { struct btrfs_ordered_extent *ordered; struct btrfs_ordered_sum *sums; + struct work_struct csum_work; + struct completion csum_done; + struct bvec_iter csum_saved_iter; u64 orig_physical; }; @@ -83,6 +86,10 @@ struct btrfs_bio { * scrub bios. */ bool is_scrub; + + /* Whether the csum generation for data write is async. */ + bool async_csum; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a42e6d54e7cd..4b7c40f05e8f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "volumes.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ @@ -764,21 +765,46 @@ fail: return ret; } -/* - * Calculate checksums of the data contained inside a bio. - */ -int btrfs_csum_one_bio(struct btrfs_bio *bbio) +static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) { - struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct bio *bio = &bbio->bio; - struct btrfs_ordered_sum *sums; - struct bvec_iter iter = bio->bi_iter; + struct btrfs_ordered_sum *sums = bbio->sums; + struct bvec_iter iter = *src; phys_addr_t paddr; const u32 blocksize = fs_info->sectorsize; - int index; + int index = 0; + + shash->tfm = fs_info->csum_shash; + + btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { + btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); + index += fs_info->csum_size; + } +} + +static void csum_one_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, csum_work); + + ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); + ASSERT(bbio->async_csum == true); + csum_one_bio(bbio, &bbio->csum_saved_iter); + complete(&bbio->csum_done); +} + +/* + * Calculate checksums of the data contained inside a bio. 
+ */ +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async) +{ + struct btrfs_ordered_extent *ordered = bbio->ordered; + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio = &bbio->bio; + struct btrfs_ordered_sum *sums; unsigned nofs_flag; nofs_flag = memalloc_nofs_save(); @@ -789,21 +815,21 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio) if (!sums) return -ENOMEM; + sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); - - sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - index = 0; - - shash->tfm = fs_info->csum_shash; - - btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { - btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); - index += fs_info->csum_size; - } - bbio->sums = sums; btrfs_add_ordered_sum(ordered, sums); + + if (!async) { + csum_one_bio(bbio, &bbio->bio.bi_iter); + return 0; + } + init_completion(&bbio->csum_done); + bbio->async_csum = true; + bbio->csum_saved_iter = bbio->bio.bi_iter; + INIT_WORK(&bbio->csum_work, csum_one_bio_work); + schedule_work(&bbio->csum_work); return 0; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0d59e830018a..5645c5e3abdb 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -64,7 +64,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async); int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, From 1dac8db80cee66b7ba51d323025e47989278ee03 Mon Sep 17 00:00:00 2001 From: Gladyshev Ilya Date: Sun, 2 Nov 2025 10:38:52 +0300 Subject: [PATCH 090/147] btrfs: don't generate any code from 
ASSERT() in release builds The current definition of ASSERT(cond) as (void)(cond) is redundant, since these checks have no side effects and don't affect code logic. However, some checks contain READ_ONCE() or other compiler-unfriendly constructs. For example, ASSERT(list_empty) in btrfs_add_dealloc_inode() was compiled to a redundant mov instruction due to this issue. Define ASSERT as BUILD_BUG_ON_INVALID for !CONFIG_BTRFS_ASSERT builds which uses sizeof(cond) trick. Also mark full_page_sectors_uptodate() as __maybe_unused to suppress "unneeded declaration" warning (it's needed in compile time) Signed-off-by: Gladyshev Ilya Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 3 ++- fs/btrfs/raid56.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4416c165644f..d8c0bd17dcda 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -168,7 +168,8 @@ do { \ #endif #else -#define ASSERT(cond, args...) (void)(cond) +/* Compile check the @cond expression but don't generate any code. */ +#define ASSERT(cond, args...) 
BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_BTRFS_DEBUG diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0135dceb7baa..302f20d8c335 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -299,8 +299,8 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } -static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, - unsigned int page_nr) +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; From cfc7fe2b0f18c54b571b4137156f944ff76057c8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 6 Nov 2025 08:15:03 +1030 Subject: [PATCH 091/147] btrfs: use kvcalloc for btrfs_bio::csum allocation [BUG] There is a report that memory allocation failed for btrfs_bio::csum during a large read: b2sum: page allocation failure: order:4, mode:0x40c40(GFP_NOFS|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 UID: 0 PID: 416120 Comm: b2sum Tainted: G W 6.17.0 #1 NONE Tainted: [W]=WARN Hardware name: Raspberry Pi 4 Model B Rev 1.5 (DT) Call trace: show_stack+0x18/0x30 (C) dump_stack_lvl+0x5c/0x7c dump_stack+0x18/0x24 warn_alloc+0xec/0x184 __alloc_pages_slowpath.constprop.0+0x21c/0x730 __alloc_frozen_pages_noprof+0x230/0x260 ___kmalloc_large_node+0xd4/0xf0 __kmalloc_noprof+0x1c8/0x260 btrfs_lookup_bio_sums+0x214/0x278 btrfs_submit_chunk+0xf0/0x3c0 btrfs_submit_bbio+0x2c/0x4c submit_one_bio+0x50/0xac submit_extent_folio+0x13c/0x340 btrfs_do_readpage+0x4b0/0x7a0 btrfs_readahead+0x184/0x254 read_pages+0x58/0x260 page_cache_ra_unbounded+0x170/0x24c page_cache_ra_order+0x360/0x3bc page_cache_async_ra+0x1a4/0x1d4 filemap_readahead.isra.0+0x44/0x74 filemap_get_pages+0x2b4/0x3b4 filemap_read+0xc4/0x3bc btrfs_file_read_iter+0x70/0x7c vfs_read+0x1ec/0x2c0 ksys_read+0x4c/0xe0 __arm64_sys_read+0x18/0x24 el0_svc_common.constprop.0+0x5c/0x130 
do_el0_svc+0x1c/0x30 el0_svc+0x30/0xa0 el0t_64_sync_handler+0xa0/0xe4 el0t_64_sync+0x198/0x19c [CAUSE] Btrfs needs to allocate memory for btrfs_bio::csum for large reads, so that we can later verify the contents of the read. However nowadays a read bio can easily go beyond BIO_MAX_VECS * PAGE_SIZE (which is 1M for 4K page sizes), due to the multi-page bvec that one bvec can have more than one pages, as long as the pages are physically adjacent. This will become more common when the large folio support is moved out of experimental features. In the above case, a read larger than 4MiB with SHA256 checksum (32 bytes for each 4K block) will be able to trigger a order 4 allocation. The order 4 is larger than PAGE_ALLOC_COSTLY_ORDER (3), thus without extra flags such allocation will not retry. And if the system has very small amount of memory (e.g. RPI4 with low memory spec) or VMs with small vRAM, or the memory is heavily fragmented, such allocation will fail and cause the above warning. [FIX] Although btrfs is handling the memory allocation failure correctly, we do not really need the physically contiguous memory just to restore our checksum. In fact btrfs_csum_one_bio() is already using kvzalloc() to reduce the memory pressure. So follow the step to use kvcalloc() for btrfs_bio::csum. 
Reported-by: Calvin Owens Link: https://lore.kernel.org/linux-btrfs/20251105180054.511528-1-calvin@wbinvd.org/ Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- fs/btrfs/file-item.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1286c1ac1940..a73652b8724a 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -293,7 +293,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de offset += sectorsize; } if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); if (fbio) btrfs_repair_done(fbio); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 4b7c40f05e8f..72be3ede0edf 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -373,7 +373,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + bbio->csum = kvcalloc(nblocks, csum_size, GFP_NOFS); if (!bbio->csum) return -ENOMEM; } else { @@ -439,7 +439,7 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (count < 0) { ret = count; if (bbio->csum != bbio->csum_inline) - kfree(bbio->csum); + kvfree(bbio->csum); bbio->csum = NULL; break; } From d435c513652e6a90a13c881986a2cc6420c99cab Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 5 Nov 2025 20:28:12 +1030 Subject: [PATCH 092/147] btrfs: make sure extent and csum paths are always released in scrub_raid56_parity_stripe() Unlike queue_scrub_stripe() which uses the global sctx->extent_path and sctx->csum_path which are always released at the end of scrub_stripe(), scrub_raid56_parity_stripe() uses local extent_path and csum_path, as that function is going to handle the full stripe, whose bytenr may be smaller than the bytenr in the global sctx paths. 
However the cleanup of local extent/csum paths is only happening after we have successfully submitted an rbio. There are several error routes that we didn't release those two paths: - scrub_find_fill_first_stripe() errored out at csum tree search In that case extent_path is still valid, and that function itself will not release the extent_path passed in. And the function returns directly without releasing both paths. - The full stripe is empty - Some blocks failed to be recovered - btrfs_map_block() failed - raid56_parity_alloc_scrub_rbio() failed The function returns directly without releasing both paths. Fix it by covering btrfs_release_path() calls inside the out: tag. This is just a hot fix, in the long run we will go scoped based auto freeing for both local paths. Fixes: 1dc4888e725d ("btrfs: scrub: avoid unnecessary extent tree search preparing stripes") Fixes: 3c771c194402 ("btrfs: scrub: avoid unnecessary csum tree search preparing stripes") Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3dbb02dbfffb..ad7084c323d0 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2288,9 +2288,9 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio_put(bio); btrfs_bio_counter_dec(fs_info); +out: btrfs_release_path(&extent_path); btrfs_release_path(&csum_path); -out: return ret; } From 07166122b58a7fb3c056247aa262e832f3f38d0f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 6 Nov 2025 20:02:15 +1030 Subject: [PATCH 093/147] btrfs: scrub: factor out parity scrub code into a helper The function scrub_raid56_parity_stripe() is handling the parity stripe by the following steps: - Scrub each data stripes And make sure everything is fine in each data stripe - Cache the data stripe into the raid bio - Use the cached raid bio to scrub the target parity stripe Extract the last two steps into a new 
helper, scrub_raid56_cached_parity(), as a cleanup and make the error handling more straightforward. With the following minor cleanups: - Use on-stack bio structure The bio is always empty thus we do not need any bio vector nor the block device. Thus there is no need to allocate a bio, the on-stack one is more than enough to cut it. - Remove the unnecessary btrfs_put_bioc() call if btrfs_map_block() failed If btrfs_map_block() is failed, @bioc_ret will not be touched thus there is no need to call btrfs_put_bioc() in this case. - Use a proper out: tag to do the cleanup Now the error cleanup is much shorter and simpler, just btrfs_bio_counter_dec() and bio_uninit(). Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 93 +++++++++++++++++++++++++++--------------------- 1 file changed, 52 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ad7084c323d0..3029ef683fb9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2113,24 +2113,69 @@ static int should_cancel_scrub(const struct scrub_ctx *sctx) return 0; } +static int scrub_raid56_cached_parity(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + struct btrfs_chunk_map *map, + u64 full_stripe_start, + unsigned long *extent_bitmap) +{ + DECLARE_COMPLETION_ONSTACK(io_done); + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_io_context *bioc = NULL; + struct btrfs_raid_bio *rbio; + struct bio bio; + const int data_stripes = nr_data_stripes(map); + u64 length = btrfs_stripe_nr_to_offset(data_stripes); + int ret; + + bio_init(&bio, NULL, NULL, 0, REQ_OP_READ); + bio.bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; + bio.bi_private = &io_done; + bio.bi_end_io = raid56_scrub_wait_endio; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, + &length, &bioc, NULL, NULL); + if (ret < 0) + goto out; + /* For RAID56 write there must be an @bioc allocated. 
*/ + ASSERT(bioc); + rbio = raid56_parity_alloc_scrub_rbio(&bio, bioc, scrub_dev, extent_bitmap, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); + btrfs_put_bioc(bioc); + if (!rbio) { + ret = -ENOMEM; + goto out; + } + /* Use the recovered stripes as cache to avoid read them from disk again. */ + for (int i = 0; i < data_stripes; i++) { + struct scrub_stripe *stripe = &sctx->raid56_data_stripes[i]; + + raid56_parity_cache_data_folios(rbio, stripe->folios, + full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); + } + raid56_parity_submit_scrub_rbio(rbio); + wait_for_completion_io(&io_done); + ret = blk_status_to_errno(bio.bi_status); +out: + btrfs_bio_counter_dec(fs_info); + bio_uninit(&bio); + return ret; +} + static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, struct btrfs_chunk_map *map, u64 full_stripe_start) { - DECLARE_COMPLETION_ONSTACK(io_done); struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_raid_bio *rbio; - struct btrfs_io_context *bioc = NULL; struct btrfs_path extent_path = { 0 }; struct btrfs_path csum_path = { 0 }; - struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; const int data_stripes = nr_data_stripes(map); unsigned long extent_bitmap = 0; - u64 length = btrfs_stripe_nr_to_offset(data_stripes); int ret; ASSERT(sctx->raid56_data_stripes); @@ -2252,42 +2297,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, } /* Now we can check and regenerate the P/Q stripe. 
*/ - bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; - bio->bi_private = &io_done; - bio->bi_end_io = raid56_scrub_wait_endio; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL); - if (ret < 0) { - bio_put(bio); - btrfs_put_bioc(bioc); - btrfs_bio_counter_dec(fs_info); - goto out; - } - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, - BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - btrfs_put_bioc(bioc); - if (!rbio) { - ret = -ENOMEM; - bio_put(bio); - btrfs_bio_counter_dec(fs_info); - goto out; - } - /* Use the recovered stripes as cache to avoid read them from disk again. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - raid56_parity_cache_data_folios(rbio, stripe->folios, - full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT)); - } - raid56_parity_submit_scrub_rbio(rbio); - wait_for_completion_io(&io_done); - ret = blk_status_to_errno(bio->bi_status); - bio_put(bio); - btrfs_bio_counter_dec(fs_info); - + ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, + &extent_bitmap); out: btrfs_release_path(&extent_path); btrfs_release_path(&csum_path); From 9b3743a6760bedc783809b94aa87b9b8ef64f52b Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Tue, 11 Nov 2025 20:05:58 +0800 Subject: [PATCH 094/147] btrfs: simplify list initialization in btrfs_compr_pool_scan() In btrfs_compr_pool_scan(), use LIST_HEAD() to declare and initialize the 'remove' list_head in one step instead of using INIT_LIST_HEAD() separately. 
Signed-off-by: Baolin Liu Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cdb5c891b0ac..241a117ad7cc 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -192,15 +192,13 @@ static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_c static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc) { - struct list_head remove; + LIST_HEAD(remove); struct list_head *tmp, *next; int freed; if (compr_pool.count == 0) return SHRINK_STOP; - INIT_LIST_HEAD(&remove); - /* For now, just simply drain the whole list. */ spin_lock(&compr_pool.lock); list_splice_init(&compr_pool.list, &remove); From 17d552ab9b2be6c2c28169fcf913114f63a71a22 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Oct 2025 15:09:59 +1030 Subject: [PATCH 095/147] btrfs: raid56: remove sector_ptr::has_paddr member We can use paddr -1 as an indicator for unset/uninitialized paddr. We can not use 0 paddr, unlike virtual address 0 which is never mapped thus will always trigger a page fault, physical address 0 may be a valid page. So here we follow swiotlb to use (paddr)-1 as a special indicator for invalid/unset physical address. Even if the PFN may still be valid, our usage of the physical address should always be aligned to fs block size (or page size for bs > ps cases), thus such -1 paddr should never be a valid one. With this special -1 paddr, we can get rid of has_paddr member and save 1 byte for sector_ptr structure. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 302f20d8c335..c057491c842b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -133,6 +133,12 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; +/* + * The PFN may still be valid, but our paddrs should always be block size + * aligned, thus such -1 paddr is definitely not a valid one. + */ +#define INVALID_PADDR (~(phys_addr_t)0) + /* * A structure to present a sector inside a page, the length is fixed to * sectorsize; @@ -141,9 +147,10 @@ struct sector_ptr { /* * Blocks from the bio list can still be highmem. * So here we use physical address to present a page and the offset inside it. + * + * If it's INVALID_PADDR then it's not set. */ phys_addr_t paddr; - bool has_paddr; bool uptodate; }; @@ -263,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].has_paddr) { + if (rbio->bio_sectors[i].paddr == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -335,7 +342,6 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) if (!rbio->stripe_pages[page_index]) continue; - rbio->stripe_sectors[i].has_paddr = true; rbio->stripe_sectors[i].paddr = page_to_phys(rbio->stripe_pages[page_index]) + offset_in_page(offset); @@ -972,9 +978,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; - if (sector->has_paddr || bio_list_only) { + if (sector->paddr != INVALID_PADDR || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (!sector->has_paddr) + if 
(sector->paddr == INVALID_PADDR) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; @@ -1032,6 +1038,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, kfree(rbio); return ERR_PTR(-ENOMEM); } + for (int i = 0; i < num_sectors; i++) { + rbio->stripe_sectors[i].paddr = INVALID_PADDR; + rbio->bio_sectors[i].paddr = INVALID_PADDR; + } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); @@ -1152,7 +1162,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->has_paddr); + ASSERT(sector->paddr != INVALID_PADDR); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1216,7 +1226,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) unsigned int index = (offset >> sectorsize_bits); struct sector_ptr *sector = &rbio->bio_sectors[index]; - sector->has_paddr = true; sector->paddr = paddr; offset += sectorsize; } @@ -1299,7 +1308,7 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) static inline void *kmap_local_sector(const struct sector_ptr *sector) { /* The sector pointer must have a page mapped to it. 
*/ - ASSERT(sector->has_paddr); + ASSERT(sector->paddr != INVALID_PADDR); return kmap_local_page(phys_to_page(sector->paddr)) + offset_in_page(sector->paddr); @@ -1498,7 +1507,7 @@ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; - if (sector->has_paddr && sector->paddr == paddr) + if (sector->paddr == paddr) return sector; } return NULL; @@ -1532,8 +1541,7 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) for (i = 0; i < rbio->nr_sectors; i++) { if (rbio->stripe_sectors[i].paddr == bvec_paddr) break; - if (rbio->bio_sectors[i].has_paddr && - rbio->bio_sectors[i].paddr == bvec_paddr) + if (rbio->bio_sectors[i].paddr == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -2317,7 +2325,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
*/ - if (!sector->has_paddr || !sector->uptodate) + if (sector->paddr == INVALID_PADDR || !sector->uptodate) return true; } return false; @@ -2508,8 +2516,8 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int sectornr; bool has_qstripe; struct page *page; - struct sector_ptr p_sector = { 0 }; - struct sector_ptr q_sector = { 0 }; + struct sector_ptr p_sector = { .paddr = INVALID_PADDR }; + struct sector_ptr q_sector = { .paddr = INVALID_PADDR }; struct bio_list bio_list; int is_replace = 0; int ret; @@ -2542,7 +2550,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; - p_sector.has_paddr = true; p_sector.paddr = page_to_phys(page); p_sector.uptodate = 1; page = NULL; @@ -2552,10 +2559,9 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) page = alloc_page(GFP_NOFS); if (!page) { __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; + p_sector.paddr = INVALID_PADDR; return -ENOMEM; } - q_sector.has_paddr = true; q_sector.paddr = page_to_phys(page); q_sector.uptodate = 1; page = NULL; @@ -2604,10 +2610,10 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) kunmap_local(pointers[nr_data]); __free_page(phys_to_page(p_sector.paddr)); - p_sector.has_paddr = false; - if (q_sector.has_paddr) { + p_sector.paddr = INVALID_PADDR; + if (q_sector.paddr != INVALID_PADDR) { __free_page(phys_to_page(q_sector.paddr)); - q_sector.has_paddr = false; + q_sector.paddr = INVALID_PADDR; } /* From 1810350b04ef38b375c64304e142de96d90404e1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Oct 2025 15:10:00 +1030 Subject: [PATCH 096/147] btrfs: raid56: move sector_ptr::uptodate into a dedicated bitmap The uptodate boolean member can be extracted into a bitmap, which will save us some space (1 bit in a byte vs 8 bits in a byte). 
Furthermore we do not need to record the uptodate bitmap for bio sectors, as if bio_sectors[].paddr is valid it means there is a bio and will be uptodate. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 68 +++++++++++++++++++++++------------------------ fs/btrfs/raid56.h | 3 +++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c057491c842b..3375919f8122 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -151,7 +151,6 @@ struct sector_ptr { * If it's INVALID_PADDR then it's not set. */ phys_addr_t paddr; - bool uptodate; }; static void rmw_rbio_work(struct work_struct *work); @@ -277,13 +276,13 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) * read from disk. */ if (i < rbio->nr_data * rbio->stripe_nsectors) - ASSERT(rbio->stripe_sectors[i].uptodate); + ASSERT(test_bit(i, rbio->stripe_uptodate_bitmap)); continue; } memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], rbio->bioc->fs_info->sectorsize); - rbio->stripe_sectors[i].uptodate = 1; + set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -318,7 +317,7 @@ static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbi for (i = sectors_per_page * page_nr; i < sectors_per_page * page_nr + sectors_per_page; i++) { - if (!rbio->stripe_sectors[i].uptodate) + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) return false; } return true; @@ -353,17 +352,14 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, { const u32 sectorsize = src->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; - int i; if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; - /* Also update the sector->uptodate bits. 
*/ - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; i++) - dest->stripe_sectors[i].uptodate = true; + /* Also update the stripe_uptodate_bitmap bits. */ + bitmap_set(dest->stripe_uptodate_bitmap, sectors_per_page * page_nr, sectors_per_page); } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) @@ -1031,9 +1027,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers || !rbio->error_bitmap) { + !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); @@ -1331,7 +1328,8 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; + set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); pointers[stripe++] = kmap_local_sector(sector); if (has_qstripe) { @@ -1340,7 +1338,8 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) * to fill in our p/q */ sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; + set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); pointers[stripe++] = kmap_local_sector(sector); assert_rbio(rbio); @@ -1496,21 +1495,19 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } /* - * For subpage case, we can no longer set page Up-to-date directly for - * stripe_pages[], thus we need to locate the sector. + * Return the index inside the rbio->stripe_sectors[] array. 
+ * + * Return -1 if not found. */ -static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - phys_addr_t paddr) +static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { - int i; - - for (i = 0; i < rbio->nr_sectors; i++) { + for (int i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; if (sector->paddr == paddr) - return sector; + return i; } - return NULL; + return -1; } /* @@ -1525,11 +1522,11 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) ASSERT(!bio_flagged(bio, BIO_CLONED)); btrfs_bio_for_each_block_all(paddr, bio, blocksize) { - struct sector_ptr *sector = find_stripe_sector(rbio, paddr); + int sector_nr = find_stripe_sector_nr(rbio, paddr); - ASSERT(sector); - if (sector) - sector->uptodate = 1; + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); } } @@ -1963,7 +1960,8 @@ pstripe: goto cleanup; sector = rbio_stripe_sector(rbio, faila, sector_nr); - sector->uptodate = 1; + set_bit(rbio_stripe_sector_index(rbio, faila, sector_nr), + rbio->stripe_uptodate_bitmap); } if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); @@ -1971,7 +1969,8 @@ pstripe: goto cleanup; sector = rbio_stripe_sector(rbio, failb, sector_nr); - sector->uptodate = 1; + set_bit(rbio_stripe_sector_index(rbio, failb, sector_nr), + rbio->stripe_uptodate_bitmap); } cleanup: @@ -2325,7 +2324,8 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
*/ - if (sector->paddr == INVALID_PADDR || !sector->uptodate) + if (sector->paddr == INVALID_PADDR || + !test_bit(i, rbio->stripe_uptodate_bitmap)) return true; } return false; @@ -2551,7 +2551,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; p_sector.paddr = page_to_phys(page); - p_sector.uptodate = 1; page = NULL; if (has_qstripe) { @@ -2563,7 +2562,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) return -ENOMEM; } q_sector.paddr = page_to_phys(page); - q_sector.uptodate = 1; page = NULL; pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); } @@ -2781,7 +2779,8 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) * The bio cache may have handed us an uptodate sector. If so, * use it. */ - if (sector->uptodate) + if (test_bit(rbio_stripe_sector_index(rbio, stripe, sectornr), + rbio->stripe_uptodate_bitmap)) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, @@ -2899,8 +2898,7 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, foffset = 0; } } - for (unsigned int sector_nr = offset_in_full_stripe >> fs_info->sectorsize_bits; - sector_nr < (offset_in_full_stripe + BTRFS_STRIPE_LEN) >> fs_info->sectorsize_bits; - sector_nr++) - rbio->stripe_sectors[sector_nr].uptodate = true; + bitmap_set(rbio->stripe_uptodate_bitmap, + offset_in_full_stripe >> fs_info->sectorsize_bits, + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 84c4d1d29c7a..b636de4af7ac 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -124,6 +124,9 @@ struct btrfs_raid_bio { */ struct sector_ptr *stripe_sectors; + /* Each set bit means the corresponding sector in stripe_sectors[] is uptodate. 
*/ + unsigned long *stripe_uptodate_bitmap; + /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; From 5387bd958180bfd7ffe454c8d2e7ae2782ebd4cc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Oct 2025 15:10:01 +1030 Subject: [PATCH 097/147] btrfs: raid56: remove sector_ptr structure Since sector_ptr structure is now only containing a single paddr, there is no need to use that structure. Instead use phys_addr_t array for bio and stripe pointers. This means several helpers are also needed to accept a paddr instead of a sector_ptr pointer. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 283 ++++++++++++++++++++-------------------------- fs/btrfs/raid56.h | 14 +-- 2 files changed, 128 insertions(+), 169 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3375919f8122..95cc243d9c8b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -139,20 +139,6 @@ struct btrfs_stripe_hash_table { */ #define INVALID_PADDR (~(phys_addr_t)0) -/* - * A structure to present a sector inside a page, the length is fixed to - * sectorsize; - */ -struct sector_ptr { - /* - * Blocks from the bio list can still be highmem. - * So here we use physical address to present a page and the offset inside it. - * - * If it's INVALID_PADDR then it's not set. 
- */ - phys_addr_t paddr; -}; - static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); @@ -165,8 +151,8 @@ static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); - kfree(rbio->bio_sectors); - kfree(rbio->stripe_sectors); + kfree(rbio->bio_paddrs); + kfree(rbio->stripe_paddrs); kfree(rbio->finish_pointers); } @@ -241,12 +227,17 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } -static void memcpy_sectors(const struct sector_ptr *dst, - const struct sector_ptr *src, u32 blocksize) +static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) { - memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), - phys_to_page(src->paddr), offset_in_page(src->paddr), - blocksize); + phys_addr_t dst = rbio->stripe_paddrs[sector_nr]; + phys_addr_t src = rbio->bio_paddrs[sector_nr]; + + ASSERT(dst != INVALID_PADDR); + ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), + rbio->bioc->fs_info->sectorsize); } /* @@ -269,7 +260,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (rbio->bio_sectors[i].paddr == INVALID_PADDR) { + if (rbio->bio_paddrs[i] == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -280,8 +271,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) continue; } - memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], - rbio->bioc->fs_info->sectorsize); + memcpy_from_bio_to_stripe(rbio, i); set_bit(i, rbio->stripe_uptodate_bitmap); } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -341,9 +331,8 @@ static void 
index_stripe_sectors(struct btrfs_raid_bio *rbio) if (!rbio->stripe_pages[page_index]) continue; - rbio->stripe_sectors[i].paddr = - page_to_phys(rbio->stripe_pages[page_index]) + - offset_in_page(offset); + rbio->stripe_paddrs[i] = page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } @@ -689,29 +678,27 @@ static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, return stripe_nr * rbio->stripe_nsectors + sector_nr; } -/* Return a sector from rbio->stripe_sectors, not from the bio list */ -static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +/* Return a paddr from rbio->stripe_sectors, not from the bio list */ +static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr) { - return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, - sector_nr)]; + return rbio->stripe_paddrs[rbio_stripe_sector_index(rbio, stripe_nr, sector_nr)]; } -/* Grab a sector inside P stripe */ -static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +/* Grab a paddr inside P stripe */ +static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { - return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); + return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr); } -/* Grab a sector inside Q stripe, return NULL if not RAID6 */ -static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +/* Grab a paddr inside Q stripe, return INVALID_PADDR if not RAID6 */ +static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) - return NULL; - return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); + return INVALID_PADDR; + return rbio_stripe_paddr(rbio, rbio->nr_data + 1, 
sector_nr); } /* @@ -946,7 +933,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) } /* - * Get a sector pointer specified by its @stripe_nr and @sector_nr. + * Get the paddr specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) @@ -957,11 +944,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback. */ -static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - struct sector_ptr *sector; + phys_addr_t ret = INVALID_PADDR; int index; ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, @@ -973,17 +960,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock(&rbio->bio_list_lock); - sector = &rbio->bio_sectors[index]; - if (sector->paddr != INVALID_PADDR || bio_list_only) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (sector->paddr == INVALID_PADDR) - sector = NULL; + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; spin_unlock(&rbio->bio_list_lock); - return sector; + return ret; } spin_unlock(&rbio->bio_list_lock); - return &rbio->stripe_sectors[index]; + return rbio->stripe_paddrs[index]; } /* @@ -1021,23 +1007,21 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); - rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), - GFP_NOFS); - rbio->stripe_sectors = kcalloc(num_sectors, 
sizeof(struct sector_ptr), - GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors, sizeof(phys_addr_t), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); - if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || + if (!rbio->stripe_pages || !rbio->bio_paddrs || !rbio->stripe_paddrs || !rbio->finish_pointers || !rbio->error_bitmap || !rbio->stripe_uptodate_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } for (int i = 0; i < num_sectors; i++) { - rbio->stripe_sectors[i].paddr = INVALID_PADDR; - rbio->bio_sectors[i].paddr = INVALID_PADDR; + rbio->stripe_paddrs[i] = INVALID_PADDR; + rbio->bio_paddrs[i] = INVALID_PADDR; } bio_list_init(&rbio->bio_list); @@ -1136,12 +1120,9 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, * Return 0 if everything went well. * Return <0 for error. 
*/ -static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct sector_ptr *sector, - unsigned int stripe_nr, - unsigned int sector_nr, - enum req_op op) +static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t paddr, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; @@ -1159,7 +1140,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->paddr != INVALID_PADDR); + ASSERT(paddr != INVALID_PADDR); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1190,8 +1171,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, phys_to_page(sector->paddr), - sectorsize, offset_in_page(sector->paddr)); + ret = bio_add_page(last, phys_to_page(paddr), sectorsize, + offset_in_page(paddr)); if (ret == sectorsize) return 0; } @@ -1204,8 +1185,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, - offset_in_page(sector->paddr)); + __bio_add_page(bio, phys_to_page(paddr), sectorsize, offset_in_page(paddr)); bio_list_add(bio_list, bio); return 0; } @@ -1221,9 +1201,8 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { unsigned int index = (offset >> sectorsize_bits); - struct sector_ptr *sector = &rbio->bio_sectors[index]; - sector->paddr = paddr; + rbio->bio_paddrs[index] = paddr; offset += sectorsize; } } @@ -1302,13 +1281,12 @@ static void assert_rbio(struct 
btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } -static inline void *kmap_local_sector(const struct sector_ptr *sector) +static inline void *kmap_local_paddr(phys_addr_t paddr) { /* The sector pointer must have a page mapped to it. */ - ASSERT(sector->paddr != INVALID_PADDR); + ASSERT(paddr != INVALID_PADDR); - return kmap_local_page(phys_to_page(sector->paddr)) + - offset_in_page(sector->paddr); + return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); } /* Generate PQ for one vertical stripe. */ @@ -1316,31 +1294,27 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct sector_ptr *sector; int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } + for (stripe = 0; stripe < rbio->nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sectornr, 0)); /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data, sectornr), rbio->stripe_uptodate_bitmap); - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sectornr)); if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ - sector = rbio_qstripe_sector(rbio, sectornr); set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data + 1, sectornr), rbio->stripe_uptodate_bitmap); - pointers[stripe++] = kmap_local_sector(sector); + pointers[stripe++] = kmap_local_paddr(rbio_qstripe_paddr(rbio, sectornr)); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, @@ -1380,7 +1354,7 @@ 
static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t paddr; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1390,15 +1364,15 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); + if (paddr == INVALID_PADDR) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddr = rbio_stripe_paddr(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, - sectornr, REQ_OP_WRITE); + ret = rbio_add_io_paddr(rbio, bio_list, paddr, stripe, + sectornr, REQ_OP_WRITE); if (ret) goto error; } @@ -1415,7 +1389,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; + phys_addr_t paddr; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1440,14 +1414,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) + paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); + if (paddr == INVALID_PADDR) continue; } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddr = rbio_stripe_paddr(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, bio_list, sector, + ret = rbio_add_io_paddr(rbio, bio_list, paddr, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) @@ -1502,9 +1476,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { for (int i = 0; 
i < rbio->nr_sectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; - - if (sector->paddr == paddr) + if (rbio->stripe_paddrs[i] == paddr) return i; } return -1; @@ -1536,9 +1508,9 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) int i; for (i = 0; i < rbio->nr_sectors; i++) { - if (rbio->stripe_sectors[i].paddr == bvec_paddr) + if (rbio->stripe_paddrs[i] == bvec_paddr) break; - if (rbio->bio_sectors[i].paddr == bvec_paddr) + if (rbio->bio_paddrs[i] == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1790,7 +1762,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; + phys_addr_t paddr; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; int ret; @@ -1806,15 +1778,15 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr); } csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_block_csum(fs_info, sector->paddr, csum_buf, csum_expected); + ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, csum_expected); return ret; } @@ -1827,7 +1799,6 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; int found_errors; int faila; @@ -1862,16 +1833,18 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * pointer order. 
*/ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + phys_addr_t paddr; + /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr); } - pointers[stripe_nr] = kmap_local_sector(sector); + pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -1959,7 +1932,6 @@ pstripe: if (ret < 0) goto cleanup; - sector = rbio_stripe_sector(rbio, faila, sector_nr); set_bit(rbio_stripe_sector_index(rbio, faila, sector_nr), rbio->stripe_uptodate_bitmap); } @@ -1968,7 +1940,6 @@ pstripe: if (ret < 0) goto cleanup; - sector = rbio_stripe_sector(rbio, failb, sector_nr); set_bit(rbio_stripe_sector_index(rbio, failb, sector_nr), rbio->stripe_uptodate_bitmap); } @@ -2050,7 +2021,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t paddr; /* * Skip the range which has error. 
It can be a range which is @@ -2067,9 +2038,9 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) continue; } - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); + paddr = rbio_stripe_paddr(rbio, stripe, sectornr); + ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, + sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); goto out; @@ -2257,13 +2228,13 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; + phys_addr_t paddr; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); + paddr = rbio_stripe_paddr(rbio, stripe, sectornr); + ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; @@ -2317,14 +2288,14 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { - struct sector_ptr *sector = &rbio->stripe_sectors[i]; + phys_addr_t paddr = rbio->stripe_paddrs[i]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
*/ - if (sector->paddr == INVALID_PADDR || + if (paddr == INVALID_PADDR || !test_bit(i, rbio->stripe_uptodate_bitmap)) return true; } @@ -2516,8 +2487,8 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int sectornr; bool has_qstripe; struct page *page; - struct sector_ptr p_sector = { .paddr = INVALID_PADDR }; - struct sector_ptr q_sector = { .paddr = INVALID_PADDR }; + phys_addr_t p_paddr = INVALID_PADDR; + phys_addr_t q_paddr = INVALID_PADDR; struct bio_list bio_list; int is_replace = 0; int ret; @@ -2550,36 +2521,34 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; - p_sector.paddr = page_to_phys(page); + p_paddr = page_to_phys(page); page = NULL; + pointers[nr_data] = kmap_local_paddr(p_paddr); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ page = alloc_page(GFP_NOFS); if (!page) { - __free_page(phys_to_page(p_sector.paddr)); - p_sector.paddr = INVALID_PADDR; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; return -ENOMEM; } - q_sector.paddr = page_to_phys(page); + q_paddr = page_to_phys(page); page = NULL; - pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); + pointers[rbio->real_stripes - 1] = kmap_local_paddr(q_paddr); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_sector(&p_sector); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; void *parity; /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_sector(sector); - } + for (stripe = 0; stripe < nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_paddr_in_rbio(rbio, stripe, sectornr, 0)); if (has_qstripe) { assert_rbio(rbio); @@ -2593,8 +2562,7 @@ static int finish_parity_scrub(struct 
btrfs_raid_bio *rbio) } /* Check scrubbing parity and repair it */ - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_sector(sector); + parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sectornr)); if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else @@ -2607,11 +2575,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } kunmap_local(pointers[nr_data]); - __free_page(phys_to_page(p_sector.paddr)); - p_sector.paddr = INVALID_PADDR; - if (q_sector.paddr != INVALID_PADDR) { - __free_page(phys_to_page(q_sector.paddr)); - q_sector.paddr = INVALID_PADDR; + __free_page(phys_to_page(p_paddr)); + p_paddr = INVALID_PADDR; + if (q_paddr != INVALID_PADDR) { + __free_page(phys_to_page(q_paddr)); + q_paddr = INVALID_PADDR; } /* @@ -2620,11 +2588,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t paddr; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, - sectornr, REQ_OP_WRITE); + paddr = rbio_stripe_paddr(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddr(rbio, &bio_list, paddr, rbio->scrubp, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2638,12 +2606,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; + phys_addr_t paddr; - sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->real_stripes, - sectornr, REQ_OP_WRITE); + paddr = rbio_stripe_paddr(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddr(rbio, &bio_list, paddr, rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } 
@@ -2759,7 +2726,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; - struct sector_ptr *sector; + phys_addr_t paddr; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) @@ -2767,14 +2734,14 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) /* * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a sector + * read them from the disk. If sector_paddr_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) + paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); + if (paddr != INVALID_PADDR) continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); + paddr = rbio_stripe_paddr(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. @@ -2783,8 +2750,8 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) rbio->stripe_uptodate_bitmap)) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, REQ_OP_READ); + ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index b636de4af7ac..42a45716fb03 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -16,7 +16,6 @@ #include "volumes.h" struct page; -struct sector_ptr; struct btrfs_fs_info; enum btrfs_rbio_ops { @@ -116,13 +115,10 @@ struct btrfs_raid_bio { struct page **stripe_pages; /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; + phys_addr_t *bio_paddrs; - /* - * For subpage support, we need to map each sector to above - * stripe_pages. 
- */ - struct sector_ptr *stripe_sectors; + /* Pointers to the sectors in the stripe_pages[]. */ + phys_addr_t *stripe_paddrs; /* Each set bit means the corresponding sector in stripe_sectors[] is uptodate. */ unsigned long *stripe_uptodate_bitmap; @@ -134,10 +130,6 @@ struct btrfs_raid_bio { * The bitmap recording where IO errors happened. * Each bit is corresponding to one sector in either bio_sectors[] or * stripe_sectors[] array. - * - * The reason we don't use another bit in sector_ptr is, we have two - * arrays of sectors, and a lot of IO can use sectors in both arrays. - * Thus making it much harder to iterate. */ unsigned long *error_bitmap; From a320476ca8a3d2e63017fe8ec06ef8b6a09c65cd Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Tue, 7 Oct 2025 11:35:12 +0800 Subject: [PATCH 098/147] btrfs: tests: do trivial BTRFS_PATH_AUTO_FREE conversions Trivial pattern for the auto freeing where there are no operations between btrfs_free_path() and the function returns. Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tests/qgroup-tests.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 3fc8dc3fd980..05cfda8af422 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -20,7 +20,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, struct btrfs_extent_item *item; struct btrfs_extent_inline_ref *iref; struct btrfs_tree_block_info *block_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); @@ -41,7 +41,6 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_insert_empty_item(&trans, root, path, &ins, size); if (ret) { test_err("couldn't insert ref %d", ret); - btrfs_free_path(path); return ret; } @@ -61,7 +60,6 @@ static 
int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY); btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_free_path(path); return 0; } @@ -70,7 +68,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -90,7 +88,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - btrfs_free_path(path); return ret; } @@ -112,7 +109,6 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, ret = btrfs_insert_empty_item(&trans, root, path, &key, 0); if (ret) test_err("failed to insert backref"); - btrfs_free_path(path); return ret; } @@ -121,7 +117,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; btrfs_init_dummy_trans(&trans, NULL); @@ -139,11 +135,9 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, -1, 1); if (ret) { test_err("didn't find our key %d", ret); - btrfs_free_path(path); return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return 0; } @@ -152,7 +146,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, { struct btrfs_trans_handle trans; struct btrfs_extent_item *item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 refs; int ret; @@ -172,7 +166,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, ret = btrfs_search_slot(&trans, root, &key, path, 0, 1); if (ret) { test_err("couldn't find extent ref"); - 
btrfs_free_path(path); return ret; } @@ -198,7 +191,6 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } btrfs_del_item(&trans, root, path); - btrfs_free_path(path); return ret; } From 4decf577fb7a5a252f6f67383d06111b3525505f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 15 Oct 2025 18:48:37 +0200 Subject: [PATCH 099/147] btrfs: move and rename CSUM_FMT definition Move the CSUM_FMT* definitions to fs.h where the BTRFS_KEY_FMT is defined and add the prefix for consistency. Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 ---- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/fs.h | 4 ++++ fs/btrfs/inode.c | 24 ++++++++++++------------ fs/btrfs/scrub.c | 6 +++--- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a66ca5531b5c..af38d47fc131 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -543,10 +543,6 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #endif } -/* Array of bytes with variable length, hexadecimal format 0x1234 */ -#define CSUM_FMT "0x%*phN" -#define CSUM_FMT_VALUE(size, bytes) size, bytes - void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6a1fa3b08b3f..4764108b0338 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -399,10 +399,10 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, -"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d%s", +"checksum verify failed on logical %llu mirror %u wanted " BTRFS_CSUM_FMT " found " BTRFS_CSUM_FMT " level %d%s", eb->start, eb->read_mirror, - CSUM_FMT_VALUE(csum_size, header_csum), - CSUM_FMT_VALUE(csum_size, result), + 
BTRFS_CSUM_FMT_VALUE(csum_size, header_csum), + BTRFS_CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb), ignore_csum ? ", ignored" : ""); if (unlikely(!ignore_csum)) { diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index e7cd4490736f..0f7e1ef27891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -74,6 +74,10 @@ struct btrfs_space_info; #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define BTRFS_CSUM_FMT "0x%*phN" +#define BTRFS_CSUM_FMT_VALUE(size, bytes) size, bytes + #define BTRFS_KEY_FMT "(%llu %u %llu)" #define BTRFS_KEY_FMT_VALUE(key) (key)->objectid, (key)->type, (key)->offset diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8737914e8552..5929cc5fc01f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -234,21 +234,21 @@ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, -"csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %llu off %llu logical %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + 
BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); @@ -319,19 +319,19 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, /* Output without objectid, which is more meaningful */ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %lld ino %lld off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", +"csum failed root %llu ino %llu off %llu csum " BTRFS_CSUM_FMT " expected csum " BTRFS_CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), + BTRFS_CSUM_FMT_VALUE(csum_size, csum), + BTRFS_CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3029ef683fb9..ab259815a899 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -775,10 +775,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, -"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " BTRFS_CSUM_FMT " want " BTRFS_CSUM_FMT, logical, stripe->mirror_num, - CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), - 
CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + BTRFS_CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); return; } if (stripe->sectors[sector_nr].generation != From fe1e50031feae74688e33fe4e0bdc7d9585c07ce Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Nov 2025 15:40:47 +0000 Subject: [PATCH 100/147] btrfs: move struct reserve_ticket definition to space-info.c It's not used anywhere outside space-info.c so move it from space-info.h into space-info.c. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 +++++++++ fs/btrfs/space-info.h | 9 --------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4ae6928fdca4..61fd76c3da0d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -173,6 +173,15 @@ * thing with or without extra unallocated space. */ +struct reserve_ticket { + u64 bytes; + int error; + bool steal; + struct list_head list; + wait_queue_head_t wait; + spinlock_t lock; +}; + /* * after adding space to the filesystem, we need to clear the full flags * on all the space infos. 
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a4c2a3c8b388..446c0614ad4a 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -224,15 +224,6 @@ struct btrfs_space_info { s64 reclaimable_bytes; }; -struct reserve_ticket { - u64 bytes; - int error; - bool steal; - struct list_head list; - wait_queue_head_t wait; - spinlock_t lock; -}; - static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_info) { return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && From 62bcbdca0ea9b1add9c22f400b51c56184902053 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 11 Nov 2025 09:11:58 +1030 Subject: [PATCH 101/147] btrfs: make btrfs_csum_one_bio() handle bs > ps without large folios For bs > ps cases, all folios passed into btrfs_csum_one_bio() are ensured to be backed by large folios. But that requirement excludes features like direct IO and encoded writes. To support bs > ps without large folios, enhance btrfs_csum_one_bio() by: - Split btrfs_calculate_block_csum() into two versions * btrfs_calculate_block_csum_folio() For call sites where a fs block is always backed by a large folio. This will do extra checks on the folio size, build a paddrs[] array, and pass it into the newer btrfs_calculate_block_csum_pages() helper. For now btrfs_check_block_csum() is still using this version. * btrfs_calculate_block_csum_pages() For call sites that may hit a fs block backed by noncontiguous pages. The pages are represented by paddrs[] array, which includes the offset inside the page. This function will do the proper sub-block handling. - Make btrfs_csum_one_bio() to use btrfs_calculate_block_csum_pages() This means we will need to build a local paddrs[] array, and after filling a fs block, do the checksum calculation. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 6 ++-- fs/btrfs/file-item.c | 15 +++++++-- fs/btrfs/inode.c | 73 ++++++++++++++++++++++++++++++------------ 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index af38d47fc131..00671b724079 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -543,8 +543,10 @@ static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode) #endif } -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest); +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest); +void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, + const phys_addr_t paddrs[], u8 *dest); int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 72be3ede0edf..e7c219e83ff0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -775,13 +775,22 @@ static void csum_one_bio(struct btrfs_bio *bbio, struct bvec_iter *src) struct bvec_iter iter = *src; phys_addr_t paddr; const u32 blocksize = fs_info->sectorsize; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + u32 offset = 0; int index = 0; shash->tfm = fs_info->csum_shash; - btrfs_bio_for_each_block(paddr, bio, &iter, blocksize) { - btrfs_calculate_block_csum(fs_info, paddr, sums->sums + index); - index += fs_info->csum_size; + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, blocksize)) { + btrfs_calculate_block_csum_pages(fs_info, paddrs, sums->sums + index); + index += fs_info->csum_size; + } } } diff --git 
a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5929cc5fc01f..c083a67d0091 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3343,36 +3343,67 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, - u8 *dest) +/* + * Calculate the checksum of an fs block at physical memory address @paddr, + * and save the result to @dest. + * + * The folio containing @paddr must be large enough to contain a full fs block. + */ +void btrfs_calculate_block_csum_folio(struct btrfs_fs_info *fs_info, + const phys_addr_t paddr, u8 *dest) { struct folio *folio = page_folio(phys_to_page(paddr)); const u32 blocksize = fs_info->sectorsize; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; - shash->tfm = fs_info->csum_shash; /* The full block must be inside the folio. */ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - if (folio_test_partial_kmap(folio)) { - size_t cur = paddr; + for (int i = 0; i < nr_steps; i++) { + u32 pindex = offset_in_folio(folio, paddr + i * step) >> PAGE_SHIFT; - crypto_shash_init(shash); - while (cur < paddr + blocksize) { - void *kaddr; - size_t len = min(paddr + blocksize - cur, - PAGE_SIZE - offset_in_page(cur)); - - kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); - crypto_shash_update(shash, kaddr, len); - kunmap_local(kaddr); - cur += len; - } - crypto_shash_final(shash, dest); - } else { - crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); + /* + * For bs <= ps cases, we will only run the loop once, so the offset + * inside the page will only added to paddrs[0]. + * + * For bs > ps cases, the block must be page aligned, thus offset + * inside the page will always be 0. 
+ */ + paddrs[i] = page_to_phys(folio_page(folio, pindex)) + offset_in_page(paddr); } + return btrfs_calculate_block_csum_pages(fs_info, paddrs, dest); } + +/* + * Calculate the checksum of a fs block backed by multiple noncontiguous pages + * at @paddrs[] and save the result to @dest. + * + * The folio containing @paddr must be large enough to contain a full fs block. + */ +void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, + const phys_addr_t paddrs[], u8 *dest) +{ + const u32 blocksize = fs_info->sectorsize; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + for (int i = 0; i < nr_steps; i++) { + const phys_addr_t paddr = paddrs[i]; + void *kaddr; + + ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE); + kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); + crypto_shash_update(shash, kaddr, step); + kunmap_local(kaddr); + } + crypto_shash_final(shash, dest); +} + /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. @@ -3382,7 +3413,7 @@ void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected) { - btrfs_calculate_block_csum(fs_info, paddr, csum); + btrfs_calculate_block_csum_folio(fs_info, paddr, csum); if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; From 2574e9011018a1d6d3da8d03d0bfc4e2675dee2a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 11 Nov 2025 09:11:59 +1030 Subject: [PATCH 102/147] btrfs: make btrfs_repair_io_failure() handle bs > ps cases without large folios Currently btrfs_repair_io_failure() only accept a single @paddr parameter, and for bs > ps cases it's required that @paddr is backed by a large folio. 
That assumption has quite some limitations, preventing us from utilizing true zero-copy direct-io and encoded read/writes. To address the problem, enhance btrfs_repair_io_failure() by: - Accept an array of paddrs, up to 64K / PAGE_SIZE entries This kind of acts like a bio_vec, but with very limited entries, as the function is only utilized to repair one fs data block, or a tree block. Both have an upper size limit (BTRFS_MAX_BLOCK_SIZE, i.e. 64K), so we don't need the full bio_vec thing to handle it. - Allocate a bio with multiple slots Previously even for bs > ps cases, we only passed in a contiguous physical address range, thus a single slot will be enough. But not anymore, so we have to allocate a bio structure, other than using the on-stack one. - Use on-stack memory to allocate @paddrs array It's at most 16 pages (4K page size, 64K block size), will take up at most 128 bytes. I think the on-stack cost is still acceptable. - Add one extra check to make sure the repair bio is exactly one block - Utilize btrfs_repair_io_failure() to submit a single bio for metadata This should improve the read-repair performance for metadata, as now we submit a node sized bio then wait, other than submit each block of the metadata and wait for each submitted block. - Add one extra parameter indicating the step This is due to the fact that metadata step can be as large as nodesize, instead of sectorsize. So we need a way to distinguish metadata and data repair. - Reduce the width of @length parameter of btrfs_repair_io_failure() Since we only call btrfs_repair_io_failure() on a single data or metadata block, u64 is overkill. Use u32 instead and add one extra ASSERT() to make sure the length never exceeds BTRFS_MAX_BLOCKSIZE.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 68 ++++++++++++++++++++++++++++++++++++---------- fs/btrfs/bio.h | 5 ++-- fs/btrfs/disk-io.c | 29 ++++++++++++-------- 3 files changed, 75 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index a73652b8724a..383ea6731b35 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -172,7 +172,21 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct btrfs_inode *inode = repair_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); + /* + * We can not move forward the saved_iter, as it will be later + * utilized by repair_bbio again. + */ + struct bvec_iter saved_iter = repair_bbio->saved_iter; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT; + const u32 nr_steps = repair_bbio->saved_iter.bi_size / step; int mirror = repair_bbio->mirror_num; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; + phys_addr_t paddr; + unsigned int slot = 0; + + /* Repair bbio should be exactly one block sized.
*/ + ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); if (repair_bbio->bio.bi_status || !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { @@ -190,12 +204,17 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, return; } + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { + ASSERT(slot < nr_steps); + paddrs[slot] = paddr; + slot++; + } + do { mirror = prev_repair_mirror(fbio, mirror); btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, - repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - bvec_phys(bv), mirror); + logical, paddrs, step, mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -866,18 +885,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * * The I/O is issued synchronously to block the repair read completion from * freeing the bio. + * + * @ino: Offending inode number + * @fileoff: File offset inside the inode + * @length: Length of the repair write + * @logical: Logical address of the range + * @paddrs: Physical address array of the content + * @step: Length of each entry in @paddrs + * @mirror_num: Mirror number to write to. Must not be zero */ -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num) +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num) { + const u32 nr_steps = DIV_ROUND_UP_POW2(length, step); struct btrfs_io_stripe smap = { 0 }; - struct bio_vec bvec; - struct bio bio; int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); + /* Basic alignment checks. */ + ASSERT(IS_ALIGNED(logical, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(length, fs_info->sectorsize)); + ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize)); + /* Either it's a single data or metadata block.
*/ + ASSERT(length <= BTRFS_MAX_BLOCKSIZE); + ASSERT(step <= length); + ASSERT(is_power_of_2(step)); + if (btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -897,24 +934,27 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, goto out_counter_dec; } - bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); - ret = submit_bio_wait(&bio); + bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i])); + /* We should have allocated enough slots to contain all the different pages. */ + ASSERT(ret == step); + } + ret = submit_bio_wait(bio); + bio_put(bio); if (ret) { /* try to remap that extent elsewhere? */ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; + goto out_counter_dec; } btrfs_info_rl(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(smap.dev), + ino, fileoff, btrfs_dev_name(smap.dev), smap.physical >> SECTOR_SHIFT); ret = 0; -out_bio_uninit: - bio_uninit(&bio); out_counter_dec: btrfs_bio_counter_dec(fs_info); return ret; diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index deaeea3becf4..035145909b00 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -117,7 +117,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); -int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, phys_addr_t paddr, int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 
fileoff, + u32 length, u64 logical, const phys_addr_t paddrs[], + unsigned int step, int mirror_num); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4764108b0338..0df81a09a3d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -183,26 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; + const u32 step = min(fs_info->nodesize, PAGE_SIZE); + const u32 nr_steps = eb->len / step; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_extent_folios(eb); i++) { + for (int i = 0; i < num_extent_pages(eb); i++) { struct folio *folio = eb->folios[i]; - u64 start = max_t(u64, eb->start, folio_pos(folio)); - u64 end = min_t(u64, eb->start + eb->len, - folio_pos(folio) + eb->folio_size); - u32 len = end - start; - phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + - offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, - paddr, mirror_num); - if (ret) - break; + /* No large folio support yet. */ + ASSERT(folio_order(folio) == 0); + ASSERT(i < nr_steps); + + /* + * For nodesize < page size, there is just one paddr, with some + * offset inside the page. + * + * For nodesize >= page size, it's one or more paddrs, and eb->start + * must be aligned to page boundary. + */ + paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); } + ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, + paddrs, step, mirror_num); return ret; } From 052fd7a5cace152489cfc8abc212e0213154980f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 11 Nov 2025 09:12:00 +1030 Subject: [PATCH 103/147] btrfs: make read verification handle bs > ps cases without large folios The current read verification is also relying on large folios to support bs > ps cases, but that introduced quite some limits. 
To enhance read-repair to support bs > ps without large folios: - Make btrfs_data_csum_ok() to accept an array of paddrs Which can pass the paddrs[] direct into btrfs_calculate_block_csum_pages(). - Make repair_one_sector() to accept an array of paddrs So that it can submit a repair bio backed by regular pages, not only large folios. This requires us to allocate more slots at bio allocation time though. Also since the caller may have only partially advanced the saved_iter for bs > ps cases, we can not directly trust the logical bytenr from saved_iter (can be unaligned), thus a manual round down is necessary for the logical bytenr. - Make btrfs_check_read_bio() to build an array of paddrs The tricky part is that we can only call btrfs_data_csum_ok() after all involved pages are assembled. This means at the call time of btrfs_check_read_bio(), our offset inside the bio is already at the end of the fs block. Thus we must re-calculate @bio_offset for btrfs_data_csum_ok() and repair_one_sector(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 62 ++++++++++++++++++++++++++++-------------- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/inode.c | 18 ++++++------ 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 383ea6731b35..fcd28eb68247 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -171,7 +171,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, struct btrfs_failed_bio *fbio = repair_bbio->private; struct btrfs_inode *inode = repair_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio); /* * We can not move forward the saved_iter, as it will be later * utilized by repair_bbio again. @@ -188,8 +187,14 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, /* Repair bbio should be exactly one block sized.
*/ ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize); + btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { + ASSERT(slot < nr_steps); + paddrs[slot] = paddr; + slot++; + } + if (repair_bbio->bio.bi_status || - !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) { + !btrfs_data_csum_ok(repair_bbio, dev, 0, paddrs)) { bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ); repair_bbio->bio.bi_iter = repair_bbio->saved_iter; @@ -204,12 +209,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, return; } - btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) { - ASSERT(slot < nr_steps); - paddrs[slot] = paddr; - slot++; - } - do { mirror = prev_repair_mirror(fbio, mirror); btrfs_repair_io_failure(fs_info, btrfs_ino(inode), @@ -231,21 +230,25 @@ done: */ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, u32 bio_offset, - phys_addr_t paddr, + phys_addr_t paddrs[], struct btrfs_failed_bio *fbio) { struct btrfs_inode *inode = failed_bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct folio *folio = page_folio(phys_to_page(paddr)); const u32 sectorsize = fs_info->sectorsize; - const u32 foff = offset_in_folio(folio, paddr); - const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT); + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; + /* + * For bs > ps cases, the saved_iter can be partially moved forward. + * In that case we should round it down to the block boundary. 
+ */ + const u64 logical = round_down(failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT, + sectorsize); struct btrfs_bio *repair_bbio; struct bio *repair_bio; int num_copies; int mirror; - ASSERT(foff + sectorsize <= folio_size(folio)); btrfs_debug(fs_info, "repair read error: read error at %llu", failed_bbio->file_offset + bio_offset); @@ -265,10 +268,18 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, atomic_inc(&fbio->repair_count); - repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + repair_bio = bio_alloc_bioset(NULL, nr_steps, REQ_OP_READ, GFP_NOFS, &btrfs_repair_bioset); - repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; - bio_add_folio_nofail(repair_bio, folio, sectorsize, foff); + repair_bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT; + for (int i = 0; i < nr_steps; i++) { + int ret; + + ASSERT(offset_in_page(paddrs[i]) + step <= PAGE_SIZE); + + ret = bio_add_page(repair_bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + ASSERT(ret == step); + } repair_bbio = btrfs_bio(repair_bio); btrfs_bio_init(repair_bbio, failed_bbio->inode, failed_bbio->file_offset + bio_offset, @@ -284,10 +295,13 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 sectorsize = fs_info->sectorsize; + const u32 sectorsize = fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + const u32 nr_steps = sectorsize / step; struct bvec_iter *iter = &bbio->saved_iter; blk_status_t status = bbio->bio.bi_status; struct btrfs_failed_bio *fbio = NULL; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; u32 offset = 0; @@ -306,10 +320,16 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de /* Clear the I/O error. A failed repair will reset it. 
*/ bbio->bio.bi_status = BLK_STS_OK; - btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) { - if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr)) - fbio = repair_one_sector(bbio, offset, paddr, fbio); - offset += sectorsize; + btrfs_bio_for_each_block(paddr, &bbio->bio, iter, step) { + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + if (IS_ALIGNED(offset, sectorsize)) { + if (status || + !btrfs_data_csum_ok(bbio, dev, offset - sectorsize, paddrs)) + fbio = repair_one_sector(bbio, offset - sectorsize, + paddrs, fbio); + } } if (bbio->csum != bbio->csum_inline) kvfree(bbio->csum); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 00671b724079..73602ee8de3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -550,7 +550,7 @@ void btrfs_calculate_block_csum_pages(struct btrfs_fs_info *fs_info, int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr); + u32 bio_offset, const phys_addr_t paddrs[]); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c083a67d0091..1a0c380ef464 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3420,12 +3420,13 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 } /* - * Verify the checksum of a single data sector. + * Verify the checksum of a single data sector, which can be scattered at + * different noncontiguous pages. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) - * @bv: bio_vec to check + * @paddrs: physical addresses which back the fs block * * Check if the checksum on a data block is valid. 
When a checksum mismatch is * detected, report the error and fill the corrupted range with zero. @@ -3433,12 +3434,13 @@ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, - u32 bio_offset, phys_addr_t paddr) + u32 bio_offset, const phys_addr_t paddrs[]) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 blocksize = fs_info->sectorsize; - struct folio *folio; + const u32 step = min(blocksize, PAGE_SIZE); + const u32 nr_steps = blocksize / step; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + blocksize - 1; u8 *csum_expected; @@ -3458,7 +3460,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum); + if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) goto zeroit; return true; @@ -3467,9 +3470,8 @@ zeroit: bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); - folio = page_folio(phys_to_page(paddr)); - ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); - folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); + for (int i = 0; i < nr_steps; i++) + memzero_page(phys_to_page(paddrs[i]), offset_in_page(paddrs[i]), step); return false; } From ec20799064c881e373939ea3cea55b1c406c6b76 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 11 Nov 2025 09:12:01 +1030 Subject: [PATCH 104/147] btrfs: enable encoded read/write/send for bs > ps cases Since the read verification and read repair are all supporting bs > ps without large folios now, we can enable encoded read/write/send. 
Now we can relax the alignment in assert_bbio_alignment() to min(blocksize, PAGE_SIZE). But also add the extra blocksize based alignment check for the logical and length of the bbio. There is a pitfall in btrfs_add_compress_bio_folios(), which relies on the folios passed in to meet the minimal folio order. But now we can pass regular page sized folios in, update it to check each folio's size instead of using the minimal folio size. This allows btrfs_add_compress_bio_folios() to even handle folios array with different sizes, thankfully we don't yet need to handle such crazy situation. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/bio.c | 22 ++++++++++++---------- fs/btrfs/compression.c | 9 ++++----- fs/btrfs/ioctl.c | 21 --------------------- fs/btrfs/send.c | 9 +-------- 4 files changed, 17 insertions(+), 44 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index fcd28eb68247..1b38e3ee0a33 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -867,21 +867,23 @@ static void assert_bbio_alignment(struct btrfs_bio *bbio) struct bio_vec bvec; struct bvec_iter iter; const u32 blocksize = fs_info->sectorsize; + const u32 alignment = min(blocksize, PAGE_SIZE); + const u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + const u32 length = bbio->bio.bi_iter.bi_size; - /* Metadata has no extra bs > ps alignment requirement. */ - if (!is_data_bbio(bbio)) - return; + /* The logical and length should still be aligned to blocksize. 
*/ + ASSERT(IS_ALIGNED(logical, blocksize) && IS_ALIGNED(length, blocksize) && + length != 0, "root=%llu inode=%llu logical=%llu length=%u", + btrfs_root_id(bbio->inode->root), + btrfs_ino(bbio->inode), logical, length); bio_for_each_bvec(bvec, &bbio->bio, iter) - ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) && - IS_ALIGNED(bvec.bv_len, blocksize), + ASSERT(IS_ALIGNED(bvec.bv_offset, alignment) && + IS_ALIGNED(bvec.bv_len, alignment), "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u", btrfs_root_id(bbio->inode->root), - btrfs_ino(bbio->inode), - bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT, - bbio->bio.bi_iter.bi_size, iter.bi_idx, - bvec.bv_offset, - bvec.bv_len); + btrfs_ino(bbio->inode), logical, length, iter.bi_idx, + bvec.bv_offset, bvec.bv_len); #endif } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 241a117ad7cc..1d4c3d2e2d6c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -340,21 +340,20 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) { - struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; struct bio *bio = &cb->bbio.bio; u32 offset = 0; + unsigned int findex = 0; while (offset < cb->compressed_len) { - struct folio *folio; + struct folio *folio = cb->compressed_folios[findex]; + u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); int ret; - u32 len = min_t(u32, cb->compressed_len - offset, - btrfs_min_folio_size(fs_info)); - folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)]; /* Maximum compressed extent is smaller than bio size limit. 
*/ ret = bio_add_folio(bio, folio, len, 0); ASSERT(ret); offset += len; + findex++; } } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 83a168613ee9..59cef7e376a0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4408,10 +4408,6 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4503,7 +4499,6 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4517,11 +4512,6 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4803,11 +4793,6 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4935,7 +4920,6 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct file *file = cmd->file; - struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; ssize_t ret; @@ -4950,11 +4934,6 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } - if (fs_info->sectorsize > PAGE_SIZE) { - ret = -ENOTTY; - goto out_acct; - } - sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { diff --git 
a/fs/btrfs/send.c b/fs/btrfs/send.c index 9312d74400a3..fa94105e139a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5634,14 +5634,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - /* - * Do not go through encoded read for bs > ps cases. - * - * Encoded send is using vmallocated pages as buffer, which we can - * not ensure every folio is large enough to contain a block. - */ - if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && - (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); From 1c094e6ccead7a12ed41cfba9119974657ad8971 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 11 Nov 2025 15:31:52 +0100 Subject: [PATCH 105/147] btrfs: make a few more ASSERTs verbose We have support for optional string to be printed in ASSERT() (added in 19468a623a9109 ("btrfs: enhance ASSERT() to take optional format string")), it's not yet everywhere it could be so add a few more files. 
Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 17 ++++++++------- fs/btrfs/space-info.c | 30 ++++++++++++++++----------- fs/btrfs/subpage.c | 10 ++++++--- fs/btrfs/transaction.c | 41 +++++++++++++++++++++++++----------- fs/btrfs/tree-checker.c | 2 +- fs/btrfs/tree-log.c | 46 +++++++++++++++++++++++++++-------------- fs/btrfs/zoned.c | 37 ++++++++++++++++++++------------- 7 files changed, 119 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ab259815a899..7e521d21ad40 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -966,8 +966,9 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; - ASSERT(stripe->mirror_num >= 1); - ASSERT(atomic_read(&stripe->pending_io) == 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); + ASSERT(atomic_read(&stripe->pending_io) == 0, + "atomic_read(&stripe->pending_io)=%d", atomic_read(&stripe->pending_io)); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { /* The current sector cannot be merged, submit the bio. */ @@ -1030,7 +1031,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, int ret; /* For scrub, our mirror_num should always start at 1. 
*/ - ASSERT(stripe->mirror_num >= 1); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, NULL, NULL); @@ -1170,7 +1171,7 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) int mirror; int i; - ASSERT(stripe->mirror_num > 0); + ASSERT(stripe->mirror_num >= 1, "stripe->mirror_num=%d", stripe->mirror_num); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); @@ -1486,7 +1487,7 @@ static int compare_extent_item_range(struct btrfs_path *path, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || - key.type == BTRFS_METADATA_ITEM_KEY); + key.type == BTRFS_METADATA_ITEM_KEY, "key.type=%u", key.type); if (key.type == BTRFS_METADATA_ITEM_KEY) len = fs_info->nodesize; else @@ -1591,7 +1592,7 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || - key.type == BTRFS_EXTENT_ITEM_KEY); + key.type == BTRFS_EXTENT_ITEM_KEY, "key.type=%u", key.type); *extent_start_ret = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) *size_ret = path->nodes[0]->fs_info->nodesize; @@ -1689,7 +1690,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, scrub_stripe_reset_bitmaps(stripe); /* The range must be inside the bg. 
*/ - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, + "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", + bg->start, logical_start, logical_end, bg->start + bg->length); ret = find_first_extent_item(extent_root, extent_path, logical_start, logical_len); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 61fd76c3da0d..6babbe333741 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -211,7 +211,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) if (btrfs_is_zoned(fs_info)) return fs_info->zone_size; - ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK, "flags=%llu", flags); if (flags & BTRFS_BLOCK_GROUP_DATA) return BTRFS_MAX_DATA_CHUNK_SIZE; @@ -262,8 +262,9 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag struct btrfs_space_info *sub_group; int ret; - ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); - ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY, + "parent->subgroup_id=%d", parent->subgroup_id); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY, "id=%d", id); sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); if (!sub_group) @@ -531,7 +532,9 @@ static void remove_ticket(struct btrfs_space_info *space_info, if (!list_empty(&ticket->list)) { list_del_init(&ticket->list); - ASSERT(space_info->reclaim_size >= ticket->bytes); + ASSERT(space_info->reclaim_size >= ticket->bytes, + "space_info->reclaim_size=%llu ticket->bytes=%llu", + space_info->reclaim_size, ticket->bytes); space_info->reclaim_size -= ticket->bytes; } @@ -1671,7 +1674,7 @@ static int handle_reserve_ticket(struct btrfs_space_info *space_info, priority_reclaim_data_space(space_info, ticket); break; default: - ASSERT(0); + ASSERT(0, "flush=%d", flush); break; } @@ -1683,7 +1686,8 @@ static int 
handle_reserve_ticket(struct btrfs_space_info *space_info, * releasing reserved space (if an error happens the expectation is that * space wasn't reserved at all). */ - ASSERT(!(ticket->bytes == 0 && ticket->error)); + ASSERT(!(ticket->bytes == 0 && ticket->error), + "ticket->bytes=%llu ticket->error=%d", ticket->bytes, ticket->error); trace_btrfs_reserve_ticket(space_info->fs_info, space_info->flags, orig_bytes, start_ns, flush, ticket->error); return ret; @@ -1758,7 +1762,7 @@ static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, int ret = -ENOSPC; bool pending_tickets; - ASSERT(orig_bytes); + ASSERT(orig_bytes, "orig_bytes=%llu", orig_bytes); /* * If have a transaction handle (current->journal_info != NULL), then * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor @@ -1767,9 +1771,9 @@ static int reserve_bytes(struct btrfs_space_info *space_info, u64 orig_bytes, */ if (current->journal_info) { /* One assert per line for easier debugging. */ - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); - ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL, "flush=%d", flush); + ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT, "flush=%d", flush); } if (flush == BTRFS_RESERVE_FLUSH_DATA) @@ -1930,8 +1934,10 @@ int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE || - flush == BTRFS_RESERVE_NO_FLUSH); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + flush == BTRFS_RESERVE_NO_FLUSH, "flush=%d", flush); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA, + "current->journal_info=0x%lx flush=%d", + (unsigned long)current->journal_info, flush); ret = reserve_bytes(space_info, bytes, flush); if (ret == -ENOSPC) { diff --git a/fs/btrfs/subpage.c 
b/fs/btrfs/subpage.c index 80cd27d3267f..60f23de779f9 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -180,7 +180,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && - IS_ALIGNED(len, fs_info->sectorsize)); + IS_ALIGNED(len, fs_info->sectorsize), "start=%llu len=%u", start, len); /* * The range check only works for mapped page, we can still have * unmapped page like dummy extent buffer pages. @@ -249,7 +249,9 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info, clear_bit(bit, bfs->bitmaps); cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); return last; @@ -328,7 +330,9 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, if (test_and_clear_bit(bit + start_bit, bfs->bitmaps)) cleared++; } - ASSERT(atomic_read(&bfs->nr_locked) >= cleared); + ASSERT(atomic_read(&bfs->nr_locked) >= cleared, + "atomic_read(&bfs->nr_locked)=%d cleared=%d", + atomic_read(&bfs->nr_locked), cleared); last = atomic_sub_and_test(cleared, &bfs->nr_locked); spin_unlock_irqrestore(&bfs->lock, flags); if (last) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03c62fd1a091..05ee4391c83a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -186,7 +186,8 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. 
*/ - ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING, + "cur_trans->state=%d", cur_trans->state); down_write(&fs_info->commit_root_sem); @@ -1025,13 +1026,18 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) { - ASSERT(!trans->bytes_reserved); - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->bytes_reserved == 0, + "trans->bytes_reserved=%llu", trans->bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } if (!trans->bytes_reserved) { - ASSERT(!trans->delayed_refs_bytes_reserved); + ASSERT(trans->delayed_refs_bytes_reserved == 0, + "trans->delayed_refs_bytes_reserved=%llu", + trans->delayed_refs_bytes_reserved); return; } @@ -1230,7 +1236,8 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) bool errors = false; int ret; - ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID, + "root_id(log_root)=%llu", btrfs_root_id(log_root)); ret = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY_LOG1) && @@ -1335,7 +1342,8 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. 
*/ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, @@ -1469,7 +1477,8 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ - ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING, + "trans->transaction->state=%d", trans->transaction->state); spin_lock(&fs_info->fs_roots_radix_lock); while (1) { @@ -1487,9 +1496,15 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) * At this point we can neither have tasks logging inodes * from a root nor trying to commit a log tree. */ - ASSERT(atomic_read(&root->log_writers) == 0); - ASSERT(atomic_read(&root->log_commit[0]) == 0); - ASSERT(atomic_read(&root->log_commit[1]) == 0); + ASSERT(atomic_read(&root->log_writers) == 0, + "atomic_read(&root->log_writers)=%d", + atomic_read(&root->log_writers)); + ASSERT(atomic_read(&root->log_commit[0]) == 0, + "atomic_read(&root->log_commit[0])=%d", + atomic_read(&root->log_commit[0])); + ASSERT(atomic_read(&root->log_commit[1]) == 0, + "atomic_read(&root->log_commit[1])=%d", + atomic_read(&root->log_commit[1])); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)btrfs_root_id(root), @@ -2158,7 +2173,8 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP, + "cur_trans->state=%d", cur_trans->state); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2185,7 +2201,8 @@ int 
btrfs_commit_transaction(struct btrfs_trans_handle *trans) struct btrfs_transaction *prev_trans = NULL; int ret; - ASSERT(refcount_read(&trans->use_count) == 1); + ASSERT(refcount_read(&trans->use_count) == 1, + "refcount_read(&trans->use_count)=%d", refcount_read(&trans->use_count)); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 5684750ca7a6..c21c21adf61e 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -186,7 +186,7 @@ static bool check_prev_ino(struct extent_buffer *leaf, key->type == BTRFS_INODE_EXTREF_KEY || key->type == BTRFS_DIR_INDEX_KEY || key->type == BTRFS_DIR_ITEM_KEY || - key->type == BTRFS_EXTENT_DATA_KEY); + key->type == BTRFS_EXTENT_DATA_KEY, "key->type=%u", key->type); /* * Only subvolume trees along with their reloc trees need this check. diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8e41fb906c6e..e40e1d746381 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -263,7 +263,7 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r struct btrfs_inode *inode; /* Only meant to be called for subvolume roots and not for log roots. */ - ASSERT(btrfs_is_fstree(btrfs_root_id(root))); + ASSERT(btrfs_is_fstree(btrfs_root_id(root)), "root_id=%llu", btrfs_root_id(root)); /* * We're holding a transaction handle whether we are logging or @@ -502,7 +502,7 @@ static int overwrite_item(struct walk_control *wc) * the leaf before writing into the log tree. See the comments at * copy_items() for more details. 
*/ - ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); + ASSERT(btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID, "root_id=%llu", btrfs_root_id(root)); item_size = btrfs_item_size(wc->log_leaf, wc->log_slot); src_ptr = btrfs_item_ptr_offset(wc->log_leaf, wc->log_slot); @@ -2282,7 +2282,8 @@ static noinline int replay_one_dir_item(struct walk_control *wc) struct btrfs_dir_item *di; /* We only log dir index keys, which only contain a single dir item. */ - ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY); + ASSERT(wc->log_key.type == BTRFS_DIR_INDEX_KEY, + "wc->log_key.type=%u", wc->log_key.type); di = btrfs_item_ptr(wc->log_leaf, wc->log_slot, struct btrfs_dir_item); ret = replay_one_name(wc, di); @@ -2434,7 +2435,7 @@ static noinline int check_item_in_log(struct walk_control *wc, * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). */ - ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY); + ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY, "dir_key->type=%u", dir_key->type); eb = wc->subvol_path->nodes[0]; slot = wc->subvol_path->slots[0]; @@ -3339,7 +3340,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); return ctx->log_ret; } - ASSERT(log_transid == root->log_transid); + ASSERT(log_transid == root->log_transid, + "log_transid=%d root->log_transid=%d", log_transid, root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ @@ -3479,7 +3481,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = root_log_ctx.log_ret; goto out; } - ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid, + "root_log_ctx.log_transid=%d log_root_tree->log_transid=%d", + root_log_ctx.log_transid, log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { @@ 
-3583,7 +3587,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid, + "last_log_commit(root)=%d log_transid=%d", + btrfs_get_root_last_log_commit(root), log_transid); btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: @@ -4027,7 +4033,7 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, int ret; int i; - ASSERT(count > 0); + ASSERT(count > 0, "count=%d", count); batch.nr = count; if (count == 1) { @@ -4080,7 +4086,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, btrfs_release_path(dst_path); last_index = batch.keys[count - 1].offset; - ASSERT(last_index > inode->last_dir_index_offset); + ASSERT(last_index > inode->last_dir_index_offset, + "last_index=%llu inode->last_dir_index_offset=%llu", + last_index, inode->last_dir_index_offset); /* * If for some unexpected reason the last item's index is not greater @@ -4404,7 +4412,9 @@ done: * change in the current transaction), then we don't need to log * a range, last_old_dentry_offset is == to last_offset. 
*/ - ASSERT(last_old_dentry_offset <= last_offset); + ASSERT(last_old_dentry_offset <= last_offset, + "last_old_dentry_offset=%llu last_offset=%llu", + last_old_dentry_offset, last_offset); if (last_old_dentry_offset < last_offset) ret = insert_dir_log_key(trans, log, path, ino, last_old_dentry_offset + 1, @@ -6528,7 +6538,7 @@ static int log_delayed_insertion_items(struct btrfs_trans_handle *trans, curr = list_next_entry(curr, log_list); } - ASSERT(batch.nr >= 1); + ASSERT(batch.nr >= 1, "batch.nr=%d", batch.nr); ret = insert_delayed_items_batch(trans, log, path, &batch, first); curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item, @@ -6572,7 +6582,9 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans, } last_dir_index = curr->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); ret = insert_dir_log_key(trans, inode->root->log_root, path, ino, first_dir_index, last_dir_index); @@ -6666,7 +6678,9 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans, goto next_batch; last_dir_index = last->index; - ASSERT(last_dir_index >= first_dir_index); + ASSERT(last_dir_index >= first_dir_index, + "last_dir_index=%llu first_dir_index=%llu", + last_dir_index, first_dir_index); /* * If this range starts right after where the previous one ends, * then we want to reuse the previous range item and change its @@ -6733,7 +6747,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, */ lockdep_assert_not_held(&inode->log_mutex); - ASSERT(!ctx->logging_new_delayed_dentries); + ASSERT(!ctx->logging_new_delayed_dentries, + "ctx->logging_new_delayed_dentries=%d", ctx->logging_new_delayed_dentries); ctx->logging_new_delayed_dentries = true; list_for_each_entry(item, delayed_ins_list, log_list) { @@ -7950,7 +7965,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct 
btrfs_path *path; struct fscrypt_name fname; - ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX, + "old_dir_index=%llu", old_dir_index); ret = fscrypt_setup_filename(&old_dir->vfs_inode, &old_dentry->d_name, 0, &fname); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 41a4a7d50bd3..0df78e825ca4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -93,7 +93,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, sector_t sector; for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { - ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL, + "zones[%d].type=%d", i, zones[i].type); empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); full[i] = sb_zone_is_full(&zones[i]); } @@ -166,14 +167,14 @@ static inline u32 sb_zone_number(int shift, int mirror) { u64 zone = U64_MAX; - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX, "mirror=%d", mirror); switch (mirror) { case 0: zone = 0; break; case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - ASSERT(zone <= U32_MAX); + ASSERT(zone <= U32_MAX, "zone=%llu", zone); return (u32)zone; } @@ -240,7 +241,8 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, unsigned int i; u32 zno; - ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); + ASSERT(IS_ALIGNED(pos, zinfo->zone_size), + "pos=%llu zinfo->zone_size=%llu", pos, zinfo->zone_size); zno = pos >> zinfo->zone_size_shift; /* * We cannot report zones beyond the zone end. 
So, it is OK to @@ -1055,8 +1057,10 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, bool have_sb; int i; - ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); - ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); + ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size), + "hole_start=%llu zinfo->zone_size=%llu", hole_start, zinfo->zone_size); + ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size), + "num_bytes=%llu zinfo->zone_size=%llu", num_bytes, zinfo->zone_size); while (pos < hole_end) { begin = pos >> shift; @@ -1172,8 +1176,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) u64 pos; int ret; - ASSERT(IS_ALIGNED(start, zinfo->zone_size)); - ASSERT(IS_ALIGNED(size, zinfo->zone_size)); + ASSERT(IS_ALIGNED(start, zinfo->zone_size), + "start=%llu, zinfo->zone_size=%llu", start, zinfo->zone_size); + ASSERT(IS_ALIGNED(size, zinfo->zone_size), + "size=%llu, zinfo->zone_size=%llu", size, zinfo->zone_size); if (begin + nbits > zinfo->nr_zones) return -ERANGE; @@ -1866,7 +1872,7 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. */ - ASSERT(em->offset == 0); + ASSERT(em->offset == 0, "em->offset=%llu", em->offset); em->disk_bytenr = logical; btrfs_free_extent_map(em); write_unlock(&em_tree->lock); @@ -2577,7 +2583,8 @@ again: struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; int factor; - ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC, + "reloc_sinfo->subgroup_id=%d", reloc_sinfo->subgroup_id); factor = btrfs_bg_type_to_factor(bg->flags); down_write(&space_info->groups_sem); @@ -2591,9 +2598,9 @@ again: space_info->disk_total -= bg->length * factor; space_info->disk_total -= bg->zone_unusable; /* There is no allocation ever happened. 
*/ - ASSERT(bg->used == 0); + ASSERT(bg->used == 0, "bg->used=%llu", bg->used); /* No super block in a block group on the zoned setup. */ - ASSERT(bg->bytes_super == 0); + ASSERT(bg->bytes_super == 0, "bg->bytes_super=%llu", bg->bytes_super); spin_unlock(&space_info->lock); bg->space_info = reloc_sinfo; @@ -2619,7 +2626,8 @@ again: /* Allocate new BG in the data relocation space_info. */ space_info = data_sinfo->sub_group[0]; - ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC, + "space_info->subgroup_id=%d", space_info->subgroup_id); ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); if (ret == 1) { @@ -2960,7 +2968,8 @@ int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num * This holds because we currently reset fully used then freed * block group. */ - ASSERT(reclaimed == bg->zone_capacity); + ASSERT(reclaimed == bg->zone_capacity, + "reclaimed=%llu bg->zone_capacity=%llu", reclaimed, bg->zone_capacity); bg->free_space_ctl->free_space += reclaimed; space_info->bytes_zone_unusable -= reclaimed; spin_unlock(&bg->lock); From 280dd7c106fd4c47756d19f6ae89862bb7bf7225 Mon Sep 17 00:00:00 2001 From: Zhen Ni Date: Fri, 14 Nov 2025 15:53:13 +0800 Subject: [PATCH 106/147] btrfs: fix incomplete parameter rename in btrfs_decompress() Commit 2c25716dcc25 ("btrfs: zlib: fix and simplify the inline extent decompression") renamed the 'start_byte' parameter to 'dest_pgoff' in the btrfs_decompress(). The remaining 'start_byte' references are inconsistent with the actual implementation and may cause confusion for developers. Ensure consistency between function declaration and implementation. 
Signed-off-by: Zhen Ni Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 ++- fs/btrfs/compression.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1d4c3d2e2d6c..7dda6cc68379 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1084,7 +1084,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) /* * a less complex decompression routine. Our compressed data fits in a * single page, and we want to read a single page out of it. - * start_byte tells us the offset into the compressed data we're interested in + * dest_pgoff tells us the offset into the destination folio where we write the + * decompressed data. */ int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, size_t destlen) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 40aa49fed18c..1947c6b1835d 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -95,7 +95,7 @@ int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inod u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, - unsigned long start_byte, size_t srclen, size_t destlen); + unsigned long dest_pgoff, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); From e7dd1182fcedee7c6097c9f49eba8de94a4364e3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 12:52:45 +0000 Subject: [PATCH 107/147] btrfs: fix leaf leak in an error path in btrfs_del_items() If the call to btrfs_del_leaf() fails we return without decrementing the extra ref we took on the leaf, therefore leaking it. Fix this by ensuring we drop the ref count before returning the error. 
Fixes: 751a27615dda ("btrfs: do not BUG_ON() on tree mod log failures at btrfs_del_ptr()") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6a9b6bbf78b..614aa4b56571 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4562,9 +4562,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; ret = btrfs_del_leaf(trans, root, path, leaf); + free_extent_buffer(leaf); if (ret < 0) return ret; - free_extent_buffer(leaf); ret = 0; } else { /* if we're still in the path, make sure From 86d3dc812f1e9aea58fabe8fcd42023f54abcad0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 12:59:19 +0000 Subject: [PATCH 108/147] btrfs: remove pointless return value update in btrfs_del_items() The call to btrfs_del_leaf() can only return an error (negative value) or zero (success). If we didn't get an error then 'ret' is zero, so it's pointless to set it to zero again. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 614aa4b56571..e683f961742a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4565,7 +4565,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, free_extent_buffer(leaf); if (ret < 0) return ret; - ret = 0; } else { /* if we're still in the path, make sure * we're dirty. 
Otherwise, one of the From fad159f69edabac046c725cdecf22275199b2dd0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 13:04:13 +0000 Subject: [PATCH 109/147] btrfs: add unlikely to critical error in btrfs_extend_item() It's not expected to get a data size greater than the leaf's free space, which would lead to a leaf dump and BUG(), so tag the if statement's expression as unlikely, hinting the compiler to potentially generate better code. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e683f961742a..b5cf1b6f5adc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4106,7 +4106,7 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); - if (btrfs_leaf_free_space(leaf) < data_size) { + if (unlikely(btrfs_leaf_free_space(leaf) < data_size)) { btrfs_print_leaf(leaf); BUG(); } From 7447263d7da24097f17147ffe5d9c43c317deb44 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 11:32:44 +0000 Subject: [PATCH 110/147] btrfs: always use left leaf variable in __push_leaf_right() The 'left' variable points to path->nodes[0] and path->nodes[0] is never changed, but some places use 'left' while others refer to path->nodes[0]. Update all sites to use 'left' as not only it's shorter it's also easier to reason since it means the left leaf and avoids any confusion with the sibling right leaf.
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b5cf1b6f5adc..dada50d86731 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3214,10 +3214,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - btrfs_clear_buffer_dirty(trans, path->nodes[0]); - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + if (btrfs_header_nritems(left) == 0) + btrfs_clear_buffer_dirty(trans, left); + btrfs_tree_unlock(left); + free_extent_buffer(left); path->nodes[0] = right; path->slots[1] += 1; } else { From 29bb40ed56ab9a1418cbe3e62c97a27b48f896e2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 11:46:34 +0000 Subject: [PATCH 111/147] btrfs: remove duplicated leaf dirty status clearing in __push_leaf_right() We have already called btrfs_clear_buffer_dirty() against the left leaf in the code above: btrfs_set_header_nritems(left, left_nritems); if (left_nritems) btrfs_mark_buffer_dirty(trans, left); else btrfs_clear_buffer_dirty(trans, left); So remove the second check for a 0 number of items in the left leaf and calling again btrfs_clear_buffer_dirty() against the left leaf. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dada50d86731..7265dd661cde 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3214,8 +3214,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; - if (btrfs_header_nritems(left) == 0) - btrfs_clear_buffer_dirty(trans, left); btrfs_tree_unlock(left); free_extent_buffer(left); path->nodes[0] = right; From 027358a0900913a395f99d911108375a7f1c50f4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 11:52:34 +0000 Subject: [PATCH 112/147] btrfs: always use right leaf variable in __push_leaf_left() The 'right' variable points to path->nodes[0] and path->nodes[0] is never changed, but some places use 'right' while others refer to path->nodes[0]. Update all sites to use 'right' as not only it's shorter it's also easier to reason since it means the right leaf and avoids any confusion with the sibling left leaf. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7265dd661cde..57b7d09d85cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3428,8 +3428,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); + btrfs_tree_unlock(right); + free_extent_buffer(right); path->nodes[0] = left; path->slots[1] -= 1; } else { From 5d8222a50ad37c98455da08b33ce49fe6b726c72 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 16:44:41 +0000 Subject: [PATCH 113/147] btrfs: abort transaction on item count overflow in __push_leaf_left() If we try to push an item count from the right leaf that is greater than the number of items in the leaf, we just emit a warning. This should never happen but if it does we get an underflow in the new number of items in the right leaf and chaos follows from it. So replace the warning with proper error handling, by aborting the transaction and returning -EUCLEAN, and proper logging by using btrfs_crit() instead of WARN(), which gives us proper formatting and information about the filesystem. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 57b7d09d85cc..8b54daf3d0e7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3393,9 +3393,13 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, old_left_nritems + push_items); /* fixup right node */ - if (push_items > right_nritems) - WARN(1, KERN_CRIT "push items %d nr %u\n", push_items, - right_nritems); + if (unlikely(push_items > right_nritems)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_crit(fs_info, "push items (%d) > right leaf items (%u)", + push_items, right_nritems); + goto out; + } if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - From c2b2504ece4089697bb7db115dc91e344dfed76f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Nov 2025 12:07:14 +0000 Subject: [PATCH 114/147] btrfs: update check_skip variable after unlocking current node There's no need to update the local variable 'check_skip' to false inside the critical section delimited by the lock of the current node, so do it after unlocking the node. 
Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 8b54daf3d0e7..46262939e873 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1435,8 +1435,8 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } if (i >= lowest_unlock && i > skip_level) { - check_skip = false; btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); + check_skip = false; path->locks[i] = 0; if (write_lock_level && i > min_write_lock_level && From d7fe41044b3ac8f9b5965de499a13ac9ae947e79 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Nov 2025 16:00:04 +0000 Subject: [PATCH 115/147] btrfs: use bool type for btrfs_path members used as booleans Many fields of struct btrfs_path are used as booleans but their type is an unsigned int (of one 1 bit width to save space). Change the type to bool keeping the :1 suffix so that they combine with the previous u8 fields in order to save space. This makes the code more clear by using explicit true/false and more in line with the preferred style, preserving the size of the structure. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 16 ++++++++-------- fs/btrfs/block-group.c | 8 ++++---- fs/btrfs/ctree.c | 28 ++++++++++++++-------------- fs/btrfs/ctree.h | 16 ++++++++-------- fs/btrfs/defrag.c | 4 ++-- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/file-item.c | 12 ++++++------ fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/free-space-tree.c | 4 ++-- fs/btrfs/inode-item.c | 2 +- fs/btrfs/inode.c | 4 ++-- fs/btrfs/qgroup.c | 4 ++-- fs/btrfs/raid-stripe-tree.c | 4 ++-- fs/btrfs/relocation.c | 8 ++++---- fs/btrfs/scrub.c | 20 ++++++++++---------- fs/btrfs/send.c | 14 +++++++------- fs/btrfs/tree-log.c | 20 ++++++++++---------- fs/btrfs/volumes.c | 6 +++--- fs/btrfs/xattr.c | 2 +- 20 files changed, 94 insertions(+), 94 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index eff2d388a706..78da47a3d00e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1408,12 +1408,12 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, if (!path) return -ENOMEM; if (!ctx->trans) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } if (ctx->time_seq == BTRFS_SEQ_LAST) - path->skip_locking = 1; + path->skip_locking = true; again: head = NULL; @@ -1560,7 +1560,7 @@ again: btrfs_release_path(path); - ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, !path->skip_locking); if (ret) goto out; @@ -2825,8 +2825,8 @@ struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_inf } /* Current backref iterator only supports iteration in commit root */ - ret->path->search_commit_root = 1; - ret->path->skip_locking = 1; + ret->path->search_commit_root = true; + ret->path->skip_locking = true; ret->fs_info = fs_info; return ret; @@ -3299,8 +3299,8 @@ static 
int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, level = cur->level + 1; /* Search the tree to find parent blocks referring to the block */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; path->lowest_level = level; ret = btrfs_search_slot(NULL, root, tree_key, path, 0, 0); path->lowest_level = 0; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b964eacc1610..ebbf04501782 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -613,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET)); - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; search_offset = index * div_u64(block_group->length, max_index); @@ -744,8 +744,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) * root to add free space. So we skip locking and search the commit * root, since its read-only */ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; key.objectid = last; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 46262939e873..51dc8e0bc9c1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1709,9 +1709,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when - * p->search_commit_root = 1. + * p->search_commit_root is true. 
*/ - ASSERT(p->skip_locking == 1); + ASSERT(p->skip_locking); goto out; } @@ -3860,10 +3860,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - path->keep_locks = 1; - path->search_for_split = 1; + path->keep_locks = true; + path->search_for_split = true; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - path->search_for_split = 0; + path->search_for_split = false; if (ret > 0) ret = -EAGAIN; if (ret < 0) @@ -3890,11 +3890,11 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, if (ret) goto err; - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); return 0; err: - path->keep_locks = 0; + path->keep_locks = false; return ret; } @@ -4610,11 +4610,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; - int keep_locks = path->keep_locks; + const bool keep_locks = path->keep_locks; ASSERT(!path->nowait); ASSERT(path->lowest_level == 0); - path->keep_locks = 1; + path->keep_locks = true; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -4704,7 +4704,7 @@ out: * 0 is returned if another key is found, < 0 if there are any errors * and 1 is returned if there are no higher keys in the tree * - * path->keep_locks should be set to 1 on the search made before + * path->keep_locks should be set to true on the search made before * calling this function. 
*/ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, @@ -4803,13 +4803,13 @@ again: next = NULL; btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); } else { if (path->need_commit_sem) { - path->need_commit_sem = 0; + path->need_commit_sem = false; need_commit_sem = true; if (path->nowait) { if (!down_read_trylock(&fs_info->commit_root_sem)) { @@ -4822,7 +4822,7 @@ again: } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); } - path->keep_locks = 0; + path->keep_locks = false; if (ret < 0) goto done; @@ -4961,7 +4961,7 @@ done: if (need_commit_sem) { int ret2; - path->need_commit_sem = 1; + path->need_commit_sem = true; ret2 = finish_need_commit_sem_search(path); up_read(&fs_info->commit_root_sem); if (ret2) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 16dd11c48531..692370fc07b2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -65,21 +65,21 @@ struct btrfs_path { * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ - unsigned int search_for_split:1; + bool search_for_split:1; /* Keep some upper locks as we walk down. */ - unsigned int keep_locks:1; - unsigned int skip_locking:1; - unsigned int search_commit_root:1; - unsigned int need_commit_sem:1; - unsigned int skip_release_on_error:1; + bool keep_locks:1; + bool skip_locking:1; + bool search_commit_root:1; + bool need_commit_sem:1; + bool skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). 
*/ - unsigned int search_for_extension:1; + bool search_for_extension:1; /* Stop search if any locks need to be taken (for read) */ - unsigned int nowait:1; + bool nowait:1; }; #define BTRFS_PATH_AUTO_FREE(path_name) \ diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index a4cc1bc63562..2e3c011d410a 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -472,7 +472,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, memcpy(&key, &root->defrag_progress, sizeof(key)); } - path->keep_locks = 1; + path->keep_locks = true; ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); if (ret < 0) @@ -515,7 +515,7 @@ static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* * Now that we reallocated the node we can find the next key. Note that * btrfs_find_next_key() can release our path and do another search - * without COWing, this is because even with path->keep_locks = 1, + * without COWing, this is because even with path->keep_locks == true, * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a * node when path->slots[node_level - 1] does not point to the last * item or a slot beyond the last item (ctree.c:unlock_up()). 
Therefore diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a4eaef60549e..b6c7da8e1bc8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -489,8 +489,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = src_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 86004b8daa96..819e0a15e8e7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -789,7 +789,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); - path->search_for_extension = 1; + path->search_for_extension = true; } else extra_size = -1; @@ -955,7 +955,7 @@ again: if (!path->keep_locks) { btrfs_release_path(path); - path->keep_locks = 1; + path->keep_locks = true; goto again; } @@ -976,11 +976,11 @@ out_no_entry: *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: if (path->keep_locks) { - path->keep_locks = 0; + path->keep_locks = false; btrfs_unlock_up_safe(path, 1); } if (insert) - path->search_for_extension = 0; + path->search_for_extension = false; return ret; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index e7c219e83ff0..b17632ea085f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -394,8 +394,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * between reading the free space cache and updating the csum tree. */ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } /* @@ -423,8 +423,8 @@ int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) * from across transactions. 
*/ if (bbio->csum_search_commit_root) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; down_read(&fs_info->commit_root_sem); } @@ -1177,10 +1177,10 @@ again: } btrfs_release_path(path); - path->search_for_extension = 1; + path->search_for_extension = true; ret = btrfs_search_slot(trans, root, &file_key, path, csum_size, 1); - path->search_for_extension = 0; + path->search_for_extension = false; if (ret < 0) goto out; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6ccb492eae8e..f0f72850fab2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -968,8 +968,8 @@ int load_free_space_cache(struct btrfs_block_group *block_group) path = btrfs_alloc_path(); if (!path) return 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; /* * We must pass a path with search_commit_root set to btrfs_iget in diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 26eae347739f..47745ae23c7d 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1694,8 +1694,8 @@ int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl) * Just like caching_thread() doesn't want to deadlock on the extent * tree, we don't want to deadlock on the free space tree. 
*/ - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; path->reada = READA_FORWARD; info = btrfs_search_free_space_info(NULL, block_group, path, 0); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 1bd73b80f9fa..98dacfd03234 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -312,7 +312,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->skip_release_on_error = 1; + path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1a0c380ef464..fc0f0c46ab22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7111,8 +7111,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, * point the commit_root has everything we need. */ if (btrfs_is_free_space_inode(inode)) { - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1956e4bf2302..58fb55644be5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3834,8 +3834,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) * Rescan should only search for commit root, and any later difference * should be recorded by qgroup */ - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; while (!ret && !(stopped = rescan_should_stop(fs_info))) { trans = btrfs_start_transaction(fs_info->fs_root, 0); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index f5c616115254..2987cb7c686e 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -388,8 +388,8 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, return -ENOMEM; if 
(stripe->rst_search_commit_root) { - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; } ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 739fca944296..5bfefc3e9c06 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3165,8 +3165,8 @@ again: key.offset = blocksize; } - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) return ret; @@ -3358,8 +3358,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7e521d21ad40..f6c2196322c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -463,10 +463,10 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - sctx->extent_path.search_commit_root = 1; - sctx->extent_path.skip_locking = 1; - sctx->csum_path.search_commit_root = 1; - sctx->csum_path.skip_locking = 1; + sctx->extent_path.search_commit_root = true; + sctx->extent_path.skip_locking = true; + sctx->csum_path.search_commit_root = true; + sctx->csum_path.skip_locking = true; for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; @@ -2202,10 +2202,10 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, * as the data stripe bytenr may be smaller than previous extent. Thus * we have to use our own extent/csum paths. 
*/ - extent_path.search_commit_root = 1; - extent_path.skip_locking = 1; - csum_path.search_commit_root = 1; - csum_path.skip_locking = 1; + extent_path.search_commit_root = true; + extent_path.skip_locking = true; + csum_path.search_commit_root = true; + csum_path.skip_locking = true; for (int i = 0; i < data_stripes; i++) { int stripe_index; @@ -2688,8 +2688,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, return -ENOMEM; path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = scrub_dev->devid; key.type = BTRFS_DEV_EXTENT_KEY; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa94105e139a..3d437024e8bc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -633,9 +633,9 @@ static struct btrfs_path *alloc_path_for_send(void) path = btrfs_alloc_path(); if (!path) return NULL; - path->search_commit_root = 1; - path->skip_locking = 1; - path->need_commit_sem = 1; + path->search_commit_root = true; + path->skip_locking = true; + path->need_commit_sem = true; return path; } @@ -7622,10 +7622,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - left_path->search_commit_root = 1; - left_path->skip_locking = 1; - right_path->search_commit_root = 1; - right_path->skip_locking = 1; + left_path->search_commit_root = true; + left_path->skip_locking = true; + right_path->search_commit_root = true; + right_path->skip_locking = true; /* * Strategy: Go to the first items of both trees. 
Then do diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e40e1d746381..cc27f87c4904 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -602,9 +602,9 @@ static int overwrite_item(struct walk_control *wc) insert: btrfs_release_path(wc->subvol_path); /* try to insert the key into the destination tree */ - wc->subvol_path->skip_release_on_error = 1; + wc->subvol_path->skip_release_on_error = true; ret = btrfs_insert_empty_item(trans, root, wc->subvol_path, &wc->log_key, item_size); - wc->subvol_path->skip_release_on_error = 0; + wc->subvol_path->skip_release_on_error = false; dst_eb = wc->subvol_path->nodes[0]; dst_slot = wc->subvol_path->slots[0]; @@ -5706,8 +5706,8 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path = btrfs_alloc_path(); if (!search_path) return -ENOMEM; - search_path->search_commit_root = 1; - search_path->skip_locking = 1; + search_path->search_commit_root = true; + search_path->skip_locking = true; while (cur_offset < item_size) { u64 parent; @@ -6026,8 +6026,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (WARN_ON_ONCE(ret > 0)) { @@ -6047,8 +6047,8 @@ static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino, } btrfs_release_path(path); - path->search_commit_root = 0; - path->skip_locking = 0; + path->search_commit_root = false; + path->skip_locking = false; return ret; } @@ -7169,8 +7169,8 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->skip_locking = 1; - path->search_commit_root = 1; + path->skip_locking = true; + path->search_commit_root = true; key.objectid = ino; key.type = BTRFS_INODE_REF_KEY; diff --git a/fs/btrfs/volumes.c 
b/fs/btrfs/volumes.c index 75a34ed95c74..e6a3f3ceb74b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1710,8 +1710,8 @@ again: } path->reada = READA_FORWARD; - path->search_commit_root = 1; - path->skip_locking = 1; + path->search_commit_root = true; + path->skip_locking = true; key.objectid = device->devid; key.type = BTRFS_DEV_EXTENT_KEY; @@ -7448,7 +7448,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * chunk tree, to keep it simple, just skip locking on the chunk tree. */ ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); - path->skip_locking = 1; + path->skip_locking = true; /* * Read all device items, and then all the chunk items. All diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3d27eb1e2f74..98d6aa3b7d6a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,7 +85,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->skip_release_on_error = 1; + path->skip_release_on_error = true; if (!value) { di = btrfs_lookup_xattr(trans, root, path, From e21756fc4aa78539b9cb9b45bfc8c4fd12322bc5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 17 Nov 2025 12:02:29 +0000 Subject: [PATCH 116/147] btrfs: use booleans for delalloc arguments and struct find_free_extent_ctl The struct find_free_extent_ctl uses an int for the 'delalloc' field but it's always used as a boolean, and its value is used to be passed to several functions to signal if we are dealing with delalloc. The same goes for the 'is_data' argument from btrfs_reserve_extent(). So change the type from int to bool and move the field definition in the find_free_extent_ctl structure so that it's close to other bool fields and reduces the size of the structure from 144 down to 136 bytes (at the moment it's only declared in the stack of btrfs_reserve_extent(), never allocated otherwise). 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/block-group.h | 2 +- fs/btrfs/direct-io.c | 2 +- fs/btrfs/extent-tree.c | 16 +++++++--------- fs/btrfs/extent-tree.h | 4 ++-- fs/btrfs/inode.c | 8 ++++---- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ebbf04501782..8ae73123b610 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3802,7 +3802,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. */ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class) { struct btrfs_space_info *space_info = cache->space_info; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9172104a5889..5f933455118c 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -345,7 +345,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, - u64 ram_bytes, u64 num_bytes, int delalloc, + u64 ram_bytes, u64 num_bytes, bool delalloc, bool force_wrong_size_class); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, bool is_delalloc); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 962fccceffd6..07e19e88ba4b 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -186,7 +186,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len); again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, - 0, alloc_hint, &ins, 1, 1); + 0, alloc_hint, &ins, 
true, true); if (ret == -EAGAIN) { ASSERT(btrfs_is_zoned(fs_info)); wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 819e0a15e8e7..a3646440c4fe 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3587,15 +3587,14 @@ enum btrfs_loop_type { }; static inline void -btrfs_lock_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_lock_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) down_read(&cache->data_rwsem); } static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, - int delalloc) + bool delalloc) { btrfs_get_block_group(cache); if (delalloc) @@ -3605,7 +3604,7 @@ static inline void btrfs_grab_block_group(struct btrfs_block_group *cache, static struct btrfs_block_group *btrfs_lock_cluster( struct btrfs_block_group *block_group, struct btrfs_free_cluster *cluster, - int delalloc) + bool delalloc) __acquires(&cluster->refill_lock) { struct btrfs_block_group *used_bg = NULL; @@ -3642,8 +3641,7 @@ static struct btrfs_block_group *btrfs_lock_cluster( } static inline void -btrfs_release_block_group(struct btrfs_block_group *cache, - int delalloc) +btrfs_release_block_group(struct btrfs_block_group *cache, bool delalloc) { if (delalloc) up_read(&cache->data_rwsem); @@ -4033,7 +4031,7 @@ static int do_allocation(struct btrfs_block_group *block_group, static void release_block_group(struct btrfs_block_group *block_group, struct find_free_extent_ctl *ffe_ctl, - int delalloc) + bool delalloc) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: @@ -4689,7 +4687,7 @@ loop: int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc) + struct btrfs_key *ins, bool is_data, bool delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; struct find_free_extent_ctl ffe_ctl = {}; @@ 
-5166,7 +5164,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_CAST(block_rsv); ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, - empty_size, hint, &ins, 0, 0); + empty_size, hint, &ins, false, false); if (ret) goto out_unuse; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index e573509c5a71..f96a300a2db4 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -30,7 +30,6 @@ struct find_free_extent_ctl { u64 min_alloc_size; u64 empty_size; u64 flags; - int delalloc; /* Where to start the search inside the bg */ u64 search_start; @@ -40,6 +39,7 @@ struct find_free_extent_ctl { struct btrfs_free_cluster *last_ptr; bool use_cluster; + bool delalloc; bool have_caching_bg; bool orig_have_caching_bg; @@ -137,7 +137,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); + struct btrfs_key *ins, bool is_data, bool delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, bool full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fc0f0c46ab22..f71a5f7f55b9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1136,7 +1136,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, - 0, *alloc_hint, &ins, 1, 1); + 0, *alloc_hint, &ins, true, true); if (ret) { /* * We can't reserve contiguous space for the compressed size. 
@@ -1359,7 +1359,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, - &ins, 1, 1); + &ins, true, true); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned @@ -9106,7 +9106,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, - min_size, 0, *alloc_hint, &ins, 1, 0); + min_size, 0, *alloc_hint, &ins, true, false); if (ret) break; @@ -9914,7 +9914,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, } ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, - disk_num_bytes, 0, 0, &ins, 1, 1); + disk_num_bytes, 0, 0, &ins, true, true); if (ret) goto out_delalloc_release; extent_reserved = true; From 7a832b870b8a7a6a16a51dcdd8adde47b27f0169 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 17 Nov 2025 12:15:09 +0000 Subject: [PATCH 117/147] btrfs: place all boolean fields together in struct find_free_extent_ctl Move the 'retry_uncached' and 'hint' fields close to the other boolean fields so that we remove a hole from the structure and reduce its size from 136 bytes down to 128 bytes. Currently this structure is only allocated in the stack of btrfs_reserve_extent(). 
Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index f96a300a2db4..71bb8109c969 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -49,6 +49,16 @@ struct find_free_extent_ctl { /* Allocation is called for data relocation */ bool for_data_reloc; + /* + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. + */ + bool retry_uncached; + + /* Whether or not the allocator is currently following a hint. */ + bool hinted; + /* RAID index, converted from flags */ int index; @@ -57,13 +67,6 @@ struct find_free_extent_ctl { */ int loop; - /* - * Set to true if we're retrying the allocation on this block group - * after waiting for caching progress, this is so that we retry only - * once before moving on to another block group. - */ - bool retry_uncached; - /* If current block group is cached */ int cached; @@ -82,9 +85,6 @@ struct find_free_extent_ctl { /* Allocation policy */ enum btrfs_extent_allocation_policy policy; - /* Whether or not the allocator is currently following a hint */ - bool hinted; - /* Size class of block groups to prefer in early loops */ enum btrfs_block_group_size_class size_class; }; From 54df8b80cc63aa0f22c4590cad11542731ed43ff Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 3 Nov 2025 12:51:09 +1030 Subject: [PATCH 118/147] btrfs: scrub: always update btrfs_scrub_progress::last_physical [BUG] When a scrub failed immediately without any byte scrubbed, the returned btrfs_scrub_progress::last_physical will always be 0, even if there is a non-zero @start passed into btrfs_scrub_dev() for resume cases. 
This will reset the progress and make later scrub resume start from the beginning. [CAUSE] The function btrfs_scrub_dev() accepts a @progress parameter to copy its updated progress to the caller, but there are cases where we either don't touch progress::last_physical at all or copy 0 into last_physical: - last_physical not updated at all If some error happened before scrubbing any super block or chunk, we will not copy the progress, leaving the @last_physical untouched. E.g. failed to allocate @sctx, scrubbing a missing device or even there is already a running scrub and so on. All those cases won't touch @progress at all, resulting in last_physical being untouched and left as 0 for most cases. - Error out before scrubbing any bytes In those cases we allocated @sctx, and sctx->stat.last_physical is all zero (initialized by kvzalloc()). Unfortunately some critical errors happened during scrub_enumerate_chunks() or scrub_supers() before any stripe is really scrubbed. In that case although we will copy sctx->stat back to @progress, since no byte is really scrubbed, last_physical will be overwritten to 0. [FIX] Make sure the parameter @progress always has its @last_physical member updated to the @start parameter inside btrfs_scrub_dev(). At the very beginning of the function, set @progress->last_physical to @start, so that even if we error out without doing progress copying, last_physical is still at @start. Then, after we get @sctx allocated, set sctx->stat.last_physical to @start; this will make sure that even if we didn't get any byte scrubbed, at the progress copying stage the @last_physical is not left as zero. This should resolve the resume progress reset problem. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f6c2196322c6..a40ee41f42c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3104,6 +3104,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, unsigned int nofs_flag; bool need_commit = false; + /* Set the basic fallback @last_physical before we got a sctx. */ + if (progress) + progress->last_physical = start; + if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3122,6 +3126,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, sctx = scrub_setup_ctx(fs_info, is_dev_replace); if (IS_ERR(sctx)) return PTR_ERR(sctx); + sctx->stat.last_physical = start; ret = scrub_workers_get(fs_info); if (ret) From 9042dc00023f6d8e8e52cf3df78ef3ba3e212ece Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Nov 2025 18:54:25 +1030 Subject: [PATCH 119/147] btrfs: raid56: add an overview for the btrfs_raid_bio structure The structure needs to track both the pages from higher layer bio and internal pages, thus it can be a little complex to grasp. Add an overview of the structure, especially how we track different pages from higher layer bios and internal ones, to save some time for future developers. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 42a45716fb03..3ed23303d7fa 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -24,6 +24,76 @@ enum btrfs_rbio_ops { BTRFS_RBIO_PARITY_SCRUB, }; +/* + * Overview of btrfs_raid_bio. + * + * One btrfs_raid_bio represents a full stripe of RAID56, including both data + * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K). 
+ * + * One btrfs_raid_bio can have one or more bios from higher layer, covering + * part or all of the data stripes. + * + * [PAGES FROM HIGHER LAYER BIOS] + * Higher layer bios are in the btrfs_raid_bio::bio_list. + * + * Pages from the bio_list are represented like the following: + * + * bio_list: |<- Bio 1 ->| |<- Bio 2 ->| ... + * bio_paddrs: [0] [1] [2] [3] [4] [5] ... + * + * If there is a bio covering a sector (one btrfs fs block), the corresponding + * pointer in btrfs_raid_bio::bio_paddrs[] will point to the physical address + * (with the offset inside the page) of the corresponding bio. + * + * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will + * be INVALID_PADDR. + * + * The length of each entry in bio_paddrs[] is sectorsize. + * + * [PAGES FOR INTERNAL USAGES] + * Pages not covered by any bio or belonging to P/Q stripes are stored in + * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following: + * + * stripe_pages: |<- Page 0 ->|<- Page 1 ->| ... + * stripe_paddrs: [0] [1] [2] [3] [4] ... + * + * stripe_pages[] array stores all the pages covering the full stripe, including + * data and P/Q pages. + * stripe_pages[0] is the first page of the first data stripe. + * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second + * data stripe. + * + * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write + * (the bio covers all data stripes) there is no need to allocate pages for + * data stripes (can grab from bio_paddrs[]). + * + * If the corresponding page of stripe_paddrs[i] is not allocated, the value of + * stripe_paddrs[i] will be INVALID_PADDR. + * + * The length of each entry in stripe_paddrs[] is sectorsize. + * + * [LOCATING A SECTOR] + * To locate a sector for IO, we need the following info: + * + * - stripe_nr + * Starts from 0 (representing the first data stripe), ends at + * @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe). 
+ * + * - sector_nr + * Starts from 0 (representing the first sector of the stripe), ends + * at BTRFS_STRIPE_LEN / sectorsize - 1. + * + * All existing bitmaps are based on sector numbers. + * + * - from which array + * Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the + * bio_paddrs[] (aka, from the higher layer bios). + * + * For IO, a physical address is returned, so that we can extract the page and + * the offset inside the page for IO. + * A special value INVALID_PADDR represents when the physical address is invalid, + * normally meaning there is no page allocated for the specified sector. + */ struct btrfs_raid_bio { struct btrfs_io_context *bioc; From 91cd1b586578017e20103771615db75dd8df5727 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Nov 2025 19:11:36 +1030 Subject: [PATCH 120/147] btrfs: raid56: introduce a new parameter to locate a sector Since we cannot ensure that all bios from the higher layer are backed by large folios (e.g. direct IO, encoded read/write/send), we need the ability to locate sub-block (aka, a page) inside a full stripe. So the existing @stripe_nr + @sector_nr combination is not enough to locate such page for bs > ps cases. Introduce a new parameter, @step_nr, to locate the page of a larger fs block. The naming is following the conventions used inside btrfs elsewhere, where one step is min(sectorsize, PAGE_SIZE). It's still a preparation, only touching the following aspects: - btrfs_dump_rbio() To show the new @sector_nsteps member. - btrfs_raid_bio::sector_nsteps Recording how many steps there are inside a fs block. - Enlarge btrfs_raid_bio::*_paddrs[] size To take @sector_nsteps into consideration. - index_one_bio() - index_stripe_sectors() - memcpy_from_bio_to_stripe() - cache_rbio_pages() - need_read_stripe_sectors() Those functions are iterating *_paddrs[], which needs to take sector_nsteps into consideration. 
- Rename rbio_stripe_sector_index() to rbio_sector_index() The "stripe" part is not that helpful. And an extra ASSERT() before returning the result. - Add a new rbio_paddr_index() helper This will take the extra @step_nr into consideration. - The comments of btrfs_raid_bio Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 92 +++++++++++++++++++++++++++++++---------------- fs/btrfs/raid56.h | 22 ++++++++++-- 2 files changed, 80 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 95cc243d9c8b..7f01178be7d8 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,10 +66,10 @@ static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, dump_bioc(fs_info, rbio->bioc); btrfs_crit(fs_info, -"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", +"rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u sector_nsteps=%u scrubp=%u dbitmap=0x%lx", rbio->flags, rbio->nr_sectors, rbio->nr_data, rbio->real_stripes, rbio->stripe_nsectors, - rbio->scrubp, rbio->dbitmap); + rbio->sector_nsteps, rbio->scrubp, rbio->dbitmap); } #define ASSERT_RBIO(expr, rbio) \ @@ -229,15 +229,20 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) static void memcpy_from_bio_to_stripe(struct btrfs_raid_bio *rbio, unsigned int sector_nr) { - phys_addr_t dst = rbio->stripe_paddrs[sector_nr]; - phys_addr_t src = rbio->bio_paddrs[sector_nr]; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); - ASSERT(dst != INVALID_PADDR); - ASSERT(src != INVALID_PADDR); + ASSERT(sector_nr < rbio->nr_sectors); + for (int i = 0; i < rbio->sector_nsteps; i++) { + unsigned int index = sector_nr * rbio->sector_nsteps + i; + phys_addr_t dst = rbio->stripe_paddrs[index]; + phys_addr_t src = rbio->bio_paddrs[index]; - memcpy_page(phys_to_page(dst), offset_in_page(dst), - phys_to_page(src), offset_in_page(src), - rbio->bioc->fs_info->sectorsize); + ASSERT(dst 
!= INVALID_PADDR); + ASSERT(src != INVALID_PADDR); + + memcpy_page(phys_to_page(dst), offset_in_page(dst), + phys_to_page(src), offset_in_page(src), step); + } } /* @@ -260,7 +265,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (rbio->bio_paddrs[i] == INVALID_PADDR) { + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == INVALID_PADDR) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -320,11 +325,12 @@ static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbi */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); u32 offset; int i; - for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + for (i = 0, offset = 0; i < rbio->nr_sectors * rbio->sector_nsteps; + i++, offset += step) { int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); @@ -668,21 +674,41 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, - unsigned int sector_nr) +/* Return the sector index for @stripe_nr and @sector_nr. */ +static unsigned int rbio_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { + unsigned int ret; + ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - return stripe_nr * rbio->stripe_nsectors + sector_nr; + ret = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(ret < rbio->nr_sectors); + return ret; +} + +/* Return the paddr array index for @stripe_nr, @sector_nr and @step_nr. 
*/ +static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr, + unsigned int step_nr) +{ + unsigned int ret; + + ASSERT_RBIO_SECTOR(step_nr < rbio->sector_nsteps, rbio, step_nr); + + ret = rbio_sector_index(rbio, stripe_nr, sector_nr) * rbio->sector_nsteps + step_nr; + ASSERT(ret < rbio->nr_sectors * rbio->sector_nsteps); + return ret; } /* Return a paddr from rbio->stripe_sectors, not from the bio list */ static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { - return rbio->stripe_paddrs[rbio_stripe_sector_index(rbio, stripe_nr, sector_nr)]; + return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; } /* Grab a paddr inside P stripe */ @@ -985,6 +1011,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; + const unsigned int step = min(fs_info->sectorsize, PAGE_SIZE); + const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ @@ -1007,8 +1035,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); - rbio->bio_paddrs = kcalloc(num_sectors, sizeof(phys_addr_t), GFP_NOFS); - rbio->stripe_paddrs = kcalloc(num_sectors, sizeof(phys_addr_t), GFP_NOFS); + rbio->bio_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); + rbio->stripe_paddrs = kcalloc(num_sectors * sector_nsteps, sizeof(phys_addr_t), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); rbio->stripe_uptodate_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); @@ 
-1019,7 +1047,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, kfree(rbio); return ERR_PTR(-ENOMEM); } - for (int i = 0; i < num_sectors; i++) { + for (int i = 0; i < num_sectors * sector_nsteps; i++) { rbio->stripe_paddrs[i] = INVALID_PADDR; rbio->bio_paddrs[i] = INVALID_PADDR; } @@ -1037,6 +1065,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; + rbio->sector_nsteps = sector_nsteps; refcount_set(&rbio->refs, 1); atomic_set(&rbio->stripes_pending, 0); @@ -1192,18 +1221,19 @@ static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_l static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 step_bits = min(fs_info->sectorsize_bits, PAGE_SHIFT); struct bvec_iter iter = bio->bi_iter; phys_addr_t paddr; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - btrfs_bio_for_each_block(paddr, bio, &iter, sectorsize) { - unsigned int index = (offset >> sectorsize_bits); + btrfs_bio_for_each_block(paddr, bio, &iter, step) { + unsigned int index = (offset >> step_bits); rbio->bio_paddrs[index] = paddr; - offset += sectorsize; + offset += step; } } @@ -1303,7 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) sector_paddr_in_rbio(rbio, stripe, sectornr, 0)); /* Then add the parity stripe */ - set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data, sectornr), + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), rbio->stripe_uptodate_bitmap); pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sectornr)); @@ -1312,7 +1342,7 @@ static void 
generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) * RAID6, add the qstripe and call the library function * to fill in our p/q */ - set_bit(rbio_stripe_sector_index(rbio, rbio->nr_data + 1, sectornr), + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), rbio->stripe_uptodate_bitmap); pointers[stripe++] = kmap_local_paddr(rbio_qstripe_paddr(rbio, sectornr)); @@ -1932,7 +1962,7 @@ pstripe: if (ret < 0) goto cleanup; - set_bit(rbio_stripe_sector_index(rbio, faila, sector_nr), + set_bit(rbio_sector_index(rbio, faila, sector_nr), rbio->stripe_uptodate_bitmap); } if (failb >= 0) { @@ -1940,7 +1970,7 @@ pstripe: if (ret < 0) goto cleanup; - set_bit(rbio_stripe_sector_index(rbio, failb, sector_nr), + set_bit(rbio_sector_index(rbio, failb, sector_nr), rbio->stripe_uptodate_bitmap); } @@ -2288,7 +2318,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { - phys_addr_t paddr = rbio->stripe_paddrs[i]; + phys_addr_t paddr = rbio->stripe_paddrs[i * rbio->sector_nsteps]; /* * We have a sector which doesn't have page nor uptodate, @@ -2746,7 +2776,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) * The bio cache may have handed us an uptodate sector. If so, * use it. */ - if (test_bit(rbio_stripe_sector_index(rbio, stripe, sectornr), + if (test_bit(rbio_sector_index(rbio, stripe, sectornr), rbio->stripe_uptodate_bitmap)) continue; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3ed23303d7fa..1f463ecf7e41 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -48,7 +48,7 @@ enum btrfs_rbio_ops { * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will * be INVALID_PADDR. * - * The length of each entry in bio_paddrs[] is sectorsize. + * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)). 
* * [PAGES FOR INTERNAL USAGES] * Pages not covered by any bio or belonging to P/Q stripes are stored in @@ -70,7 +70,7 @@ enum btrfs_rbio_ops { * If the corresponding page of stripe_paddrs[i] is not allocated, the value of * stripe_paddrs[i] will be INVALID_PADDR. * - * The length of each entry in stripe_paddrs[] is sectorsize. + * The length of each entry in stripe_paddrs[] is a step. * * [LOCATING A SECTOR] * To locate a sector for IO, we need the following info: @@ -83,7 +83,15 @@ enum btrfs_rbio_ops { * Starts from 0 (representing the first sector of the stripe), ends * at BTRFS_STRIPE_LEN / sectorsize - 1. * - * All existing bitmaps are based on sector numbers. + * - step_nr + * A step is min(sector_size, PAGE_SIZE). + * + * Starts from 0 (representing the first step of the sector), ends + * at @sector_nsteps - 1. + * + * For most call sites they do not need to bother this parameter. + * It is for bs > ps support and only for vertical stripe related works. + * (e.g. RMW/recover) * * - from which array * Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the @@ -151,6 +159,14 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; + /* + * How many steps there are for one sector. + * + * For bs > ps cases, it's sectorsize / PAGE_SIZE. + * For bs <= ps cases, it's always 1. + */ + u8 sector_nsteps; + /* Stripe number that we're scrubbing */ u8 scrubp; From 826325b6d091fdf93cc04fb5e8e462409635a469 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Nov 2025 20:10:38 +1030 Subject: [PATCH 121/147] btrfs: raid56: prepare generate_pq_vertical() for bs > ps cases Unlike btrfs_calculate_block_csum_pages(), we cannot handle multiple pages at the same time for P/Q generation. So here we introduce a new @step_nr, and various helpers to grab the sub-block page from the rbio, and generate the P/Q stripe page by page. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 92 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7f01178be7d8..209f013e4da9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -711,20 +711,25 @@ static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; } -/* Grab a paddr inside P stripe */ -static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_stripe_step_paddr(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr, + unsigned int step_nr) { - return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr); + return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)]; } -/* Grab a paddr inside Q stripe, return INVALID_PADDR if not RAID6 */ -static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, - unsigned int sector_nr) +static phys_addr_t rbio_pstripe_step_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) +{ + return rbio_stripe_step_paddr(rbio, rbio->nr_data, sector_nr, step_nr); +} + +static phys_addr_t rbio_qstripe_step_paddr(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr, unsigned int step_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return INVALID_PADDR; - return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr); + return rbio_stripe_step_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); } /* @@ -998,6 +1003,38 @@ static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, return rbio->stripe_paddrs[index]; } +/* + * Similar to sector_paddr_in_rbio(), but with extra consideration for + * bs > ps cases, where we can have multiple steps for a fs block. 
+ */ +static phys_addr_t step_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) +{ + phys_addr_t ret = INVALID_PADDR; + int index; + + ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, + rbio, stripe_nr); + ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, + rbio, sector_nr); + ASSERT_RBIO_SECTOR(step_nr >= 0 && step_nr < rbio->sector_nsteps, + rbio, sector_nr); + + index = (stripe_nr * rbio->stripe_nsectors + sector_nr) * rbio->sector_nsteps + step_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = rbio->bio_paddrs[index]; + return ret; + } + } + return rbio->stripe_paddrs[index]; +} + /* * allocation and initial setup for the btrfs_raid_bio. Not * this does not allocate any pages for rbio->pages. @@ -1319,45 +1356,56 @@ static inline void *kmap_local_paddr(phys_addr_t paddr) return kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); } -/* Generate PQ for one vertical stripe. 
*/ -static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int sector_nr, + unsigned int step_nr) { void **pointers = rbio->finish_pointers; - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) pointers[stripe] = kmap_local_paddr( - sector_paddr_in_rbio(rbio, stripe, sectornr, 0)); + step_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ - set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), - rbio->stripe_uptodate_bitmap); - pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sectornr)); + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_step_paddr(rbio, sector_nr, step_nr)); if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ - set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), - rbio->stripe_uptodate_bitmap); - pointers[stripe++] = kmap_local_paddr(rbio_qstripe_paddr(rbio, sectornr)); + pointers[stripe++] = kmap_local_paddr( + rbio_qstripe_step_paddr(rbio, sector_nr, step_nr)); assert_rbio(rbio); - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); } else { /* raid5 */ - memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + memcpy(pointers[rbio->nr_data], pointers[0], step); + run_xor(pointers + 1, rbio->nr_data - 1, step); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } +/* Generate PQ for one vertical stripe. 
*/ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) +{ + const bool has_qstripe = (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6); + + for (int i = 0; i < rbio->sector_nsteps; i++) + generate_pq_vertical_step(rbio, sectornr, i); + + set_bit(rbio_sector_index(rbio, rbio->nr_data, sectornr), + rbio->stripe_uptodate_bitmap); + if (has_qstripe) + set_bit(rbio_sector_index(rbio, rbio->nr_data + 1, sectornr), + rbio->stripe_uptodate_bitmap); +} + static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { From 9ba67fd616d6cfbf8b90c336195819e7494645bb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 13:49:33 +1030 Subject: [PATCH 122/147] btrfs: raid56: prepare recover_vertical() to support bs > ps cases Currently recover_vertical() assumes that every fs block can be mapped by one page, this is blocking bs > ps support for raid56. Prepare recover_vertical() to support bs > ps cases by: - Introduce recover_vertical_step() helper Which will recover a full step (min(PAGE_SIZE, sectorsize)). Now recover_vertical() will do the error check for the specified sector, do the recover step by step, then do the sector verification. - Fix a spelling error of get_rbio_vertical_errors() The old name has a typo: "veritical". Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 141 ++++++++++++++++++++++------------------------ 1 file changed, 68 insertions(+), 73 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 209f013e4da9..62bb87d1c258 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1007,21 +1007,13 @@ static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, * Similar to sector_paddr_in_rbio(), but with extra consideration for * bs > ps cases, where we can have multiple steps for a fs block. 
*/ -static phys_addr_t step_paddr_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, int step_nr, - bool bio_list_only) +static phys_addr_t sector_step_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) { phys_addr_t ret = INVALID_PADDR; - int index; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); - ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, - rbio, stripe_nr); - ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, - rbio, sector_nr); - ASSERT_RBIO_SECTOR(step_nr >= 0 && step_nr < rbio->sector_nsteps, - rbio, sector_nr); - - index = (stripe_nr * rbio->stripe_nsectors + sector_nr) * rbio->sector_nsteps + step_nr; ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); scoped_guard(spinlock, &rbio->bio_list_lock) { @@ -1147,8 +1139,8 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) * @faila and @failb will also be updated to the first and second stripe * number of the errors. */ -static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, - int *faila, int *failb) +static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) { int stripe_nr; int found_errors = 0; @@ -1219,8 +1211,8 @@ static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_l rbio->error_bitmap); /* Check if we have reached tolerance early. 
*/ - found_errors = get_rbio_veritical_errors(rbio, sector_nr, - NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, + NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) return -EIO; return 0; @@ -1367,7 +1359,7 @@ static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) pointers[stripe] = kmap_local_paddr( - step_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); + sector_step_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ pointers[stripe++] = kmap_local_paddr(rbio_pstripe_step_paddr(rbio, sector_nr, step_nr)); @@ -1868,41 +1860,18 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, return ret; } -/* - * Recover a vertical stripe specified by @sector_nr. - * @*pointers are the pre-allocated pointers by the caller, so we don't - * need to allocate/free the pointers again and again. - */ -static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static void recover_vertical_step(struct btrfs_raid_bio *rbio, + unsigned int sector_nr, + unsigned int step_nr, + int faila, int failb, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - const u32 sectorsize = fs_info->sectorsize; - int found_errors; - int faila; - int failb; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); int stripe_nr; - int ret = 0; - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sector_nr, &rbio->dbitmap)) - return 0; - - found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, - &failb); - /* - * No errors in the vertical stripe, skip it. Can happen for recovery - * which only part of a stripe failed csum check. 
- */ - if (!found_errors) - return 0; - - if (unlikely(found_errors > rbio->bioc->max_errors)) - return -EIO; + ASSERT(step_nr < rbio->sector_nsteps); + ASSERT(sector_nr < rbio->stripe_nsectors); /* * Setup our array of pointers with sectors from each stripe @@ -1918,9 +1887,9 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddr = sector_step_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); } else { - paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr); + paddr = rbio_stripe_step_paddr(rbio, stripe_nr, sector_nr, step_nr); } pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; @@ -1968,10 +1937,10 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } if (failb == rbio->real_stripes - 2) { - raid6_datap_recov(rbio->real_stripes, sectorsize, + raid6_datap_recov(rbio->real_stripes, step, faila, pointers); } else { - raid6_2data_recov(rbio->real_stripes, sectorsize, + raid6_2data_recov(rbio->real_stripes, step, faila, failb, pointers); } } else { @@ -1981,7 +1950,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + memcpy(pointers[faila], pointers[rbio->nr_data], step); /* Rearrange the pointer array */ p = pointers[faila]; @@ -1991,24 +1960,54 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - + run_xor(pointers, rbio->nr_data - 1, step); } +cleanup: + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); +} + +/* + * Recover a vertical stripe specified by @sector_nr. 
+ * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + int found_errors; + int faila; + int failb; + int ret = 0; + /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired in the vertical stripe, thus they are now - * uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. - * - * If possible, also check if the repaired sector matches its data - * checksum. + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; + + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. 
+ */ + if (!found_errors) + return 0; + + if (unlikely(found_errors > rbio->bioc->max_errors)) + return -EIO; + + for (int i = 0; i < rbio->sector_nsteps; i++) + recover_vertical_step(rbio, sector_nr, i, faila, failb, + pointers, unmap_array); if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) - goto cleanup; + return ret; set_bit(rbio_sector_index(rbio, faila, sector_nr), rbio->stripe_uptodate_bitmap); @@ -2016,15 +2015,11 @@ pstripe: if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) - goto cleanup; + return ret; set_bit(rbio_sector_index(rbio, failb, sector_nr), rbio->stripe_uptodate_bitmap); } - -cleanup: - for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) - kunmap_local(unmap_array[stripe_nr]); return ret; } @@ -2162,7 +2157,7 @@ static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_n int faila; int failb; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. 
*/ if (!found_errors) @@ -2455,7 +2450,7 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sectornr, NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; @@ -2735,7 +2730,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) int failb; int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, + found_errors = get_rbio_vertical_errors(rbio, sector_nr, &faila, &failb); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; @@ -2869,7 +2864,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; - found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + found_errors = get_rbio_vertical_errors(rbio, sector_nr, NULL, NULL); if (unlikely(found_errors > rbio->bioc->max_errors)) { ret = -EIO; break; From e0eadfcc959d282baafb3ba0c0c1bc4461669523 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 14:01:15 +1030 Subject: [PATCH 123/147] btrfs: raid56: prepare verify_one_sector() to support bs > ps cases The function verify_one_sector() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. Prepare it for bs > ps cases by: - Introduce helpers to get a paddrs pointer Thankfully all the higher layer bio should still be aligned to fs block size, thus a fs block should still be fully covered by the bio. Introduce sector_paddrs_in_rbio() and rbio_stripe_paddrs(), which will return a paddrs pointer inside btrfs_raid_bio::bio_paddrs[] or stripe_paddrs[]. The pointer can be directly passed to btrfs_calculate_block_csum_pages() to verify the checksum. - Open code btrfs_check_block_csum() btrfs_check_block_csum() only supports fs blocks backed by large folios. 
But for raid56 we can have fs blocks backed by multiple non-contiguous pages, e.g. direct IO, encoded read/write/send. So instead of using btrfs_check_block_csum(), open code it to use btrfs_calculate_block_csum_pages(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 55 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 62bb87d1c258..1a89cdb80fe4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -732,6 +732,13 @@ static phys_addr_t rbio_qstripe_step_paddr(const struct btrfs_raid_bio *rbio, return rbio_stripe_step_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); } +/* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */ +static phys_addr_t *rbio_stripe_paddrs(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, unsigned int sector_nr) +{ + return &rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; +} + /* * The first stripe in the table for a logical address * has the lock. rbios are added in one of three ways: @@ -1003,6 +1010,41 @@ static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, return rbio->stripe_paddrs[index]; } +/* + * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr. + * + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. + * + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. + * + * Return NULL if bio_list_only is set but the specified sector has no + * coresponding bio. 
+ */ +static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) +{ + phys_addr_t *ret = NULL; + const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, 0); + + ASSERT(index >= 0 && index < rbio->nr_sectors * rbio->sector_nsteps); + + scoped_guard(spinlock, &rbio->bio_list_lock) { + if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (rbio->bio_paddrs[index] != INVALID_PADDR) + ret = &rbio->bio_paddrs[index]; + return ret; + } + } + return &rbio->stripe_paddrs[index]; +} + /* * Similar to sector_paddr_in_rbio(), but with extra consideration for * bs > ps cases, where we can have multiple steps for a fs block. @@ -1832,10 +1874,9 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; - phys_addr_t paddr; + phys_addr_t *paddrs; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; - int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; @@ -1848,16 +1889,18 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * bio list if possible. 
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, 0); + paddrs = sector_paddrs_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { - paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr); + paddrs = rbio_stripe_paddrs(rbio, stripe_nr, sector_nr); } csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_block_csum(fs_info, paddr, csum_buf, csum_expected); - return ret; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, csum_expected, fs_info->csum_size) != 0)) + return -EIO; + return 0; } static void recover_vertical_step(struct btrfs_raid_bio *rbio, From 64e7b8c7c5873ad03e108d775fa1c0063a320070 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 14:30:25 +1030 Subject: [PATCH 124/147] btrfs: raid56: prepare verify_bio_data_sectors() to support bs > ps cases The function verify_bio_data_sectors() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. Prepare it for bs > ps cases by: - Make get_bio_sector_nr() to consider bs > ps cases The function is utilized to calculate the sector number of a device bio submitted by btrfs raid56 layer. - Assemble a local paddrs[] for checksum calculation - Open code btrfs_check_block_csum() btrfs_check_block_csum() only supports fs blocks backed by large folios. But for raid56 we can have fs blocks backed by multiple non-contiguous pages, e.g. direct IO, encoded read/write/send. So instead of using btrfs_check_block_csum(), open code it to use btrfs_calculate_block_csum_pages(). 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1a89cdb80fe4..7bc43f1861e6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1620,9 +1620,9 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) int i; for (i = 0; i < rbio->nr_sectors; i++) { - if (rbio->stripe_paddrs[i] == bvec_paddr) + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; - if (rbio->bio_paddrs[i] == bvec_paddr) + if (rbio->bio_paddrs[i * rbio->sector_nsteps] == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1655,7 +1655,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u32 step = min(fs_info->sectorsize, PAGE_SIZE); + const u32 nr_steps = rbio->sector_nsteps; int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 offset = 0; + phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; phys_addr_t paddr; /* No data csum for the whole stripe, no need to verify. */ @@ -1666,18 +1670,24 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; - btrfs_bio_for_each_block_all(paddr, bio, fs_info->sectorsize) { + btrfs_bio_for_each_block_all(paddr, bio, step) { u8 csum_buf[BTRFS_CSUM_SIZE]; - u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; - int ret; + u8 *expected_csum; + + paddrs[(offset / step) % nr_steps] = paddr; + offset += step; + + /* Not yet covering the full fs block, continue to the next step. */ + if (!IS_ALIGNED(offset, fs_info->sectorsize)) + continue; /* No csum for this sector, skip to the next sector. 
*/ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_block_csum(fs_info, paddr, - csum_buf, expected_csum); - if (ret < 0) + expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; + btrfs_calculate_block_csum_pages(fs_info, paddrs, csum_buf); + if (unlikely(memcmp(csum_buf, expected_csum, fs_info->csum_size) != 0)) set_bit(total_sector_nr, rbio->error_bitmap); total_sector_nr++; } From 05ddf35a5d3d8d58323d6353f2bad026e9838af8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 17 Nov 2025 13:57:55 +1030 Subject: [PATCH 125/147] btrfs: raid56: prepare set_bio_pages_uptodate() to support bs > ps cases The function set_bio_pages_uptodate() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. Prepare it for bs > ps cases by: - Update find_stripe_sector_nr() to check only the first step paddr We don't need to check each paddr, as the bios are still aligned to fs block size, thus checking the first step is enough. - Use step size to iterate the bio This means we only need to find the sector number for the first step of each fs block, and skip the remaining part. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7bc43f1861e6..2b6838380544 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1588,7 +1588,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) { for (int i = 0; i < rbio->nr_sectors; i++) { - if (rbio->stripe_paddrs[i] == paddr) + if (rbio->stripe_paddrs[i * rbio->sector_nsteps] == paddr) return i; } return -1; @@ -1600,17 +1600,23 @@ static int find_stripe_sector_nr(struct btrfs_raid_bio *rbio, phys_addr_t paddr) */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { - const u32 blocksize = rbio->bioc->fs_info->sectorsize; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); + u32 offset = 0; phys_addr_t paddr; ASSERT(!bio_flagged(bio, BIO_CLONED)); - btrfs_bio_for_each_block_all(paddr, bio, blocksize) { - int sector_nr = find_stripe_sector_nr(rbio, paddr); + btrfs_bio_for_each_block_all(paddr, bio, step) { + /* Hitting the first step of a sector. */ + if (IS_ALIGNED(offset, sectorsize)) { + int sector_nr = find_stripe_sector_nr(rbio, paddr); - ASSERT(sector_nr >= 0); - if (sector_nr >= 0) - set_bit(sector_nr, rbio->stripe_uptodate_bitmap); + ASSERT(sector_nr >= 0); + if (sector_nr >= 0) + set_bit(sector_nr, rbio->stripe_uptodate_bitmap); + } + offset += step; } } From 53474a2ae17401821ce83c3b11f3d159f6b3583a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 17 Nov 2025 14:39:51 +1030 Subject: [PATCH 126/147] btrfs: raid56: prepare steal_rbio() to support bs > ps cases The function steal_rbio() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. 
Prepare it for bs > ps cases by: - Introduce two helpers to calculate the sector number Previously we assume one page will contain at least one fs block, thus can use something like "sectors_per_page = PAGE_SIZE / sectorsize;", but with bs > ps support that above number will be 0. Instead introduce two helpers: * page_nr_to_sector_nr() Returns the sector number of the first sector covered by the page. * page_nr_to_num_sectors() Return how many sectors are covered by the page. And use the returned values for bitmap operations other than open-coded "PAGE_SIZE / sectorsize". Those helpers also have extra ASSERT()s to catch weird numbers. - Use above helpers The involved functions are: * steal_rbio_page() * is_data_stripe_page() * full_page_sectors_uptodate() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 57 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2b6838380544..5f2843a378e3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -300,18 +300,47 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } -static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, - unsigned int page_nr) +/* Get the sector number of the first sector covered by @page_nr. */ +static u32 page_nr_to_sector_nr(struct btrfs_raid_bio *rbio, unsigned int page_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; - int i; + u32 sector_nr; ASSERT(page_nr < rbio->nr_pages); - for (i = sectors_per_page * page_nr; - i < sectors_per_page * page_nr + sectors_per_page; - i++) { + sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; + ASSERT(sector_nr < rbio->nr_sectors); + return sector_nr; +} + +/* + * Get the number of sectors covered by @page_nr. 
+ * + * For bs > ps cases, the result will always be 1. + * For bs <= ps cases, the result will be ps / bs. + */ +static u32 page_nr_to_num_sectors(struct btrfs_raid_bio *rbio, unsigned int page_nr) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 nr_sectors; + + ASSERT(page_nr < rbio->nr_pages); + + nr_sectors = round_up(PAGE_SIZE, fs_info->sectorsize) >> fs_info->sectorsize_bits; + ASSERT(nr_sectors > 0); + return nr_sectors; +} + +static __maybe_unused bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sector_nr = page_nr_to_sector_nr(rbio, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(rbio, page_nr); + int i; + + ASSERT(page_nr < rbio->nr_pages); + ASSERT(sector_nr + nr_bits <= rbio->nr_sectors); + + for (i = sector_nr; i < sector_nr + nr_bits; i++) { + if (!test_bit(i, rbio->stripe_uptodate_bitmap)) + return false; + } @@ -345,8 +374,11 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { - const u32 sectorsize = src->bioc->fs_info->sectorsize; - const u32 sectors_per_page = PAGE_SIZE / sectorsize; + const u32 sector_nr = page_nr_to_sector_nr(src, page_nr); + const u32 nr_bits = page_nr_to_num_sectors(src, page_nr); + + ASSERT(page_nr < src->nr_pages); + ASSERT(sector_nr + nr_bits <= src->nr_sectors); if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); @@ -354,13 +386,12 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, src->stripe_pages[page_nr] = NULL; /* Also update the stripe_uptodate_bitmap bits. 
*/ - bitmap_set(dest->stripe_uptodate_bitmap, sectors_per_page * page_nr, sectors_per_page); + bitmap_set(dest->stripe_uptodate_bitmap, sector_nr, nr_bits); } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { - const int sector_nr = (page_nr << PAGE_SHIFT) >> - rbio->bioc->fs_info->sectorsize_bits; + const int sector_nr = page_nr_to_sector_nr(rbio, page_nr); /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus From ba88278c69982b2c4007cd1912961fbb60693950 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 14:39:09 +1030 Subject: [PATCH 127/147] btrfs: raid56: prepare rbio_bio_add_io_paddr() to support bs > ps cases The function rbio_bio_add_io_paddr() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. Prepare it for bs > ps cases by: - Introduce a helper bio_add_paddrs() Previously we only need to add a single page to a bio for a fs block, but now we need to add multiple pages, this means we can fail halfway. In that case we need to properly revert the bio (only for its size though) for halfway failed cases. - Rename rbio_add_io_paddr() to rbio_add_io_paddrs() And change the @paddr parameter to @paddrs[]. - Change all callers to use the updated rbio_add_io_paddrs() For the @paddrs pointer used for the new function, it can be grabbed using sector_paddrs_in_rbio() and rbio_stripe_paddrs() helpers. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 106 ++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5f2843a378e3..68454ef172b9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1245,17 +1245,41 @@ static int get_rbio_vertical_errors(struct btrfs_raid_bio *rbio, int sector_nr, return found_errors; } +static int bio_add_paddrs(struct bio *bio, phys_addr_t *paddrs, unsigned int nr_steps, + unsigned int step) +{ + int added = 0; + int ret; + + for (int i = 0; i < nr_steps; i++) { + ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, + offset_in_page(paddrs[i])); + if (ret != step) + goto revert; + added += ret; + } + return added; +revert: + /* + * We don't need to revert the bvec, as the bio will be submitted immediately, + * as long as the size is reduced the extra bvec will not be accessed. + */ + bio->bi_iter.bi_size -= added; + return 0; +} + /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. - * Return <0 for error. + * Return <0 for error, and no byte will be added to @rbio. 
*/ -static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, - phys_addr_t paddr, unsigned int stripe_nr, - unsigned int sector_nr, enum req_op op) +static int rbio_add_io_paddrs(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, + phys_addr_t *paddrs, unsigned int stripe_nr, + unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 step = min(sectorsize, PAGE_SIZE); struct bio *last = bio_list->tail; int ret; struct bio *bio; @@ -1271,7 +1295,7 @@ static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_l rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(paddr != INVALID_PADDR); + ASSERT(paddrs != NULL); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1302,8 +1326,7 @@ static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_l */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, phys_to_page(paddr), sectorsize, - offset_in_page(paddr)); + ret = bio_add_paddrs(last, paddrs, rbio->sector_nsteps, step); if (ret == sectorsize) return 0; } @@ -1316,7 +1339,8 @@ static int rbio_add_io_paddr(struct btrfs_raid_bio *rbio, struct bio_list *bio_l bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, phys_to_page(paddr), sectorsize, offset_in_page(paddr)); + ret = bio_add_paddrs(bio, paddrs, rbio->sector_nsteps, step); + ASSERT(ret == sectorsize); bio_list_add(bio_list, bio); return 0; } @@ -1497,7 +1521,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - phys_addr_t paddr; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1507,15 +1531,15 
@@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); - if (paddr == INVALID_PADDR) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - paddr = rbio_stripe_paddr(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_paddr(rbio, bio_list, paddr, stripe, - sectornr, REQ_OP_WRITE); + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, stripe, + sectornr, REQ_OP_WRITE); if (ret) goto error; } @@ -1532,7 +1556,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - phys_addr_t paddr; + phys_addr_t *paddrs; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; @@ -1557,14 +1581,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, continue; if (stripe < rbio->nr_data) { - paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); - if (paddr == INVALID_PADDR) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; } else { - paddr = rbio_stripe_paddr(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); } - ret = rbio_add_io_paddr(rbio, bio_list, paddr, + ret = rbio_add_io_paddrs(rbio, bio_list, paddrs, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) @@ -2184,7 +2208,7 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - phys_addr_t paddr; + phys_addr_t *paddrs; /* * Skip the range which has error. 
It can be a range which is @@ -2201,9 +2225,9 @@ static void recover_rbio(struct btrfs_raid_bio *rbio) continue; } - paddr = rbio_stripe_paddr(rbio, stripe, sectornr); - ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, - sectornr, REQ_OP_READ); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); goto out; @@ -2393,11 +2417,11 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; - phys_addr_t paddr; + phys_addr_t *paddrs; - paddr = rbio_stripe_paddr(rbio, stripe, sectornr); - ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, - sectornr, REQ_OP_READ); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; @@ -2751,11 +2775,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) * everything else. 
*/ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - phys_addr_t paddr; + phys_addr_t *paddrs; - paddr = rbio_stripe_paddr(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_paddr(rbio, &bio_list, paddr, rbio->scrubp, - sectornr, REQ_OP_WRITE); + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->scrubp, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2769,11 +2793,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { - phys_addr_t paddr; + phys_addr_t *paddrs; - paddr = rbio_stripe_paddr(rbio, rbio->scrubp, sectornr); - ret = rbio_add_io_paddr(rbio, &bio_list, paddr, rbio->real_stripes, - sectornr, REQ_OP_WRITE); + paddrs = rbio_stripe_paddrs(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, rbio->real_stripes, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2889,7 +2913,7 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; - phys_addr_t paddr; + phys_addr_t *paddrs; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) @@ -2900,11 +2924,11 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) * read them from the disk. If sector_paddr_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ - paddr = sector_paddr_in_rbio(rbio, stripe, sectornr, 1); - if (paddr == INVALID_PADDR) + paddrs = sector_paddrs_in_rbio(rbio, stripe, sectornr, 1); + if (paddrs == NULL) continue; - paddr = rbio_stripe_paddr(rbio, stripe, sectornr); + paddrs = rbio_stripe_paddrs(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. 
@@ -2913,8 +2937,8 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) rbio->stripe_uptodate_bitmap)) continue; - ret = rbio_add_io_paddr(rbio, &bio_list, paddr, stripe, - sectornr, REQ_OP_READ); + ret = rbio_add_io_paddrs(rbio, &bio_list, paddrs, stripe, + sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; From 89ca1a403e541236e56d184634b0e4e5175c0054 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 15:29:46 +1030 Subject: [PATCH 128/147] btrfs: raid56: prepare finish_parity_scrub() to support bs > ps cases The function finish_parity_scrub() assume each fs block can be mapped by one page, blocking bs > ps support for raid56. Prepare it for bs > ps cases by: - Introduce a helper, verify_one_parity_step() Since the P/Q generation is always done in a vertical stripe, we have to handle the range step by step. - Only clear the rbio->dbitmap if all steps of an fs block match - Remove rbio_stripe_paddr() and sector_paddr_in_rbio() helpers Now we either use the paddrs version for checksum, or the step version for P/Q generation/recovery. - Make alloc_rbio_essential_pages() to handle bs > ps cases Since for bs > ps cases, one fs block needs multiple pages, the existing simple check against rbio->stripe_pages[] is not enough. Extract a dedicated helper, alloc_rbio_sector_pages(), for the existing alloc_rbio_essential_pages(), which is still based on sector number. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 176 +++++++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 68454ef172b9..4ebb6b6ba4a6 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -735,13 +735,6 @@ static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, return ret; } -/* Return a paddr from rbio->stripe_sectors, not from the bio list */ -static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, - unsigned int stripe_nr, unsigned int sector_nr) -{ - return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, 0)]; -} - static phys_addr_t rbio_stripe_step_paddr(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr, unsigned int step_nr) @@ -1001,46 +994,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) rbio_endio_bio_list(extra, status); } -/* - * Get the paddr specified by its @stripe_nr and @sector_nr. - * - * @rbio: The raid bio - * @stripe_nr: Stripe number, valid range [0, real_stripe) - * @sector_nr: Sector number inside the stripe, - * valid range [0, stripe_nsectors) - * @bio_list_only: Whether to use sectors inside the bio list only. - * - * The read/modify/write code wants to reuse the original bio page as much - * as possible, and only use stripe_sectors as fallback. 
- */ -static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, - bool bio_list_only) -{ - phys_addr_t ret = INVALID_PADDR; - int index; - - ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, - rbio, stripe_nr); - ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, - rbio, sector_nr); - - index = stripe_nr * rbio->stripe_nsectors + sector_nr; - ASSERT(index >= 0 && index < rbio->nr_sectors); - - spin_lock(&rbio->bio_list_lock); - if (rbio->bio_paddrs[index] != INVALID_PADDR || bio_list_only) { - /* Don't return sector without a valid page pointer */ - if (rbio->bio_paddrs[index] != INVALID_PADDR) - ret = rbio->bio_paddrs[index]; - spin_unlock(&rbio->bio_list_lock); - return ret; - } - spin_unlock(&rbio->bio_list_lock); - - return rbio->stripe_paddrs[index]; -} - /* * Get paddr pointer for the sector specified by its @stripe_nr and @sector_nr. * @@ -2635,42 +2588,116 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, return rbio; } +static int alloc_rbio_sector_pages(struct btrfs_raid_bio *rbio, + int sector_nr) +{ + const u32 step = min(PAGE_SIZE, rbio->bioc->fs_info->sectorsize); + const u32 base = sector_nr * rbio->sector_nsteps; + + for (int i = base; i < base + rbio->sector_nsteps; i++) { + const unsigned int page_index = (i * step) >> PAGE_SHIFT; + struct page *page; + + if (rbio->stripe_pages[page_index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[page_index] = page; + } + return 0; +} + /* * We just scrub the parity that we have correct data on the same horizontal, * so we needn't allocate all pages for all the stripes. 
*/ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int total_sector_nr; for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { - struct page *page; int sectornr = total_sector_nr % rbio->stripe_nsectors; - int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; + int ret; if (!test_bit(sectornr, &rbio->dbitmap)) continue; - if (rbio->stripe_pages[index]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; + ret = alloc_rbio_sector_pages(rbio, total_sector_nr); + if (ret < 0) + return ret; } index_stripe_sectors(rbio); return 0; } +/* Return true if the content of the step matches the calculated one. */ +static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr, + unsigned int step_nr) +{ + const unsigned int nr_data = rbio->nr_data; + const bool has_qstripe = (rbio->real_stripes - rbio->nr_data == 2); + const u32 step = min(rbio->bioc->fs_info->sectorsize, PAGE_SIZE); + void *parity; + bool ret = false; + + ASSERT(step_nr < rbio->sector_nsteps); + + /* First collect one page from each data stripe. */ + for (int stripe = 0; stripe < nr_data; stripe++) + pointers[stripe] = kmap_local_paddr( + sector_step_paddr_in_rbio(rbio, stripe, sector_nr, + step_nr, 0)); + + if (has_qstripe) { + assert_rbio(rbio); + /* RAID6, call the library function to fill in our P/Q. */ + raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); + } else { + /* RAID5. */ + memcpy(pointers[nr_data], pointers[0], step); + run_xor(pointers + 1, nr_data - 1, step); + } + + /* Check scrubbing parity and repair it. 
*/ + parity = kmap_local_paddr(rbio_stripe_step_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); + if (memcmp(parity, pointers[rbio->scrubp], step) != 0) + memcpy(parity, pointers[rbio->scrubp], step); + else + ret = true; + kunmap_local(parity); + + for (int stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); + return ret; +} + +/* + * The @pointers array should have the P/Q parity already mapped. + */ +static void verify_one_parity_sector(struct btrfs_raid_bio *rbio, + void *pointers[], unsigned int sector_nr) +{ + bool found_error = false; + + for (int step_nr = 0; step_nr < rbio->sector_nsteps; step_nr++) { + bool match; + + match = verify_one_parity_step(rbio, pointers, sector_nr, step_nr); + if (!match) + found_error = true; + } + if (!found_error) + bitmap_clear(&rbio->dbitmap, sector_nr, 1); +} + static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; - int stripe; int sectornr; bool has_qstripe; struct page *page; @@ -2729,37 +2756,8 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* Map the parity stripe just once */ - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - void *parity; - - /* first collect one page from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) - pointers[stripe] = kmap_local_paddr( - sector_paddr_in_rbio(rbio, stripe, sectornr, 0)); - - if (has_qstripe) { - assert_rbio(rbio); - /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - - /* Check scrubbing parity and repair it */ - parity = kmap_local_paddr(rbio_stripe_paddr(rbio, 
rbio->scrubp, sectornr)); - if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) - memcpy(parity, pointers[rbio->scrubp], sectorsize); - else - /* Parity is right, needn't writeback */ - bitmap_clear(&rbio->dbitmap, sectornr, 1); - kunmap_local(parity); - - for (stripe = nr_data - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) + verify_one_parity_sector(rbio, pointers, sectornr); kunmap_local(pointers[nr_data]); __free_page(phys_to_page(p_paddr)); From 8870dbeedcf9576fbc5147654e272acad3d84089 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 14 Nov 2025 19:15:28 +1030 Subject: [PATCH 129/147] btrfs: raid56: enable bs > ps support The support code for bs > ps is complete, enable it and update assertions. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/raid56.c | 11 ++++++----- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0df81a09a3d1..fe62f5a244f5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3258,12 +3258,6 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } - if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_err(fs_info, - "RAID56 is not supported for page size %lu with sectorsize %u", - PAGE_SIZE, fs_info->sectorsize); - return -EINVAL; - } /* This can be called by remount, we need to protect the super block. 
*/ spin_lock(&fs_info->super_lock); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4ebb6b6ba4a6..fca9c73b01ea 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1070,8 +1070,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int sector_nsteps = fs_info->sectorsize / step; struct btrfs_raid_bio *rbio; - /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ - ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * For bs <= ps cases, ps must be aligned to bs. + * For bs > ps cases, bs must be aligned to ps. + */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize) || + IS_ALIGNED(fs_info->sectorsize, PAGE_SIZE)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. @@ -3014,9 +3018,6 @@ void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio, unsigned int foffset = 0; int ret; - /* We shouldn't hit RAID56 for bs > ps cases for now. */ - ASSERT(fs_info->sectorsize <= PAGE_SIZE); - /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do From 1a332a6d70475d87067038ab0cbda8292da955e1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 16 Nov 2025 10:32:50 +1030 Subject: [PATCH 130/147] btrfs: raid56: remove the "_step" infix The following functions are introduced as a middle step for bs > ps support: - rbio_stripe_step_paddr() - rbio_pstripe_step_paddr() - rbio_qstripe_step_paddr() - sector_step_paddr_in_rbio() As there is already an existing function without the infix, and has a different parameter list. But the existing functions have been cleaned up, there is no need to keep the "_step" infix, just remove it completely. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fca9c73b01ea..f38d8305e46d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -735,25 +735,25 @@ static unsigned int rbio_paddr_index(const struct btrfs_raid_bio *rbio, return ret; } -static phys_addr_t rbio_stripe_step_paddr(const struct btrfs_raid_bio *rbio, +static phys_addr_t rbio_stripe_paddr(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr, unsigned int step_nr) { return rbio->stripe_paddrs[rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr)]; } -static phys_addr_t rbio_pstripe_step_paddr(const struct btrfs_raid_bio *rbio, +static phys_addr_t rbio_pstripe_paddr(const struct btrfs_raid_bio *rbio, unsigned int sector_nr, unsigned int step_nr) { - return rbio_stripe_step_paddr(rbio, rbio->nr_data, sector_nr, step_nr); + return rbio_stripe_paddr(rbio, rbio->nr_data, sector_nr, step_nr); } -static phys_addr_t rbio_qstripe_step_paddr(const struct btrfs_raid_bio *rbio, +static phys_addr_t rbio_qstripe_paddr(const struct btrfs_raid_bio *rbio, unsigned int sector_nr, unsigned int step_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return INVALID_PADDR; - return rbio_stripe_step_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); + return rbio_stripe_paddr(rbio, rbio->nr_data + 1, sector_nr, step_nr); } /* Return a paddr pointer into the rbio::stripe_paddrs[] for the specified sector. */ @@ -1033,9 +1033,9 @@ static phys_addr_t *sector_paddrs_in_rbio(struct btrfs_raid_bio *rbio, * Similar to sector_paddr_in_rbio(), but with extra consideration for * bs > ps cases, where we can have multiple steps for a fs block. 
*/ -static phys_addr_t sector_step_paddr_in_rbio(struct btrfs_raid_bio *rbio, - int stripe_nr, int sector_nr, int step_nr, - bool bio_list_only) +static phys_addr_t sector_paddr_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, int step_nr, + bool bio_list_only) { phys_addr_t ret = INVALID_PADDR; const int index = rbio_paddr_index(rbio, stripe_nr, sector_nr, step_nr); @@ -1413,10 +1413,10 @@ static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) pointers[stripe] = kmap_local_paddr( - sector_step_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); + sector_paddr_in_rbio(rbio, stripe, sector_nr, step_nr, 0)); /* Then add the parity stripe */ - pointers[stripe++] = kmap_local_paddr(rbio_pstripe_step_paddr(rbio, sector_nr, step_nr)); + pointers[stripe++] = kmap_local_paddr(rbio_pstripe_paddr(rbio, sector_nr, step_nr)); if (has_qstripe) { /* @@ -1424,7 +1424,7 @@ static void generate_pq_vertical_step(struct btrfs_raid_bio *rbio, unsigned int * to fill in our p/q */ pointers[stripe++] = kmap_local_paddr( - rbio_qstripe_step_paddr(rbio, sector_nr, step_nr)); + rbio_qstripe_paddr(rbio, sector_nr, step_nr)); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, step, pointers); @@ -1958,9 +1958,9 @@ static void recover_vertical_step(struct btrfs_raid_bio *rbio, * bio list if possible. 
*/ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - paddr = sector_step_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); + paddr = sector_paddr_in_rbio(rbio, stripe_nr, sector_nr, step_nr, 0); } else { - paddr = rbio_stripe_step_paddr(rbio, stripe_nr, sector_nr, step_nr); + paddr = rbio_stripe_paddr(rbio, stripe_nr, sector_nr, step_nr); } pointers[stripe_nr] = kmap_local_paddr(paddr); unmap_array[stripe_nr] = pointers[stripe_nr]; @@ -2651,8 +2651,8 @@ static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, /* First collect one page from each data stripe. */ for (int stripe = 0; stripe < nr_data; stripe++) pointers[stripe] = kmap_local_paddr( - sector_step_paddr_in_rbio(rbio, stripe, sector_nr, - step_nr, 0)); + sector_paddr_in_rbio(rbio, stripe, sector_nr, + step_nr, 0)); if (has_qstripe) { assert_rbio(rbio); @@ -2665,7 +2665,7 @@ static bool verify_one_parity_step(struct btrfs_raid_bio *rbio, } /* Check scrubbing parity and repair it. */ - parity = kmap_local_paddr(rbio_stripe_step_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); + parity = kmap_local_paddr(rbio_stripe_paddr(rbio, rbio->scrubp, sector_nr, step_nr)); if (memcmp(parity, pointers[rbio->scrubp], step) != 0) memcpy(parity, pointers[rbio->scrubp], step); else From 31b37b766753682ec1434bb591c5edee94649597 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Fri, 14 Nov 2025 15:24:45 +0800 Subject: [PATCH 131/147] btrfs: factor out root promotion logic into promote_child_to_root() The balance_level() function is overly long and contains a cold code path that handles promoting a child node to root when the root has only one item. This code has distinct logic that is clearer and more maintainable when isolated in its own function. 
Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 116 ++++++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 51dc8e0bc9c1..3abddd2cdfd3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -861,6 +861,75 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, return eb; } +/* + * Promote a child node to become the new tree root. + * + * @trans: Transaction handle + * @root: Tree root structure to update + * @path: Path holding nodes and locks + * @level: Level of the parent (old root) + * @parent: The parent (old root) with exactly one item + * + * This helper is called during rebalancing when the root node contains only + * a single item (nritems == 1). We can reduce the tree height by promoting + * that child to become the new root and freeing the old root node. The path + * locks and references are updated accordingly. + * + * Return: 0 on success, negative errno on failure. The transaction is aborted + * on critical errors. 
+ */ +static int promote_child_to_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + int level, struct extent_buffer *parent) +{ + struct extent_buffer *child; + int ret; + + ASSERT(btrfs_header_nritems(parent) == 1); + + child = btrfs_read_node_slot(parent, 0); + if (IS_ERR(child)) + return PTR_ERR(child); + + btrfs_tree_lock(child); + ret = btrfs_cow_block(trans, root, child, parent, 0, &child, BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + return ret; + } + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + if (unlikely(ret < 0)) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + btrfs_abort_transaction(trans, ret); + return ret; + } + rcu_assign_pointer(root->node, child); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + + path->locks[level] = 0; + path->nodes[level] = NULL; + btrfs_clear_buffer_dirty(trans, parent); + btrfs_tree_unlock(parent); + /* Once for the path. */ + free_extent_buffer(parent); + + root_sub_used_bytes(root); + ret = btrfs_free_tree_block(trans, btrfs_root_id(root), parent, 0, 1); + /* Once for the root ptr. */ + free_extent_buffer_stale(parent); + if (unlikely(ret < 0)) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + return 0; +} + /* * node level balancing, used to make sure nodes are in proper order for * item deletion. 
We balance from the top down, so we have to make sure @@ -900,55 +969,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * by promoting the node below to a root */ if (!parent) { - struct extent_buffer *child; - if (btrfs_header_nritems(mid) != 1) return 0; - /* promote the child to a root */ - child = btrfs_read_node_slot(mid, 0); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - goto out; - } - - btrfs_tree_lock(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child, - BTRFS_NESTING_COW); - if (ret) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - goto out; - } - - ret = btrfs_tree_mod_log_insert_root(root->node, child, true); - if (unlikely(ret < 0)) { - btrfs_tree_unlock(child); - free_extent_buffer(child); - btrfs_abort_transaction(trans, ret); - goto out; - } - rcu_assign_pointer(root->node, child); - - add_root_to_dirty_list(root); - btrfs_tree_unlock(child); - - path->locks[level] = 0; - path->nodes[level] = NULL; - btrfs_clear_buffer_dirty(trans, mid); - btrfs_tree_unlock(mid); - /* once for the path */ - free_extent_buffer(mid); - - root_sub_used_bytes(root); - ret = btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); - /* once for the root ptr */ - free_extent_buffer_stale(mid); - if (unlikely(ret < 0)) { - btrfs_abort_transaction(trans, ret); - goto out; - } - return 0; + return promote_child_to_root(trans, root, path, level, mid); } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) From 3afa17bf243cf384e8caa64e1e3fad8b543c7c83 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Fri, 14 Nov 2025 15:24:46 +0800 Subject: [PATCH 132/147] btrfs: optimize balance_level() path reference handling Instead of incrementing refcount on 'left' node when it's referenced by path, simply transfer ownership to path and set left to NULL. 
This eliminates: - Unnecessary refcount increment/decrement operations - Redundant conditional checks for left node cleanup The path now consistently owns the left node reference when used. Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3abddd2cdfd3..4df4c6cda620 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1125,11 +1125,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the path */ if (left) { if (btrfs_header_nritems(left) > orig_slot) { - refcount_inc(&left->refs); /* left was locked after cow */ path->nodes[level] = left; path->slots[level + 1] -= 1; path->slots[level] = orig_slot; + /* Left is now owned by path. */ + left = NULL; if (mid) { btrfs_tree_unlock(mid); free_extent_buffer(mid); @@ -1149,8 +1150,7 @@ out: free_extent_buffer(right); } if (left) { - if (path->nodes[level] != left) - btrfs_tree_unlock(left); + btrfs_tree_unlock(left); free_extent_buffer(left); } return ret; From 139f75a3b1677c76bd845228ec49e50d69ce556e Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Fri, 14 Nov 2025 15:24:47 +0800 Subject: [PATCH 133/147] btrfs: simplify leaf traversal after path release in btrfs_next_old_leaf() After releasing the path in btrfs_next_old_leaf(), we need to re-check the leaf because a balance operation may have added items or removed the last item. The original code handled this with two separate conditional blocks, the second marked with a lengthy comment explaining a "missed case". Merge these two blocks into a single logical structure that handles both scenarios more clearly. Also update the comment to be more concise and accurate, incorporating the explanation directly into the main block rather than a separate annotation. 
Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4df4c6cda620..fc712e00d4fb 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4853,34 +4853,23 @@ again: nritems = btrfs_header_nritems(path->nodes[0]); /* - * by releasing the path above we dropped all our locks. A balance - * could have added more items next to the key that used to be - * at the very end of the block. So, check again here and - * advance the path if there are now more items available. + * By releasing the path above we dropped all our locks. A balance + * could have happened and + * + * 1. added more items after the previous last item + * 2. deleted the previous last item + * + * So, check again here and advance the path if there are now more + * items available. */ - if (nritems > 0 && path->slots[0] < nritems - 1) { - if (ret == 0) + if (nritems > 0 && path->slots[0] <= nritems - 1) { + if (ret == 0 && path->slots[0] != nritems - 1) { path->slots[0]++; - ret = 0; - goto done; - } - /* - * So the above check misses one case: - * - after releasing the path above, someone has removed the item that - * used to be at the very end of the block, and balance between leafs - * gets another one with bigger key.offset to replace it. - * - * This one should be returned as well, or we can get leaf corruption - * later(esp. in __btrfs_drop_extents()). - * - * And a bit more explanation about this check, - * with ret > 0, the key isn't found, the path points to the slot - * where it should be inserted, so the path->slots[0] item must be the - * bigger one. 
- */ - if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) { - ret = 0; - goto done; + goto done; + } else if (ret > 0) { + ret = 0; + goto done; + } } while (level < BTRFS_MAX_LEVEL) { From 4357dd76f558f03fe22f28c360f7798ee3a0d238 Mon Sep 17 00:00:00 2001 From: Sun YangKai Date: Fri, 14 Nov 2025 15:24:48 +0800 Subject: [PATCH 134/147] btrfs: remove redundant level reset in btrfs_del_items() When btrfs_del_items() empties a leaf, it deletes the leaf unless it's the root node. For the root leaf case, the code used to reset its level to 0 via btrfs_set_header_level(). This is redundant as leaf nodes always have level == 0. Remove the unnecessary level assignment and invert the conditional to handle only the non-root leaf deletion. The root leaf is correctly left as-is. Signed-off-by: Sun YangKai Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fc712e00d4fb..a48b4befbee7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4520,9 +4520,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* delete the leaf if we've emptied it */ if (nritems == 0) { - if (leaf == root->node) { - btrfs_set_header_level(leaf, 0); - } else { + if (leaf != root->node) { btrfs_clear_buffer_dirty(trans, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); if (ret < 0) From f96834005386a1e44571e4077eaa7a43d9ca2318 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 18 Nov 2025 17:08:38 +0100 Subject: [PATCH 135/147] btrfs: disable various operations on encrypted inodes Initially, only normal data extents will be encrypted. 
This change forbids various other bits: - allows reflinking only if both inodes have the same encryption status - disable inline data on encrypted inodes Note: The patch was taken from v5 of fscrypt patchset (https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/) which was handled over time by various people: Omar Sandoval, Sweet Tea Dorminy, Josef Bacik. Signed-off-by: Omar Sandoval Signed-off-by: Daniel Vacek Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++++ fs/btrfs/reflink.c | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f71a5f7f55b9..8e13117eca16 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -592,6 +592,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (size < i_size_read(&inode->vfs_inode)) return false; + /* Encrypted file cannot be inlined. */ + if (IS_ENCRYPTED(&inode->vfs_inode)) + return false; + return true; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 775a32a7953a..b5fe95baf92e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include "ctree.h" #include "fs.h" @@ -789,6 +790,10 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } + /* Can only reflink encrypted files if both files are encrypted. 
*/ + if (IS_ENCRYPTED(&inode_in->vfs_inode) != IS_ENCRYPTED(&inode_out->vfs_inode)) + return -EINVAL; + /* Don't make the dst file partly checksummed */ if ((inode_in->flags & BTRFS_INODE_NODATASUM) != (inode_out->flags & BTRFS_INODE_NODATASUM)) { From 45d99129b64b2311cc067b38221d475942166118 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Tue, 18 Nov 2025 17:08:39 +0100 Subject: [PATCH 136/147] btrfs: disable verity on encrypted inodes Right now there isn't a way to encrypt things that aren't either filenames in directories or data on blocks on disk with extent encryption, so for now, disable verity usage with encryption on btrfs. fscrypt with fsverity should be possible and it can be implemented in the future. Note: The patch was taken from v5 of fscrypt patchset (https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/) which was handled over time by various people: Omar Sandoval, Sweet Tea Dorminy, Josef Bacik. Reviewed-by: Boris Burkov Signed-off-by: Sweet Tea Dorminy Signed-off-by: Daniel Vacek Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/verity.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 16f5580cba55..06dfcb461f53 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -578,6 +578,9 @@ static int btrfs_begin_enable_verity(struct file *filp) btrfs_assert_inode_locked(inode); + if (IS_ENCRYPTED(&inode->vfs_inode)) + return -EOPNOTSUPP; + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) return -EBUSY; From bd45e9e3f6232f76fa9bd0e40c1e3409e4449f5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Nov 2025 17:08:40 +0100 Subject: [PATCH 137/147] btrfs: add orig_logical to btrfs_bio for encryption When checksumming the encrypted bio on writes we need to know which logical address this checksum is for. 
At the point where we get the encrypted bio the bi_sector is the physical location on the target disk, so we need to save the original logical offset in the btrfs_bio. Then we can use this when checksumming the bio instead of the bio->iter.bi_sector. Note: The patch was taken from v5 of fscrypt patchset (https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/) which was handled over time by various people: Omar Sandoval, Sweet Tea Dorminy, Josef Bacik. Signed-off-by: Josef Bacik Signed-off-by: Daniel Vacek Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/bio.c | 10 ++++++++++ fs/btrfs/bio.h | 2 ++ fs/btrfs/file-item.c | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 1b38e3ee0a33..fa1d321a2fb8 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -94,6 +94,8 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, if (bbio_has_ordered_extent(bbio)) { refcount_inc(&orig_bbio->ordered->refs); bbio->ordered = orig_bbio->ordered; + bbio->orig_logical = orig_bbio->orig_logical; + orig_bbio->orig_logical += map_length; } bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; atomic_inc(&orig_bbio->pending_ios); @@ -765,6 +767,14 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto end_bbio; } + /* + * For fscrypt writes we will get the encrypted bio after we've remapped + * our bio to the physical disk location, so we need to save the + * original bytenr so we know what we're checksumming. 
+ */ + if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) + bbio->orig_logical = logical; + map_length = min(map_length, length); if (use_append) map_length = btrfs_append_map_length(bbio, map_length); diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index 035145909b00..1be74209f0b8 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -52,6 +52,7 @@ struct btrfs_bio { * - pointer to the checksums for this bio * - original physical address from the allocator * (for zone append only) + * - original logical address, used for checksumming fscrypt bios */ struct { struct btrfs_ordered_extent *ordered; @@ -60,6 +61,7 @@ struct btrfs_bio { struct completion csum_done; struct bvec_iter csum_saved_iter; u64 orig_physical; + u64 orig_logical; }; /* For metadata reads: parentness verification. */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b17632ea085f..14e5257f0f04 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -824,7 +824,7 @@ int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async) if (!sums) return -ENOMEM; - sums->logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + sums->logical = bbio->orig_logical; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); bbio->sums = sums; From 0185c2292c600993199bc6b1f342ad47a9e8c678 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Nov 2025 17:08:41 +0100 Subject: [PATCH 138/147] btrfs: don't rewrite ret from inode_permission In our user safe ino resolve ioctl we'll just turn any ret into -EACCES from inode_permission(). This is redundant, and could potentially be wrong if we had an ENOMEM in the security layer or some such other error, so simply return the actual return value. Note: The patch was taken from v5 of fscrypt patchset (https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/) which was handled over time by various people: Omar Sandoval, Sweet Tea Dorminy, Josef Bacik. 
Fixes: 23d0b79dfaed ("btrfs: Add unprivileged version of ino_lookup ioctl") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: Daniel Vacek Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 59cef7e376a0..a10b60439718 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1910,10 +1910,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); iput(&temp_inode->vfs_inode); - if (ret) { - ret = -EACCES; + if (ret) goto out_put; - } if (key.offset == upper_limit) break; From 70085399b1a1623ef488d96b4c2d0c67be1d0607 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Nov 2025 17:08:43 +0100 Subject: [PATCH 139/147] btrfs: don't search back for dir inode item in INO_LOOKUP_USER We don't need to search back to the inode item, the directory inode number is in key.offset, so simply use that. If we can't find the directory we'll get an ENOENT at the iget(). Note: The patch was taken from v5 of fscrypt patchset (https://lore.kernel.org/linux-btrfs/cover.1706116485.git.josef@toxicpanda.com/) which was handled over time by various people: Omar Sandoval, Sweet Tea Dorminy, Josef Bacik. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: Daniel Vacek Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a10b60439718..4513c236d281 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1822,7 +1822,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; struct btrfs_path *path; - struct btrfs_key key, key2; + struct btrfs_key key; struct extent_buffer *leaf; char *ptr; int slot; @@ -1877,24 +1877,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, read_extent_buffer(leaf, ptr, (unsigned long)(iref + 1), len); - /* Check the read+exec permission of this directory */ - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_ITEM_KEY); - if (ret < 0) { - goto out_put; - } else if (ret > 0) { - ret = -ENOENT; - goto out_put; - } - - leaf = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key2, slot); - if (key2.objectid != dirid) { - ret = -ENOENT; - goto out_put; - } - /* * We don't need the path anymore, so release it and * avoid deadlocks and lockdep warnings in case @@ -1902,11 +1884,12 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, * btree and lock the same leaf. */ btrfs_release_path(path); - temp_inode = btrfs_iget(key2.objectid, root); + temp_inode = btrfs_iget(key.offset, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); goto out_put; } + /* Check the read+exec permission of this directory. 
*/ ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); iput(&temp_inode->vfs_inode); From 9c78fe4a85fd968e1202b6c8cd9306746039ce2b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Nov 2025 12:35:10 +0000 Subject: [PATCH 140/147] btrfs: use test_and_set_bit() in btrfs_delayed_delete_inode_ref() Instead of testing and setting the BTRFS_DELAYED_NODE_DEL_IREF bit in the delayed node's flags, use test_and_set_bit() which makes the code shorter without compromising readability and getting rid of the label and goto. Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Vacek Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index e77a597580c5..ce6e9f8812e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -2008,13 +2008,10 @@ int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode) * It is very rare. */ mutex_lock(&delayed_node->mutex); - if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) - goto release_node; - - set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); - delayed_node->count++; - atomic_inc(&fs_info->delayed_root->items); -release_node: + if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { + delayed_node->count++; + atomic_inc(&fs_info->delayed_root->items); + } mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); return 0; From 1361f7d8da3eb5a63ce520754d3e8c2db5790e7c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Nov 2025 13:01:20 +0000 Subject: [PATCH 141/147] btrfs: remove root argument from btrfs_del_dir_entries_in_log() There's no need to pass the root as we can extract it from the directory inode, so remove it. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/tree-log.c | 2 +- fs/btrfs/tree-log.h | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e13117eca16..554bd03c9f21 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4363,7 +4363,7 @@ skip_backref: */ if (!rename_ctx) { btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, dir, index); + btrfs_del_dir_entries_in_log(trans, name, dir, index); } /* diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cc27f87c4904..c2e45e64ab6c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3900,10 +3900,10 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * or the entire directory. */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { + struct btrfs_root *root = dir->root; BTRFS_PATH_AUTO_FREE(path); int ret; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 4f149d7d4fde..a0aeec2448c0 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -79,7 +79,6 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, From 139e3167d81143f9cd719fde420a825dae7b711d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Nov 2025 13:06:55 +0000 Subject: [PATCH 142/147] btrfs: reduce arguments to btrfs_del_inode_ref_in_log() Instead of passing a root and the objectid of the parent directory, just pass the directory inode, as like that we can extract 
both the root and the objectid, reducing the number of arguments by one. It also makes the function more consistent with other log tree functions in the sense that we pass the inode and not only its objectid. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- fs/btrfs/tree-log.c | 10 +++++----- fs/btrfs/tree-log.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 554bd03c9f21..3cf30abcdb08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4362,7 +4362,7 @@ skip_backref: * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_inode_ref_in_log(trans, name, inode, dir); btrfs_del_dir_entries_in_log(trans, name, dir, index); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c2e45e64ab6c..42c9327e0c12 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3938,11 +3938,11 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid) + struct btrfs_inode *inode, + struct btrfs_inode *dir) { - struct btrfs_root *log; + struct btrfs_root *root = dir->root; int ret; ret = inode_logged(trans, inode, NULL); @@ -3957,10 +3957,10 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret); if (WARN_ON(ret)) return; - log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL); + ret = btrfs_del_inode_ref(trans, root->log_root, name, btrfs_ino(inode), + btrfs_ino(dir), NULL); mutex_unlock(&inode->log_mutex); if 
(ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a0aeec2448c0..41e47fda036d 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -82,9 +82,9 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct fscrypt_str *name, - struct btrfs_inode *inode, u64 dirid); + struct btrfs_inode *inode, + struct btrfs_inode *dir); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, From 7c3acdb998dd723ac791cd4a47f13599d76a1f58 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Nov 2025 16:43:11 +0000 Subject: [PATCH 143/147] btrfs: send: add unlikely to all unexpected overflow checks There are several checks for unexpected overflows of buffers and path lengths that make us fail the send operation with an error if for some highly unexpected reason they happen. So add the unlikely tag to those checks to hint the compiler to generate better code, while also making it more explicit in the source that it's highly unexpected. With gcc 14.2.0-19 from Debian on x86_64, I also got a small reduction in the text size of the btrfs module.
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1936917 162723 15592 2115232 2046a0 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1936789 162723 15592 2115104 204620 fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3d437024e8bc..9da559f79f7f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1134,12 +1134,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { - if (name_len > XATTR_NAME_MAX) { + if (unlikely(name_len > XATTR_NAME_MAX)) { ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > - BTRFS_MAX_XATTR_SIZE(root->fs_info)) { + if (unlikely(name_len + data_len > + BTRFS_MAX_XATTR_SIZE(root->fs_info))) { ret = -E2BIG; goto out; } @@ -1147,7 +1147,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len > PATH_MAX) { + if (unlikely(name_len + data_len > PATH_MAX)) { ret = -ENAMETOOLONG; goto out; } @@ -5129,7 +5129,7 @@ static int process_verity(struct send_ctx *sctx) if (ret < 0) goto iput; - if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) { + if (unlikely(ret > FS_VERITY_MAX_DESCRIPTOR_SIZE)) { ret = -EMSGSIZE; goto iput; } @@ -5173,14 +5173,14 @@ static int put_data_header(struct send_ctx *sctx, u32 len) * Since v2, the data attribute header doesn't include a length, * it is implicitly to the end of the command. 
*/ - if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)) return -EOVERFLOW; put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); sctx->send_size += sizeof(__le16); } else { struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + if (unlikely(sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); @@ -5580,8 +5580,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * between the beginning of the command and the file data. */ data_offset = PAGE_ALIGN(sctx->send_size); - if (data_offset > sctx->send_max_size || - sctx->send_max_size - data_offset < disk_num_bytes) { + if (unlikely(data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes)) { ret = -EOVERFLOW; goto out; } From 5c9cac55b7a2c203cc135560fce053beea173c0f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Nov 2025 17:59:52 +0000 Subject: [PATCH 144/147] btrfs: send: do not allocate memory for xattr data when checking it exists When checking if xattrs were deleted we don't care about their data, but we are allocating memory for the data and copying it, which only wastes time and can result in an unnecessary error in case the allocation fails. So stop allocating memory and copying data by making find_xattr() and __find_xattr() skip those steps if the given data buffer is NULL. 
Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9da559f79f7f..130aabced207 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4943,6 +4943,7 @@ struct find_xattr_ctx { int found_idx; char *found_data; int found_data_len; + bool copy_data; }; static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, @@ -4954,9 +4955,11 @@ static int __find_xattr(int num, struct btrfs_key *di_key, const char *name, strncmp(name, ctx->name, name_len) == 0) { ctx->found_idx = num; ctx->found_data_len = data_len; - ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); - if (!ctx->found_data) - return -ENOMEM; + if (ctx->copy_data) { + ctx->found_data = kmemdup(data, data_len, GFP_KERNEL); + if (!ctx->found_data) + return -ENOMEM; + } return 1; } return 0; @@ -4976,6 +4979,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_idx = -1; ctx.found_data = NULL; ctx.found_data_len = 0; + ctx.copy_data = (data != NULL); ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) @@ -4987,7 +4991,7 @@ static int find_xattr(struct btrfs_root *root, *data = ctx.found_data; *data_len = ctx.found_data_len; } else { - kfree(ctx.found_data); + ASSERT(ctx.found_data == NULL); } return ctx.found_idx; } From 10934c131f9bcfb616dd8be9456f11efd6b240ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Nov 2025 17:06:46 +0100 Subject: [PATCH 145/147] btrfs: remaining BTRFS_PATH_AUTO_FREE conversions Do the remaining btrfs_path conversion to the auto cleaning, this seems to be the last one. Most of the conversions are trivial, only adding the declaration and removing the freeing, or changing the goto patterns to return. 
There are some functions with many changes, like __btrfs_free_extent(), btrfs_remove_from_free_space_tree() or btrfs_add_to_free_space_tree() but it still follows the same pattern. Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 +- fs/btrfs/dir-item.c | 3 +- fs/btrfs/extent-tree.c | 41 +++++------ fs/btrfs/free-space-tree.c | 29 ++++---- fs/btrfs/inode-item.c | 3 +- fs/btrfs/inode.c | 3 +- fs/btrfs/ioctl.c | 37 ++++------ fs/btrfs/qgroup.c | 142 ++++++++++++++----------------------- fs/btrfs/super.c | 10 +-- fs/btrfs/tree-log.c | 4 +- fs/btrfs/volumes.c | 3 +- fs/btrfs/xattr.c | 3 +- 12 files changed, 105 insertions(+), 176 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 8ae73123b610..6e6939d2e902 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1065,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_group *block_group; struct btrfs_free_cluster *cluster; struct inode *inode; @@ -1305,7 +1305,6 @@ out: btrfs_put_block_group(block_group); if (remove_rsv) btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 77e1bcb2a74b..085a83ae9e62 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -112,7 +112,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; @@ -164,7 +164,6 @@ second_insert: ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: - btrfs_free_path(path); if (ret) return ret; if (ret2) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 
a3646440c4fe..e4cae34620d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3085,7 +3085,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -3120,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, node->bytenr, refs_to_drop); ret = -EINVAL; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } if (is_data) @@ -3165,15 +3165,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref slot %u, no EXTENT/METADATA_ITEM found but has inline extent ref", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, NULL, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3222,7 +3221,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } extent_slot = path->slots[0]; } @@ -3231,10 +3230,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu slot %d", bytenr, node->parent, node->ref_root, owner_objectid, owner_offset, path->slots[0]); - goto out; + return ret; } else { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -3245,7 +3244,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, extent_slot, struct 
btrfs_extent_item); @@ -3259,8 +3258,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, key.objectid, key.type, key.offset, path->slots[0], owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); @@ -3271,8 +3269,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "trying to drop %d refs but we only have %llu for bytenr %llu slot %u", refs_to_drop, refs, bytenr, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } refs -= refs_to_drop; @@ -3288,8 +3285,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found, slot %u", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } else { btrfs_set_extent_refs(leaf, ei, refs); @@ -3299,7 +3295,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } } } else { @@ -3319,8 +3315,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "invalid refs_to_drop, current refs %u refs_to_drop %u slot %u", extent_data_ref_count(path, iref), refs_to_drop, path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } if (iref) { if (unlikely(path->slots[0] != extent_slot)) { @@ -3328,8 +3323,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "invalid iref, extent item key " BTRFS_KEY_FMT " slot %u doesn't have wanted iref", BTRFS_KEY_FMT_VALUE(&key), path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } } else { /* @@ -3342,8 +3336,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, abort_and_dump(trans, path, "invalid SHARED_* item slot %u, previous item is not 
EXTENT/METADATA_ITEM", path->slots[0]); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } path->slots[0] = extent_slot; num_to_del = 2; @@ -3364,7 +3357,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); @@ -3372,8 +3365,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); -out: - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 47745ae23c7d..1ad2ad384b9e 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -833,7 +833,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -843,7 +843,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -851,7 +851,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -861,8 +861,7 @@ int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1015,7 +1014,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, u64 start, u64 size) { struct btrfs_block_group *block_group; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE)) @@ -1025,7 +1024,7 
@@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } block_group = btrfs_lookup_block_group(trans->fs_info, start); @@ -1033,7 +1032,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } mutex_lock(&block_group->free_space_lock); @@ -1043,8 +1042,7 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); -out: - btrfs_free_path(path); + return ret; } @@ -1458,7 +1456,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *root = btrfs_free_space_root(block_group); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key, found_key; struct extent_buffer *leaf; u64 start, end; @@ -1477,7 +1475,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, if (unlikely(!path)) { ret = -ENOMEM; btrfs_abort_transaction(trans, ret); - goto out; + return ret; } start = block_group->start; @@ -1491,7 +1489,7 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } leaf = path->nodes[0]; @@ -1522,14 +1520,13 @@ int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], nr); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); - goto out; + return ret; } btrfs_release_path(path); } ret = 0; -out: - btrfs_free_path(path); + return ret; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 98dacfd03234..b73e1dd97208 100644 --- a/fs/btrfs/inode-item.c 
+++ b/fs/btrfs/inode-item.c @@ -444,7 +444,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_truncate_control *control) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -730,6 +730,5 @@ out: if (!ret && control->last_size > new_size) control->last_size = new_size; - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3cf30abcdb08..0cbac085cdaf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4462,7 +4462,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; @@ -4555,7 +4555,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4513c236d281..3a27c7a71e0c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1603,7 +1603,7 @@ static noinline int search_ioctl(struct btrfs_root *root, { struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; int num_found = 0; unsigned long sk_offset = 0; @@ -1623,10 +1623,8 @@ static noinline int search_ioctl(struct btrfs_root *root, } else { /* Look up the root from the arguments. 
*/ root = btrfs_get_fs_root(info, sk->tree_id, true); - if (IS_ERR(root)) { - btrfs_free_path(path); + if (IS_ERR(root)) return PTR_ERR(root); - } } key.objectid = sk->min_objectid; @@ -1660,7 +1658,6 @@ static noinline int search_ioctl(struct btrfs_root *root, sk->nr_items = num_found; btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1743,7 +1740,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, int total_len = 0; struct btrfs_inode_ref *iref; struct extent_buffer *l; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (dirid == BTRFS_FIRST_FREE_OBJECTID) { name[0]='\0'; @@ -1804,7 +1801,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, ret = 0; out: btrfs_put_root(root); - btrfs_free_path(path); return ret; } @@ -1821,7 +1817,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct btrfs_root *root = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; char *ptr; @@ -1842,10 +1838,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1]; root = btrfs_get_fs_root(fs_info, treeid, true); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } + if (IS_ERR(root)) + return PTR_ERR(root); key.objectid = dirid; key.type = BTRFS_INODE_REF_KEY; @@ -1920,12 +1914,10 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_ROOT_REF_KEY; key.offset = args->treeid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret < 0) + return ret; + else if (ret > 0) + return -ENOENT; leaf = path->nodes[0]; slot = path->slots[0]; @@ -1935,10 +1927,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, item_len = btrfs_item_size(leaf, 
slot); /* Check if dirid in ROOT_REF corresponds to passed dirid */ rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref); - if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) { - ret = -EINVAL; - goto out; - } + if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) + return -EINVAL; /* Copy subvolume's name */ item_off += sizeof(struct btrfs_root_ref); @@ -1948,8 +1938,7 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, out_put: btrfs_put_root(root); -out: - btrfs_free_path(path); + return ret; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 58fb55644be5..9e2b53e90dcb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -660,7 +660,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -672,7 +672,6 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, key.offset = dst; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_free_path(path); return ret; } @@ -681,7 +680,7 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -694,24 +693,19 @@ static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; - ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, quota_root, path); } static int add_qgroup_item(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root, u64 qgroupid) { int ret; - struct 
btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_qgroup_info_item *qgroup_info; struct btrfs_qgroup_limit_item *qgroup_limit; struct extent_buffer *leaf; @@ -737,7 +731,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_info = btrfs_item_ptr(leaf, path->slots[0], @@ -754,7 +748,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); if (ret && ret != -EEXIST) - goto out; + return ret; leaf = path->nodes[0]; qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], @@ -765,17 +759,14 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) { int ret; struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; path = btrfs_alloc_path(); @@ -787,33 +778,27 @@ static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) key.offset = qgroupid; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); if (ret) - goto out; + return ret; btrfs_release_path(path); key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; - if (ret > 0) { - ret = -ENOENT; - goto out; - } + if (ret > 0) + return -ENOENT; ret = btrfs_del_item(trans, quota_root, path); -out: - btrfs_free_path(path); return ret; } @@ 
-821,7 +806,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, struct btrfs_qgroup *qgroup) { struct btrfs_root *quota_root = trans->fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_limit_item *qgroup_limit; @@ -841,7 +826,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -851,8 +836,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); -out: - btrfs_free_path(path); + return ret; } @@ -861,7 +845,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *l; struct btrfs_qgroup_info_item *qgroup_info; @@ -884,7 +868,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -894,8 +878,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); -out: - btrfs_free_path(path); + return ret; } @@ -903,7 +886,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root = fs_info->quota_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer 
*l; struct btrfs_qgroup_status_item *ptr; @@ -923,7 +906,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) ret = -ENOENT; if (ret) - goto out; + return ret; l = path->nodes[0]; slot = path->slots[0]; @@ -933,8 +916,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_generation(l, ptr, trans->transid); btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); -out: - btrfs_free_path(path); + return ret; } @@ -944,7 +926,7 @@ out: static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf = NULL; int ret; @@ -961,7 +943,7 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; nr = btrfs_header_nritems(leaf); if (!nr) @@ -974,14 +956,12 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_quota_enable(struct btrfs_fs_info *fs_info, @@ -1719,8 +1699,7 @@ out: static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) { struct btrfs_key key; - struct btrfs_path *path; - int ret; + BTRFS_PATH_AUTO_FREE(path); /* * Squota would never be inconsistent, but there can still be case @@ -1753,13 +1732,11 @@ static int can_delete_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup if (!path) return -ENOMEM; - ret = btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); - btrfs_free_path(path); /* * The @ret from btrfs_find_root() exactly matches our definition for * the return value, thus can be returned directly. 
*/ - return ret; + return btrfs_find_root(fs_info->tree_root, &key, path, NULL, NULL); } int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) @@ -2308,7 +2285,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, bool trace_leaf) { struct btrfs_key key; - struct btrfs_path *src_path; + BTRFS_PATH_AUTO_FREE(src_path); struct btrfs_fs_info *fs_info = trans->fs_info; u32 nodesize = fs_info->nodesize; int cur_level = root_level; @@ -2320,10 +2297,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, return -EINVAL; src_path = btrfs_alloc_path(); - if (!src_path) { - ret = -ENOMEM; - goto out; - } + if (!src_path) + return -ENOMEM; if (dst_level) btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); @@ -2349,10 +2324,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, parent_slot = src_path->slots[cur_level + 1]; eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); src_path->nodes[cur_level] = eb; @@ -2373,10 +2346,8 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, &src_key, src_path->slots[cur_level]); } /* Content mismatch, something went wrong */ - if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { - ret = -ENOENT; - goto out; - } + if (btrfs_comp_cpu_keys(&dst_key, &src_key)) + return -ENOENT; cur_level--; } @@ -2387,21 +2358,20 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, nodesize); if (ret < 0) - goto out; + return ret; /* Record leaf file extents */ if (dst_level == 0 && trace_leaf) { ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); if (ret < 0) - goto out; + return ret; ret = btrfs_qgroup_trace_leaf_items(trans, 
dst_path->nodes[0]); } -out: - btrfs_free_path(src_path); + return ret; } @@ -2602,7 +2572,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, int level; u8 drop_subptree_thres; struct extent_buffer *eb = root_eb; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); ASSERT(0 <= root_level && root_level < BTRFS_MAX_LEVEL); ASSERT(root_eb != NULL); @@ -2635,12 +2605,12 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) - goto out; + return ret; } if (root_level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); - goto out; + return ret; } path = btrfs_alloc_path(); @@ -2676,10 +2646,8 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); eb = btrfs_read_node_slot(eb, parent_slot); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } + if (IS_ERR(eb)) + return PTR_ERR(eb); path->nodes[level] = eb; path->slots[level] = 0; @@ -2690,14 +2658,14 @@ walk_down: ret = btrfs_qgroup_trace_extent(trans, child_bytenr, fs_info->nodesize); if (ret) - goto out; + return ret; } if (level == 0) { ret = btrfs_qgroup_trace_leaf_items(trans, path->nodes[level]); if (ret) - goto out; + return ret; /* Nonzero return here means we completed our search */ ret = adjust_slots_upwards(path, root_level); @@ -2711,11 +2679,7 @@ walk_down: level--; } - ret = 0; -out: - btrfs_free_path(path); - - return ret; + return 0; } static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7e4cfae63bcc..1999533b52be 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -807,17 +807,15 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, struct btrfs_root_ref *root_ref; struct btrfs_inode_ref *inode_ref; struct btrfs_key key; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); char *name = NULL, *ptr; u64 dirid; int len; int ret; path = btrfs_alloc_path(); 
- if (!path) { - ret = -ENOMEM; - goto err; - } + if (!path) + return ERR_PTR(-ENOMEM); name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { @@ -905,7 +903,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, fs_root = NULL; } - btrfs_free_path(path); if (ptr == name + PATH_MAX - 1) { name[0] = '/'; name[1] = '\0'; @@ -916,7 +913,6 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, err: btrfs_put_root(fs_root); - btrfs_free_path(path); kfree(name); return ERR_PTR(ret); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 42c9327e0c12..651e6747dcec 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2648,7 +2648,7 @@ static noinline int replay_dir_deletes(struct walk_control *wc, int ret = 0; struct btrfs_key dir_key; struct btrfs_key found_key; - struct btrfs_path *log_path; + BTRFS_PATH_AUTO_FREE(log_path); struct btrfs_inode *dir; dir_key.objectid = dirid; @@ -2665,7 +2665,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, * we replay the deletes before we copy in the inode item from the log. 
*/ if (IS_ERR(dir)) { - btrfs_free_path(log_path); ret = PTR_ERR(dir); if (ret == -ENOENT) ret = 0; @@ -2745,7 +2744,6 @@ static noinline int replay_dir_deletes(struct walk_control *wc, ret = 0; out: btrfs_release_path(wc->subvol_path); - btrfs_free_path(log_path); iput(&dir->vfs_inode); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e6a3f3ceb74b..e81c8ac0d8ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4054,7 +4054,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_root *chunk_root = fs_info->chunk_root; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -4225,7 +4225,6 @@ loop: goto again; } error: - btrfs_free_path(path); if (enospc_errors) { btrfs_info(fs_info, "%d enospc errors during balance", enospc_errors); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 98d6aa3b7d6a..ab55d10bd71f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -73,7 +73,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); size_t name_len = strlen(name); int ret = 0; @@ -200,7 +200,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, */ } out: - btrfs_free_path(path); if (!ret) { set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); From 1c3e03b34042c2dff15d7f262b768908e4b02537 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 21 Nov 2025 15:56:14 +0000 Subject: [PATCH 146/147] btrfs: remove redundant zero/NULL initializations in btrfs_alloc_root() We have allocated the root with kzalloc() so all the memory is already zero initialized, therefore it's redundant to assign 0 and NULL to several of the root members. 
Remove all of them except the atomic initializations since atomic_t is an opaque type and it's not a good practice to assume its internals. This slightly reduces the binary size. With gcc 14.2.0-19 from Debian on x86_64, before this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939404 162963 15592 2117959 205147 fs/btrfs/btrfs.ko After this change: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1939212 162963 15592 2117767 205087 fs/btrfs/btrfs.ko Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fe62f5a244f5..89149fac804c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -652,20 +652,10 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, if (!root) return NULL; - memset(&root->root_key, 0, sizeof(root->root_key)); - memset(&root->root_item, 0, sizeof(root->root_item)); - memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); root->fs_info = fs_info; root->root_key.objectid = objectid; - root->node = NULL; - root->commit_root = NULL; - root->state = 0; RB_CLEAR_NODE(&root->rb_node); - btrfs_set_root_last_trans(root, 0); - root->free_objectid = 0; - root->nr_delalloc_inodes = 0; - root->nr_ordered_extents = 0; xa_init(&root->inodes); xa_init(&root->delayed_nodes); @@ -699,10 +689,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - btrfs_set_root_last_log_commit(root, 0); - root->anon_dev = 0; if (!btrfs_is_testing(fs_info)) { btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, IO_TREE_ROOT_DIRTY_LOG_PAGES); From 9e0e6577b3e5e5cf7c1acd178eb648e8f830ba17 Mon Sep 17 
00:00:00 2001 From: Filipe Manana Date: Fri, 21 Nov 2025 16:56:46 +0000 Subject: [PATCH 147/147] btrfs: remove unnecessary inode key in btrfs_log_all_parents() We are setting up an inode key to lookup parent directory inode but all we need is the inode's objectid. The use of the key was necessary in the past but since commit 0202e83fdab0 ("btrfs: simplify iget helpers") we only need the objectid. So remove the key variable in the stack and use instead a simple u64 for the inode's objectid. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 651e6747dcec..fff37c8d96a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7201,28 +7201,24 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(leaf, slot); ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { - struct btrfs_key inode_key; + u64 dir_id; struct btrfs_inode *dir_inode; - inode_key.type = BTRFS_INODE_ITEM_KEY; - inode_key.offset = 0; - if (key.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *) (ptr + cur_offset); - inode_key.objectid = btrfs_inode_extref_parent( - leaf, extref); + dir_id = btrfs_inode_extref_parent(leaf, extref); cur_offset += sizeof(*extref); cur_offset += btrfs_inode_extref_name_len(leaf, extref); } else { - inode_key.objectid = key.offset; + dir_id = key.offset; cur_offset = item_size; } - dir_inode = btrfs_iget_logging(inode_key.objectid, root); + dir_inode = btrfs_iget_logging(dir_id, root); /* * If the parent inode was deleted, return an error to * fallback to a transaction commit. This is to prevent