mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 01:04:41 +01:00
fsverity: kick off hash readahead at data I/O submission time
Currently all reads of the fsverity hashes are kicked off from the data I/O completion handler, leading to needlessly dependent I/O. This is worked around a bit by performing readahead on the level 0 nodes, but that is still fairly ineffective.

Switch to a model where the ->read_folio and ->readahead methods instead kick off explicit readahead of the fsverity hashes so they are usually available at I/O completion time.

For 64k sequential reads on my test VM this improves read performance from 2.4GB/s - 2.6GB/s to 3.5GB/s - 3.9GB/s. The improvements for random reads are likely to be even bigger.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: David Sterba <dsterba@suse.com> # btrfs
Link: https://lore.kernel.org/r/20260202060754.270269-5-hch@lst.de
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
This commit is contained in:
parent
314b652b7e
commit
f1a6cf44b3
9 changed files with 161 additions and 74 deletions
|
|
@ -697,7 +697,6 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
|
|||
*
|
||||
* @inode: inode to read a merkle tree page for
|
||||
* @index: page index relative to the start of the merkle tree
|
||||
* @num_ra_pages: number of pages to readahead. Optional, we ignore it
|
||||
*
|
||||
* The Merkle tree is stored in the filesystem btree, but its pages are cached
|
||||
* with a logical position past EOF in the inode's mapping.
|
||||
|
|
@ -705,8 +704,7 @@ int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size)
|
|||
* Returns the page we read, or an ERR_PTR on error.
|
||||
*/
|
||||
static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
|
||||
pgoff_t index,
|
||||
unsigned long num_ra_pages)
|
||||
pgoff_t index)
|
||||
{
|
||||
struct folio *folio;
|
||||
u64 off = (u64)index << PAGE_SHIFT;
|
||||
|
|
|
|||
|
|
@ -397,18 +397,20 @@ next_page:
|
|||
|
||||
int ext4_read_folio(struct file *file, struct folio *folio)
|
||||
{
|
||||
int ret = -EAGAIN;
|
||||
struct inode *inode = folio->mapping->host;
|
||||
int ret;
|
||||
|
||||
trace_ext4_read_folio(inode, folio);
|
||||
|
||||
if (ext4_has_inline_data(inode))
|
||||
if (ext4_has_inline_data(inode)) {
|
||||
ret = ext4_readpage_inline(inode, folio);
|
||||
if (ret != -EAGAIN)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (ret == -EAGAIN)
|
||||
return ext4_mpage_readpages(inode, NULL, folio);
|
||||
|
||||
return ret;
|
||||
if (ext4_need_verity(inode, folio->index))
|
||||
fsverity_readahead(inode, folio->index, folio_nr_pages(folio));
|
||||
return ext4_mpage_readpages(inode, NULL, folio);
|
||||
}
|
||||
|
||||
void ext4_readahead(struct readahead_control *rac)
|
||||
|
|
@ -419,6 +421,9 @@ void ext4_readahead(struct readahead_control *rac)
|
|||
if (ext4_has_inline_data(inode))
|
||||
return;
|
||||
|
||||
if (ext4_need_verity(inode, readahead_index(rac)))
|
||||
fsverity_readahead(inode, readahead_index(rac),
|
||||
readahead_count(rac));
|
||||
ext4_mpage_readpages(inode, rac, NULL);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -358,11 +358,17 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
|
|||
}
|
||||
|
||||
static struct page *ext4_read_merkle_tree_page(struct inode *inode,
|
||||
pgoff_t index,
|
||||
unsigned long num_ra_pages)
|
||||
pgoff_t index)
|
||||
{
|
||||
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
|
||||
return generic_read_merkle_tree_page(inode, index, num_ra_pages);
|
||||
return generic_read_merkle_tree_page(inode, index);
|
||||
}
|
||||
|
||||
static void ext4_readahead_merkle_tree(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
|
||||
generic_readahead_merkle_tree(inode, index, nr_pages);
|
||||
}
|
||||
|
||||
static int ext4_write_merkle_tree_block(struct file *file, const void *buf,
|
||||
|
|
@ -380,5 +386,6 @@ const struct fsverity_operations ext4_verityops = {
|
|||
.end_enable_verity = ext4_end_enable_verity,
|
||||
.get_verity_descriptor = ext4_get_verity_descriptor,
|
||||
.read_merkle_tree_page = ext4_read_merkle_tree_page,
|
||||
.readahead_merkle_tree = ext4_readahead_merkle_tree,
|
||||
.write_merkle_tree_block = ext4_write_merkle_tree_block,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -2458,7 +2458,7 @@ next_page:
|
|||
static int f2fs_read_data_folio(struct file *file, struct folio *folio)
|
||||
{
|
||||
struct inode *inode = folio->mapping->host;
|
||||
int ret = -EAGAIN;
|
||||
int ret;
|
||||
|
||||
trace_f2fs_readpage(folio, DATA);
|
||||
|
||||
|
|
@ -2468,11 +2468,15 @@ static int f2fs_read_data_folio(struct file *file, struct folio *folio)
|
|||
}
|
||||
|
||||
/* If the file has inline data, try to read it directly */
|
||||
if (f2fs_has_inline_data(inode))
|
||||
if (f2fs_has_inline_data(inode)) {
|
||||
ret = f2fs_read_inline_data(inode, folio);
|
||||
if (ret == -EAGAIN)
|
||||
ret = f2fs_mpage_readpages(inode, NULL, folio);
|
||||
return ret;
|
||||
if (ret != -EAGAIN)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (f2fs_need_verity(inode, folio->index))
|
||||
fsverity_readahead(inode, folio->index, folio_nr_pages(folio));
|
||||
return f2fs_mpage_readpages(inode, NULL, folio);
|
||||
}
|
||||
|
||||
static void f2fs_readahead(struct readahead_control *rac)
|
||||
|
|
@ -2488,6 +2492,9 @@ static void f2fs_readahead(struct readahead_control *rac)
|
|||
if (f2fs_has_inline_data(inode))
|
||||
return;
|
||||
|
||||
if (f2fs_need_verity(inode, readahead_index(rac)))
|
||||
fsverity_readahead(inode, readahead_index(rac),
|
||||
readahead_count(rac));
|
||||
f2fs_mpage_readpages(inode, rac, NULL);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -256,11 +256,17 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
|
|||
}
|
||||
|
||||
static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
|
||||
pgoff_t index,
|
||||
unsigned long num_ra_pages)
|
||||
pgoff_t index)
|
||||
{
|
||||
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
|
||||
return generic_read_merkle_tree_page(inode, index, num_ra_pages);
|
||||
return generic_read_merkle_tree_page(inode, index);
|
||||
}
|
||||
|
||||
static void f2fs_readahead_merkle_tree(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
|
||||
generic_readahead_merkle_tree(inode, index, nr_pages);
|
||||
}
|
||||
|
||||
static int f2fs_write_merkle_tree_block(struct file *file, const void *buf,
|
||||
|
|
@ -278,5 +284,6 @@ const struct fsverity_operations f2fs_verityops = {
|
|||
.end_enable_verity = f2fs_end_enable_verity,
|
||||
.get_verity_descriptor = f2fs_get_verity_descriptor,
|
||||
.read_merkle_tree_page = f2fs_read_merkle_tree_page,
|
||||
.readahead_merkle_tree = f2fs_readahead_merkle_tree,
|
||||
.write_merkle_tree_block = f2fs_write_merkle_tree_block,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
* Copyright 2019 Google LLC
|
||||
*/
|
||||
|
||||
#include <linux/export.h>
|
||||
#include <linux/fsverity.h>
|
||||
#include <linux/pagemap.h>
|
||||
|
||||
|
|
@ -10,33 +11,48 @@
|
|||
* generic_read_merkle_tree_page - generic ->read_merkle_tree_page helper
|
||||
* @inode: inode containing the Merkle tree
|
||||
* @index: 0-based index of the Merkle tree page in the inode
|
||||
* @num_ra_pages: The number of Merkle tree pages that should be prefetched.
|
||||
*
|
||||
* The caller needs to adjust @index from the Merkle-tree relative index passed
|
||||
* to ->read_merkle_tree_page to the actual index where the Merkle tree is
|
||||
* stored in the page cache for @inode.
|
||||
*/
|
||||
struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index,
|
||||
unsigned long num_ra_pages)
|
||||
struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index)
|
||||
{
|
||||
struct folio *folio;
|
||||
|
||||
folio = read_mapping_folio(inode->i_mapping, index, NULL);
|
||||
if (IS_ERR(folio))
|
||||
return ERR_CAST(folio);
|
||||
return folio_file_page(folio, index);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page);
|
||||
|
||||
/**
|
||||
* generic_readahead_merkle_tree() - generic ->readahead_merkle_tree helper
|
||||
* @inode: inode containing the Merkle tree
|
||||
* @index: 0-based index of the first Merkle tree page to read ahead in the
|
||||
* inode
|
||||
* @nr_pages: the number of Merkle tree pages that should be read ahead
|
||||
*
|
||||
* The caller needs to adjust @index from the Merkle-tree relative index passed
|
||||
* to ->readahead_merkle_tree to the actual index where the Merkle tree is
|
||||
* stored in the page cache for @inode.
|
||||
*/
|
||||
void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
struct folio *folio;
|
||||
|
||||
lockdep_assert_held(&inode->i_mapping->invalidate_lock);
|
||||
|
||||
folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
|
||||
if (folio == ERR_PTR(-ENOENT) ||
|
||||
(!IS_ERR(folio) && !folio_test_uptodate(folio))) {
|
||||
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
|
||||
|
||||
if (!IS_ERR(folio)) {
|
||||
folio_put(folio);
|
||||
} else if (num_ra_pages > 1) {
|
||||
filemap_invalidate_lock_shared(inode->i_mapping);
|
||||
page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
|
||||
filemap_invalidate_unlock_shared(inode->i_mapping);
|
||||
}
|
||||
folio = read_mapping_folio(inode->i_mapping, index, NULL);
|
||||
page_cache_ra_unbounded(&ractl, nr_pages, 0);
|
||||
}
|
||||
if (IS_ERR(folio))
|
||||
return ERR_CAST(folio);
|
||||
return folio_file_page(folio, index);
|
||||
if (!IS_ERR(folio))
|
||||
folio_put(folio);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_read_merkle_tree_page);
|
||||
EXPORT_SYMBOL_GPL(generic_readahead_merkle_tree);
|
||||
|
|
|
|||
|
|
@ -28,24 +28,33 @@ static int fsverity_read_merkle_tree(struct inode *inode,
|
|||
if (offset >= end_offset)
|
||||
return 0;
|
||||
offs_in_page = offset_in_page(offset);
|
||||
index = offset >> PAGE_SHIFT;
|
||||
last_index = (end_offset - 1) >> PAGE_SHIFT;
|
||||
|
||||
/*
|
||||
* Kick off readahead for the range we are going to read to ensure a
|
||||
* single large sequential read instead of lots of small ones.
|
||||
*/
|
||||
if (inode->i_sb->s_vop->readahead_merkle_tree) {
|
||||
filemap_invalidate_lock_shared(inode->i_mapping);
|
||||
inode->i_sb->s_vop->readahead_merkle_tree(
|
||||
inode, index, last_index - index + 1);
|
||||
filemap_invalidate_unlock_shared(inode->i_mapping);
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate through each Merkle tree page in the requested range and copy
|
||||
* the requested portion to userspace. Note that the Merkle tree block
|
||||
* size isn't important here, as we are returning a byte stream; i.e.,
|
||||
* we can just work with pages even if the tree block size != PAGE_SIZE.
|
||||
*/
|
||||
for (index = offset >> PAGE_SHIFT; index <= last_index; index++) {
|
||||
unsigned long num_ra_pages =
|
||||
min_t(unsigned long, last_index - index + 1,
|
||||
inode->i_sb->s_bdi->io_pages);
|
||||
for (; index <= last_index; index++) {
|
||||
unsigned int bytes_to_copy = min_t(u64, end_offset - offset,
|
||||
PAGE_SIZE - offs_in_page);
|
||||
struct page *page;
|
||||
const void *virt;
|
||||
|
||||
page = vops->read_merkle_tree_page(inode, index, num_ra_pages);
|
||||
page = vops->read_merkle_tree_page(inode, index);
|
||||
if (IS_ERR(page)) {
|
||||
err = PTR_ERR(page);
|
||||
fsverity_err(inode,
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ struct fsverity_pending_block {
|
|||
struct fsverity_verification_context {
|
||||
struct inode *inode;
|
||||
struct fsverity_info *vi;
|
||||
unsigned long max_ra_pages;
|
||||
|
||||
/*
|
||||
* This is the queue of data blocks that are pending verification. When
|
||||
|
|
@ -37,6 +36,50 @@ struct fsverity_verification_context {
|
|||
|
||||
static struct workqueue_struct *fsverity_read_workqueue;
|
||||
|
||||
/**
|
||||
* fsverity_readahead() - kick off readahead on fsverity hashes
|
||||
* @inode: inode that is being read
|
||||
* @index: first file data page index that is being read
|
||||
* @nr_pages: number of file data pages to be read
|
||||
*
|
||||
* Start readahead on the fsverity hashes that are needed to verify the file
|
||||
* data in the range from @index to @index + @nr_pages (exclusive upper bound).
|
||||
*
|
||||
* To be called from the file systems' ->read_folio and ->readahead methods to
|
||||
* ensure that the hashes are already cached on completion of the file data
|
||||
* read if possible.
|
||||
*/
|
||||
void fsverity_readahead(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
const struct fsverity_info *vi = *fsverity_info_addr(inode);
|
||||
const struct merkle_tree_params *params = &vi->tree_params;
|
||||
u64 start_hidx = (u64)index << params->log_blocks_per_page;
|
||||
u64 end_hidx =
|
||||
(((u64)index + nr_pages) << params->log_blocks_per_page) - 1;
|
||||
int level;
|
||||
|
||||
if (!inode->i_sb->s_vop->readahead_merkle_tree)
|
||||
return;
|
||||
|
||||
for (level = 0; level < params->num_levels; level++) {
|
||||
unsigned long level_start = params->level_start[level];
|
||||
unsigned long next_start_hidx = start_hidx >> params->log_arity;
|
||||
unsigned long next_end_hidx = end_hidx >> params->log_arity;
|
||||
pgoff_t start_idx = (level_start + next_start_hidx) >>
|
||||
params->log_blocks_per_page;
|
||||
pgoff_t end_idx = (level_start + next_end_hidx) >>
|
||||
params->log_blocks_per_page;
|
||||
|
||||
inode->i_sb->s_vop->readahead_merkle_tree(
|
||||
inode, start_idx, end_idx - start_idx + 1);
|
||||
|
||||
start_hidx = next_start_hidx;
|
||||
end_hidx = next_end_hidx;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(fsverity_readahead);
|
||||
|
||||
/*
|
||||
* Returns true if the hash block with index @hblock_idx in the tree, located in
|
||||
* @hpage, has already been verified.
|
||||
|
|
@ -114,8 +157,7 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
|
|||
* Return: %true if the data block is valid, else %false.
|
||||
*/
|
||||
static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
|
||||
const struct fsverity_pending_block *dblock,
|
||||
unsigned long max_ra_pages)
|
||||
const struct fsverity_pending_block *dblock)
|
||||
{
|
||||
const u64 data_pos = dblock->pos;
|
||||
const struct merkle_tree_params *params = &vi->tree_params;
|
||||
|
|
@ -200,8 +242,7 @@ static bool verify_data_block(struct inode *inode, struct fsverity_info *vi,
|
|||
(params->block_size - 1);
|
||||
|
||||
hpage = inode->i_sb->s_vop->read_merkle_tree_page(inode,
|
||||
hpage_idx, level == 0 ? min(max_ra_pages,
|
||||
params->tree_pages - hpage_idx) : 0);
|
||||
hpage_idx);
|
||||
if (IS_ERR(hpage)) {
|
||||
fsverity_err(inode,
|
||||
"Error %ld reading Merkle tree page %lu",
|
||||
|
|
@ -272,14 +313,12 @@ error:
|
|||
|
||||
static void
|
||||
fsverity_init_verification_context(struct fsverity_verification_context *ctx,
|
||||
struct inode *inode,
|
||||
unsigned long max_ra_pages)
|
||||
struct inode *inode)
|
||||
{
|
||||
struct fsverity_info *vi = *fsverity_info_addr(inode);
|
||||
|
||||
ctx->inode = inode;
|
||||
ctx->vi = vi;
|
||||
ctx->max_ra_pages = max_ra_pages;
|
||||
ctx->num_pending = 0;
|
||||
if (vi->tree_params.hash_alg->algo_id == HASH_ALGO_SHA256 &&
|
||||
sha256_finup_2x_is_optimized())
|
||||
|
|
@ -322,8 +361,7 @@ fsverity_verify_pending_blocks(struct fsverity_verification_context *ctx)
|
|||
}
|
||||
|
||||
for (i = 0; i < ctx->num_pending; i++) {
|
||||
if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i],
|
||||
ctx->max_ra_pages))
|
||||
if (!verify_data_block(ctx->inode, vi, &ctx->pending_blocks[i]))
|
||||
return false;
|
||||
}
|
||||
fsverity_clear_pending_blocks(ctx);
|
||||
|
|
@ -373,7 +411,7 @@ bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset)
|
|||
{
|
||||
struct fsverity_verification_context ctx;
|
||||
|
||||
fsverity_init_verification_context(&ctx, folio->mapping->host, 0);
|
||||
fsverity_init_verification_context(&ctx, folio->mapping->host);
|
||||
|
||||
if (fsverity_add_data_blocks(&ctx, folio, len, offset) &&
|
||||
fsverity_verify_pending_blocks(&ctx))
|
||||
|
|
@ -403,22 +441,8 @@ void fsverity_verify_bio(struct bio *bio)
|
|||
struct inode *inode = bio_first_folio_all(bio)->mapping->host;
|
||||
struct fsverity_verification_context ctx;
|
||||
struct folio_iter fi;
|
||||
unsigned long max_ra_pages = 0;
|
||||
|
||||
if (bio->bi_opf & REQ_RAHEAD) {
|
||||
/*
|
||||
* If this bio is for data readahead, then we also do readahead
|
||||
* of the first (largest) level of the Merkle tree. Namely,
|
||||
* when a Merkle tree page is read, we also try to piggy-back on
|
||||
* some additional pages -- up to 1/4 the number of data pages.
|
||||
*
|
||||
* This improves sequential read performance, as it greatly
|
||||
* reduces the number of I/O requests made to the Merkle tree.
|
||||
*/
|
||||
max_ra_pages = bio->bi_iter.bi_size >> (PAGE_SHIFT + 2);
|
||||
}
|
||||
|
||||
fsverity_init_verification_context(&ctx, inode, max_ra_pages);
|
||||
fsverity_init_verification_context(&ctx, inode);
|
||||
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
if (!fsverity_add_data_blocks(&ctx, fi.folio, fi.length,
|
||||
|
|
|
|||
|
|
@ -97,10 +97,6 @@ struct fsverity_operations {
|
|||
*
|
||||
* @inode: the inode
|
||||
* @index: 0-based index of the page within the Merkle tree
|
||||
* @num_ra_pages: The number of Merkle tree pages that should be
|
||||
* prefetched starting at @index if the page at @index
|
||||
* isn't already cached. Implementations may ignore this
|
||||
* argument; it's only a performance optimization.
|
||||
*
|
||||
* This can be called at any time on an open verity file. It may be
|
||||
* called by multiple processes concurrently, even with the same page.
|
||||
|
|
@ -110,8 +106,23 @@ struct fsverity_operations {
|
|||
* Return: the page on success, ERR_PTR() on failure
|
||||
*/
|
||||
struct page *(*read_merkle_tree_page)(struct inode *inode,
|
||||
pgoff_t index,
|
||||
unsigned long num_ra_pages);
|
||||
pgoff_t index);
|
||||
|
||||
/**
|
||||
* Perform readahead of a Merkle tree for the given inode.
|
||||
*
|
||||
* @inode: the inode
|
||||
* @index: 0-based index of the first page within the Merkle tree
|
||||
* @nr_pages: number of pages to be read ahead.
|
||||
*
|
||||
* This can be called at any time on an open verity file. It may be
|
||||
* called by multiple processes concurrently, even with the same range.
|
||||
*
|
||||
* Optional method so that ->read_merkle_tree_page preferably finds
|
||||
* cached data instead of issuing dependent I/O.
|
||||
*/
|
||||
void (*readahead_merkle_tree)(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages);
|
||||
|
||||
/**
|
||||
* Write a Merkle tree block to the given file.
|
||||
|
|
@ -308,8 +319,11 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp)
|
|||
}
|
||||
|
||||
void fsverity_cleanup_inode(struct inode *inode);
|
||||
void fsverity_readahead(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages);
|
||||
|
||||
struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index,
|
||||
unsigned long num_ra_pages);
|
||||
struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index);
|
||||
void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
|
||||
unsigned long nr_pages);
|
||||
|
||||
#endif /* _LINUX_FSVERITY_H */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue