From fa0bdd45d7e3703826ea75f5fe3359865d75c319 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:32 +0100 Subject: [PATCH 01/15] block: add a BIO_MAX_SIZE constant and use it Currently the only constant for the maximum bio size is BIO_MAX_SECTORS, which is in units of 512-byte sectors, but a lot of user need a byte limit. Add a BIO_MAX_SIZE constant, redefine BIO_MAX_SECTORS in terms of it, and switch all bio-related uses of UINT_MAX for the maximum size to use the symbolic names instead. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Reviewed-by: Anuj Gupta Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 10 +++++----- block/blk-lib.c | 9 ++++----- block/blk-merge.c | 8 ++++---- include/linux/blk_types.h | 3 ++- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index 2359c0723b88..ac7703e149c6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -924,7 +924,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return true; return false; } @@ -1030,7 +1030,7 @@ int bio_add_page(struct bio *bio, struct page *page, { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (bio->bi_iter.bi_size > UINT_MAX - len) + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return 0; if (bio->bi_vcnt > 0) { @@ -1057,7 +1057,7 @@ void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, { unsigned long nr = off / PAGE_SIZE; - WARN_ON_ONCE(len > UINT_MAX); + WARN_ON_ONCE(len > BIO_MAX_SIZE); __bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE); } EXPORT_SYMBOL_GPL(bio_add_folio_nofail); @@ -1081,7 +1081,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, { unsigned long nr = off / PAGE_SIZE; - if (len > UINT_MAX) 
+ if (len > BIO_MAX_SIZE) return false; return bio_add_page(bio, folio_page(folio, nr), len, off % PAGE_SIZE) > 0; } @@ -1238,7 +1238,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) extraction_flags |= ITER_ALLOW_P2PDMA; size = iov_iter_extract_pages(iter, &pages, - UINT_MAX - bio->bi_iter.bi_size, + BIO_MAX_SIZE - bio->bi_iter.bi_size, nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; diff --git a/block/blk-lib.c b/block/blk-lib.c index 9e2cc58f881f..0be3acdc3eb5 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -32,7 +32,7 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) * Align the bio size to the discard granularity to make splitting the bio * at discard granularity boundaries easier in the driver if needed. */ - return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; + return round_down(BIO_MAX_SIZE, discard_granularity) >> SECTOR_SHIFT; } struct bio *blk_alloc_discard_bio(struct block_device *bdev, @@ -107,8 +107,7 @@ static sector_t bio_write_zeroes_limit(struct block_device *bdev) { sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - return min(bdev_write_zeroes_sectors(bdev), - (UINT_MAX >> SECTOR_SHIFT) & ~bs_mask); + return min(bdev_write_zeroes_sectors(bdev), BIO_MAX_SECTORS & ~bs_mask); } /* @@ -337,8 +336,8 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, int ret = 0; /* make sure that "len << SECTOR_SHIFT" doesn't overflow */ - if (max_sectors > UINT_MAX >> SECTOR_SHIFT) - max_sectors = UINT_MAX >> SECTOR_SHIFT; + if (max_sectors > BIO_MAX_SECTORS) + max_sectors = BIO_MAX_SECTORS; max_sectors &= ~bs_mask; if (max_sectors == 0) diff --git a/block/blk-merge.c b/block/blk-merge.c index b82c6d304658..0eb0aef97197 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -95,13 +95,13 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) } /* - * The max size one bio can 
handle is UINT_MAX becasue bvec_iter.bi_size - * is defined as 'unsigned int', meantime it has to be aligned to with the + * The maximum size that a bio can fit has to be aligned down to the * logical block size, which is the minimum accepted unit by hardware. */ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) { - return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; + return round_down(BIO_MAX_SIZE, lim->logical_block_size) >> + SECTOR_SHIFT; } /* @@ -502,7 +502,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq) rq_for_each_bvec(bv, rq, iter) bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes, - UINT_MAX, UINT_MAX); + UINT_MAX, BIO_MAX_SIZE); return nr_phys_segs; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 19a888a2f104..d59553324a84 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -281,7 +281,8 @@ struct bio { }; #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) -#define BIO_MAX_SECTORS (UINT_MAX >> SECTOR_SHIFT) +#define BIO_MAX_SIZE UINT_MAX /* max value of bi_iter.bi_size */ +#define BIO_MAX_SECTORS (BIO_MAX_SIZE >> SECTOR_SHIFT) static inline struct bio_vec *bio_inline_vecs(struct bio *bio) { From 4d77007d42fd4f44c2f5a1555603df53e16a1362 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:33 +0100 Subject: [PATCH 02/15] block: refactor get_contig_folio_len Move all of the logic to find the contigous length inside a folio into get_contig_folio_len instead of keeping some of it in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Reviewed-by: Martin K. 
Petersen Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio.c | 62 +++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/block/bio.c b/block/bio.c index ac7703e149c6..d633e80d821f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1172,33 +1172,35 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static unsigned int get_contig_folio_len(unsigned int *num_pages, - struct page **pages, unsigned int i, - struct folio *folio, size_t left, +static unsigned int get_contig_folio_len(struct page **pages, + unsigned int *num_pages, size_t left, size_t offset) { - size_t bytes = left; - size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes); - unsigned int j; + struct folio *folio = page_folio(pages[0]); + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left); + unsigned int max_pages, i; + size_t folio_offset, len; + + folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset; + len = min(folio_size(folio) - folio_offset, left); /* - * We might COW a single page in the middle of - * a large folio, so we have to check that all - * pages belong to the same folio. + * We might COW a single page in the middle of a large folio, so we have + * to check that all pages belong to the same folio. 
*/ - bytes -= contig_sz; - for (j = i + 1; j < i + *num_pages; j++) { - size_t next = min_t(size_t, PAGE_SIZE, bytes); + left -= contig_sz; + max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + for (i = 1; i < max_pages; i++) { + size_t next = min_t(size_t, PAGE_SIZE, left); - if (page_folio(pages[j]) != folio || - pages[j] != pages[j - 1] + 1) { + if (page_folio(pages[i]) != folio || + pages[i] != pages[i - 1] + 1) break; - } contig_sz += next; - bytes -= next; + left -= next; } - *num_pages = j - i; + *num_pages = i; return contig_sz; } @@ -1222,8 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size; - unsigned int num_pages, i = 0; - size_t offset, folio_offset, left, len; + unsigned int i = 0; + size_t offset, left, len; int ret = 0; /* @@ -1244,23 +1246,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return size ? size : -EFAULT; nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - for (left = size, i = 0; left > 0; left -= len, i += num_pages) { - struct page *page = pages[i]; - struct folio *folio = page_folio(page); + for (left = size; left > 0; left -= len) { unsigned int old_vcnt = bio->bi_vcnt; + unsigned int nr_to_add; - folio_offset = ((size_t)folio_page_idx(folio, page) << - PAGE_SHIFT) + offset; - - len = min(folio_size(folio) - folio_offset, left); - - num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - - if (num_pages > 1) - len = get_contig_folio_len(&num_pages, pages, i, - folio, left, offset); - - if (!bio_add_folio(bio, folio, len, folio_offset)) { + len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); + if (!bio_add_page(bio, pages[i], len, offset)) { WARN_ON_ONCE(1); ret = -EINVAL; goto out; @@ -1275,8 +1266,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * single pin per page. 
*/ if (offset && bio->bi_vcnt == old_vcnt) - unpin_user_folio(folio, 1); + unpin_user_folio(page_folio(pages[i]), 1); } + i += nr_to_add; offset = 0; } From 12da89e8844ae16e86b75a32b34a4f0b0525f453 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:34 +0100 Subject: [PATCH 03/15] block: open code bio_add_page and fix handling of mismatching P2P ranges bio_add_page fails to add data to the bio when mixing P2P with non-P2P ranges, or ranges that map to different P2P providers. In that case it will trigger that WARN_ON and return an error up the chain instead of simply starting a new bio as intended. Fix this by open coding bio_add_page and handling this case explicitly. While doing so, stop merging physical contiguous data that belongs to multiple folios. While this merge could lead to more efficient bio packing in some case, dropping will allow to remove handling of this corner case in other places and make the code more robust. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/bio.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/block/bio.c b/block/bio.c index d633e80d821f..4591f0ba90f5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1216,7 +1216,7 @@ static unsigned int get_contig_folio_len(struct page **pages, * For a multi-segment *iter, this function only adds pages from the next * non-empty segment of the iov iterator. 
*/ -static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; @@ -1226,7 +1226,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ssize_t size; unsigned int i = 0; size_t offset, left, len; - int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as @@ -1247,37 +1246,26 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); for (left = size; left > 0; left -= len) { - unsigned int old_vcnt = bio->bi_vcnt; unsigned int nr_to_add; - len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); - if (!bio_add_page(bio, pages[i], len, offset)) { - WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (!zone_device_pages_have_same_pgmap(prev->bv_page, + pages[i])) + break; } - if (bio_flagged(bio, BIO_PAGE_PINNED)) { - /* - * We're adding another fragment of a page that already - * was part of the last segment. Undo our pin as the - * page was pinned when an earlier fragment of it was - * added to the bio and __bio_release_pages expects a - * single pin per page. 
- */ - if (offset && bio->bi_vcnt == old_vcnt) - unpin_user_folio(page_folio(pages[i]), 1); - } + len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); + __bio_add_page(bio, pages[i], len, offset); i += nr_to_add; offset = 0; } iov_iter_revert(iter, left); -out: while (i < nr_pages) bio_release_page(bio, pages[i++]); - - return ret; + return size - left; } /* @@ -1337,7 +1325,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { - int ret = 0; + ssize_t ret; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; @@ -1350,9 +1338,10 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); + do { ret = __bio_iov_iter_get_pages(bio, iter); - } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + } while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0)); if (bio->bi_vcnt) return bio_iov_iter_align_down(bio, iter, len_align_mask); From 91b73c458182801a8c9cf6135335e064567d1013 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:35 +0100 Subject: [PATCH 04/15] iov_iter: extract a iov_iter_extract_bvecs helper from bio code Massage __bio_iov_iter_get_pages so that it doesn't need the bio, and move it to lib/iov_iter.c so that it can be used by block code for other things than filling a bio and by other subsystems like netfs. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bio.c | 120 +++++++------------------------------------- include/linux/uio.h | 3 ++ lib/iov_iter.c | 98 ++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 102 deletions(-) diff --git a/block/bio.c b/block/bio.c index 4591f0ba90f5..530082c8cf0c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1172,102 +1172,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static unsigned int get_contig_folio_len(struct page **pages, - unsigned int *num_pages, size_t left, - size_t offset) -{ - struct folio *folio = page_folio(pages[0]); - size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left); - unsigned int max_pages, i; - size_t folio_offset, len; - - folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset; - len = min(folio_size(folio) - folio_offset, left); - - /* - * We might COW a single page in the middle of a large folio, so we have - * to check that all pages belong to the same folio. - */ - left -= contig_sz; - max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - for (i = 1; i < max_pages; i++) { - size_t next = min_t(size_t, PAGE_SIZE, left); - - if (page_folio(pages[i]) != folio || - pages[i] != pages[i - 1] + 1) - break; - contig_sz += next; - left -= next; - } - - *num_pages = i; - return contig_sz; -} - -#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) - -/** - * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio - * @bio: bio to add pages to - * @iter: iov iterator describing the region to be mapped - * - * Extracts pages from *iter and appends them to @bio's bvec array. The pages - * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. - * For a multi-segment *iter, this function only adds pages from the next - * non-empty segment of the iov iterator. 
- */ -static ssize_t __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) -{ - iov_iter_extraction_t extraction_flags = 0; - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; - unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; - struct page **pages = (struct page **)bv; - ssize_t size; - unsigned int i = 0; - size_t offset, left, len; - - /* - * Move page array up in the allocated memory for the bio vecs as far as - * possible so that we can start filling biovecs from the beginning - * without overwriting the temporary page array. - */ - BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); - pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - - if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) - extraction_flags |= ITER_ALLOW_P2PDMA; - - size = iov_iter_extract_pages(iter, &pages, - BIO_MAX_SIZE - bio->bi_iter.bi_size, - nr_pages, extraction_flags, &offset); - if (unlikely(size <= 0)) - return size ? 
size : -EFAULT; - - nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - for (left = size; left > 0; left -= len) { - unsigned int nr_to_add; - - if (bio->bi_vcnt > 0) { - struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (!zone_device_pages_have_same_pgmap(prev->bv_page, - pages[i])) - break; - } - - len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); - __bio_add_page(bio, pages[i], len, offset); - i += nr_to_add; - offset = 0; - } - - iov_iter_revert(iter, left); - while (i < nr_pages) - bio_release_page(bio, pages[i++]); - return size - left; -} - /* * Aligns the bio size to the len_align_mask, releasing excessive bio vecs that * __bio_iov_iter_get_pages may have inserted, and reverts the trimmed length @@ -1325,7 +1229,7 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, unsigned len_align_mask) { - ssize_t ret; + iov_iter_extraction_t flags = 0; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EIO; @@ -1338,14 +1242,26 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + flags |= ITER_ALLOW_P2PDMA; do { - ret = __bio_iov_iter_get_pages(bio, iter); - } while (ret > 0 && iov_iter_count(iter) && !bio_full(bio, 0)); + ssize_t ret; - if (bio->bi_vcnt) - return bio_iov_iter_align_down(bio, iter, len_align_mask); - return ret; + ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec, + BIO_MAX_SIZE - bio->bi_iter.bi_size, + &bio->bi_vcnt, bio->bi_max_vecs, flags); + if (ret <= 0) { + if (!bio->bi_vcnt) + return ret; + break; + } + bio->bi_iter.bi_size += ret; + } while (iov_iter_count(iter) && !bio_full(bio, 0)); + + if (is_pci_p2pdma_page(bio->bi_io_vec->bv_page)) + bio->bi_opf |= REQ_NOMERGE; + return bio_iov_iter_align_down(bio, iter, len_align_mask); } static void 
submit_bio_wait_endio(struct bio *bio) diff --git a/include/linux/uio.h b/include/linux/uio.h index 5b127043a151..a9bc5b3067e3 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -389,6 +389,9 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); +ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv, + size_t max_size, unsigned short *nr_vecs, + unsigned short max_vecs, iov_iter_extraction_t extraction_flags); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 896760bad455..545250507f08 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages); + +static unsigned int get_contig_folio_len(struct page **pages, + unsigned int *num_pages, size_t left, size_t offset) +{ + struct folio *folio = page_folio(pages[0]); + size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, left); + unsigned int max_pages, i; + size_t folio_offset, len; + + folio_offset = PAGE_SIZE * folio_page_idx(folio, pages[0]) + offset; + len = min(folio_size(folio) - folio_offset, left); + + /* + * We might COW a single page in the middle of a large folio, so we have + * to check that all pages belong to the same folio. 
+ */ + left -= contig_sz; + max_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + for (i = 1; i < max_pages; i++) { + size_t next = min_t(size_t, PAGE_SIZE, left); + + if (page_folio(pages[i]) != folio || + pages[i] != pages[i - 1] + 1) + break; + contig_sz += next; + left -= next; + } + + *num_pages = i; + return contig_sz; +} + +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + +/** + * iov_iter_extract_bvecs - Extract bvecs from an iterator + * @iter: the iterator to extract from + * @bv: bvec return array + * @max_size: maximum size to extract from @iter + * @nr_vecs: number of vectors in @bv (on in and output) + * @max_vecs: maximum vectors in @bv, including those filled before calling + * @extraction_flags: flags to qualify request + * + * Like iov_iter_extract_pages(), but returns physically contiguous ranges + * contained in a single folio as a single bvec instead of multiple entries. + * + * Returns the number of bytes extracted when successful, or a negative errno. + * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes + * can be 0. + */ +ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv, + size_t max_size, unsigned short *nr_vecs, + unsigned short max_vecs, iov_iter_extraction_t extraction_flags) +{ + unsigned short entries_left = max_vecs - *nr_vecs; + unsigned short nr_pages, i = 0; + size_t left, offset, len; + struct page **pages; + ssize_t size; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages = (struct page **)(bv + *nr_vecs) + + entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_extract_pages(iter, &pages, max_size, entries_left, + extraction_flags, &offset); + if (unlikely(size <= 0)) + return size ? 
size : -EFAULT; + + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); + for (left = size; left > 0; left -= len) { + unsigned int nr_to_add; + + if (*nr_vecs > 0 && + !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page, + pages[i])) + break; + + len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset); + bvec_set_page(&bv[*nr_vecs], pages[i], len, offset); + i += nr_to_add; + (*nr_vecs)++; + offset = 0; + } + + iov_iter_revert(iter, left); + if (iov_iter_extract_will_pin(iter)) { + while (i < nr_pages) + unpin_user_page(pages[i++]); + } + return size - left; +} +EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs); From 301f5356521ed90f72a67797156d75093aac786f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:36 +0100 Subject: [PATCH 05/15] block: remove bio_release_page Merge bio_release_page into the only remaining caller. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bio.c | 4 +++- block/blk.h | 11 ----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/block/bio.c b/block/bio.c index 530082c8cf0c..285b573ae82f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1195,7 +1195,9 @@ static int bio_iov_iter_align_down(struct bio *bio, struct iov_iter *iter, break; } - bio_release_page(bio, bv->bv_page); + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(bv->bv_page); + bio->bi_vcnt--; nbytes -= bv->bv_len; } while (nbytes); diff --git a/block/blk.h b/block/blk.h index 980eef1f5690..886238cae5f1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -595,17 +595,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); - -/* - * Clean up a page appropriately, where the page may be pinned, may have a - * ref taken on it or neither. - */ -static inline void bio_release_page(struct bio *bio, struct page *page) -{ - if (bio_flagged(bio, BIO_PAGE_PINNED)) - unpin_user_page(page); -} - struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); From 8dd5e7c75d7bb2635c7efd219ff20693fc24096a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:37 +0100 Subject: [PATCH 06/15] block: add helpers to bounce buffer an iov_iter into bios Add helpers to implement bounce buffering of data into a bio to implement direct I/O for cases where direct user access is not possible because stable in-flight data is required. These are intended to be used as easily as bio_iov_iter_get_pages for the zero-copy path. The write side is trivial and just copies data into the bounce buffer. The read side is a lot more complex because it needs to perform the copy from the completion context, and without preserving the iov_iter through the call chain. 
It steals a trick from the integrity data user interface and uses the first vector in the bio for the bounce buffer data that is fed to the block I/O stack, and uses the others to record the user buffer fragments. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Tested-by: Anuj Gupta Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- block/bio.c | 179 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 26 +++++++ 2 files changed, 205 insertions(+) diff --git a/block/bio.c b/block/bio.c index 285b573ae82f..49f7548a31d6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1266,6 +1266,185 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, return bio_iov_iter_align_down(bio, iter, len_align_mask); } +static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size) +{ + struct folio *folio; + + while (*size > PAGE_SIZE) { + folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size)); + if (folio) + return folio; + *size = rounddown_pow_of_two(*size - 1); + } + + return folio_alloc(gfp, get_order(*size)); +} + +static void bio_free_folios(struct bio *bio) +{ + struct bio_vec *bv; + int i; + + bio_for_each_bvec_all(bv, bio, i) { + struct folio *folio = page_folio(bv->bv_page); + + if (!is_zero_folio(folio)) + folio_put(folio); + } +} + +static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) +{ + size_t total_len = iov_iter_count(iter); + + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EINVAL; + if (WARN_ON_ONCE(bio->bi_iter.bi_size)) + return -EINVAL; + if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs)) + return -EINVAL; + + do { + size_t this_len = min(total_len, SZ_1M); + struct folio *folio; + + if (this_len > PAGE_SIZE * 2) + this_len = rounddown_pow_of_two(this_len); + + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len) + break; + + folio = folio_alloc_greedy(GFP_KERNEL, &this_len); + if (!folio) 
+ break; + bio_add_folio_nofail(bio, folio, this_len, 0); + + if (copy_from_iter(folio_address(folio), this_len, iter) != + this_len) { + bio_free_folios(bio); + return -EFAULT; + } + + total_len -= this_len; + } while (total_len && bio->bi_vcnt < bio->bi_max_vecs); + + if (!bio->bi_iter.bi_size) + return -ENOMEM; + return 0; +} + +static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) +{ + size_t len = min(iov_iter_count(iter), SZ_1M); + struct folio *folio; + + folio = folio_alloc_greedy(GFP_KERNEL, &len); + if (!folio) + return -ENOMEM; + + do { + ssize_t ret; + + ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len, + &bio->bi_vcnt, bio->bi_max_vecs - 1, 0); + if (ret <= 0) { + if (!bio->bi_vcnt) + return ret; + break; + } + len -= ret; + bio->bi_iter.bi_size += ret; + } while (len && bio->bi_vcnt < bio->bi_max_vecs - 1); + + /* + * Set the folio directly here. The above loop has already calculated + * the correct bi_size, and we use bi_vcnt for the user buffers. That + * is safe as bi_vcnt is only used by the submitter and not the actual + * I/O path. + */ + bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0); + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); + return 0; +} + +/** + * bio_iov_iter_bounce - bounce buffer data from an iter into a bio + * @bio: bio to send + * @iter: iter to read from / write into + * + * Helper for direct I/O implementations that need to bounce buffer because + * we need to checksum the data or perform other operations that require + * consistency. Allocates folios to back the bounce buffer, and for writes + * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() + * called on completion. 
+ */ +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter) +{ + if (op_is_write(bio_op(bio))) + return bio_iov_iter_bounce_write(bio, iter); + return bio_iov_iter_bounce_read(bio, iter); +} + +static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) +{ + struct folio *folio = page_folio(bv->bv_page); + size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE - + bv->bv_offset / PAGE_SIZE + 1; + + if (mark_dirty) + folio_mark_dirty_lock(folio); + unpin_user_folio(folio, nr_pages); +} + +static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error, + bool mark_dirty) +{ + unsigned int len = bio->bi_io_vec[0].bv_len; + + if (likely(!is_error)) { + void *buf = bvec_virt(&bio->bi_io_vec[0]); + struct iov_iter to; + + iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt, + len); + /* copying to pinned pages should always work */ + WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len); + } else { + /* No need to mark folios dirty if never copied to them */ + mark_dirty = false; + } + + if (bio_flagged(bio, BIO_PAGE_PINNED)) { + int i; + + for (i = 0; i < bio->bi_vcnt; i++) + bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty); + } + + folio_put(page_folio(bio->bi_io_vec[0].bv_page)); +} + +/** + * bio_iov_iter_unbounce - finish a bounce buffer operation + * @bio: completed bio + * @is_error: %true if an I/O error occurred and data should not be copied + * @mark_dirty: If %true, folios will be marked dirty. + * + * Helper for direct I/O implementations that need to bounce buffer because + * we need to checksum the data or perform other operations that require + * consistency. Called to complete a bio set up by bio_iov_iter_bounce(). + * Copies data back for reads, and marks the original folios dirty if + * requested and then frees the bounce buffer. 
+ */ +void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty) +{ + if (op_is_write(bio_op(bio))) + bio_free_folios(bio); + else + bio_iov_iter_unbounce_read(bio, is_error, mark_dirty); +} + static void submit_bio_wait_endio(struct bio *bio) { complete(bio->bi_private); diff --git a/include/linux/bio.h b/include/linux/bio.h index d32aee2857a9..69d56b1d1bd2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -397,6 +397,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) return iov_iter_npages(iter, max_segs); } +/** + * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio + * @iter: iter to bounce from + * @op: REQ_OP_* for the bio + * + * Calculates how many bvecs are needed for the next bio to bounce from/to + * @iter. + */ +static inline unsigned short +bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op) +{ + /* + * We still need to bounce bvec iters, so don't special case them + * here unlike in bio_iov_vecs_to_alloc. + * + * For reads we need to use a vector for the bounce buffer, account + * for that here. 
+ */ + if (op_is_write(op)) + return iov_iter_npages(iter, BIO_MAX_VECS); + return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1; +} + struct request_queue; void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, @@ -450,6 +473,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter); +void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); + extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); From 4ad357e39b2ecd5da7bcc7e840ee24d179593cd5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:38 +0100 Subject: [PATCH 07/15] iomap: fix submission side handling of completion side errors The "if (dio->error)" in iomap_dio_bio_iter exists to stop submitting more bios when a completion already return an error. Commit cfe057f7db1f ("iomap_dio_actor(): fix iov_iter bugs") made it revert the iov by "copied", which is very wrong given that we've already consumed that range and submitted a bio for it. Fixes: cfe057f7db1f ("iomap_dio_actor(): fix iov_iter bugs") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. 
Wong Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4000c8596d9b..867c0ac6df8f 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -443,9 +443,13 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; - if (dio->error) { - iov_iter_revert(dio->submit.iter, copied); - copied = ret = 0; + + /* + * If completions already occurred and reported errors, give up now and + * don't bother submitting more bios. + */ + if (unlikely(data_race(dio->error))) { + ret = 0; goto out; } From 6e7a6c80198ead08b11aa6cdc92e60a42fc5895f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:39 +0100 Subject: [PATCH 08/15] iomap: simplify iomap_dio_bio_iter Use iov_iter_count to check if we need to continue as that just reads a field in the iov_iter, and only use bio_iov_vecs_to_alloc to calculate the actual number of vectors to allocate for the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 867c0ac6df8f..de03bc7cf4ed 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -312,7 +312,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; struct bio *bio; bool need_zeroout = false; - int nr_pages, ret = 0; + int ret = 0; u64 copied = 0; size_t orig_count; unsigned int alignment; @@ -440,7 +440,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) goto out; } - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; @@ -453,7 +452,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) goto out; } - bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf); + bio = iomap_dio_alloc_bio(iter, dio, + bio_iov_vecs_to_alloc(dio->submit.iter, + BIO_MAX_VECS), bio_opf); fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); @@ -495,16 +496,14 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) dio->size += n; copied += n; - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_VECS); /* * We can only poll for single bio I/Os. */ - if (nr_pages) + if (iov_iter_count(dio->submit.iter)) dio->iocb->ki_flags &= ~IOCB_HIPRI; iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; - } while (nr_pages); + } while (iov_iter_count(dio->submit.iter)); /* * We need to zeroout the tail of a sub-block write if the extent type From 2631c94602297090febd8f93d6f96d9d2045466d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:40 +0100 Subject: [PATCH 09/15] iomap: split out the per-bio logic from iomap_dio_bio_iter Factor out a separate helper that builds and submits a single bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 111 +++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 52 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index de03bc7cf4ed..bb79519dec65 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -302,6 +302,56 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, return 0; } +static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, + struct iomap_dio *dio, loff_t pos, unsigned int alignment, + blk_opf_t op) +{ + struct bio *bio; + ssize_t ret; + + bio = iomap_dio_alloc_bio(iter, dio, + bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS), + op); + fscrypt_set_bio_crypt_ctx(bio, iter->inode, + pos >> iter->inode->i_blkbits, GFP_KERNEL); + bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); + bio->bi_write_hint = iter->inode->i_write_hint; + bio->bi_ioprio = dio->iocb->ki_ioprio; + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); + if (unlikely(ret)) + goto out_put_bio; + ret = bio->bi_iter.bi_size; + + /* + * An atomic write bio must cover the complete length. If it doesn't, + * error out. + */ + if ((op & REQ_ATOMIC) && WARN_ON_ONCE(ret != iomap_length(iter))) { + ret = -EINVAL; + goto out_put_bio; + } + + if (dio->flags & IOMAP_DIO_WRITE) + task_io_account_write(ret); + else if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + + /* + * We can only poll for single bio I/Os. 
+ */ + if (iov_iter_count(dio->submit.iter)) + dio->iocb->ki_flags &= ~IOCB_HIPRI; + iomap_dio_submit_bio(iter, dio, bio, pos); + return ret; + +out_put_bio: + bio_put(bio); + return ret; +} + static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; @@ -310,12 +360,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) const loff_t length = iomap_length(iter); loff_t pos = iter->pos; blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; - struct bio *bio; bool need_zeroout = false; - int ret = 0; u64 copied = 0; size_t orig_count; unsigned int alignment; + ssize_t ret = 0; /* * File systems that write out of place and always allocate new blocks @@ -441,68 +490,27 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) } do { - size_t n; - /* * If completions already occurred and reported errors, give up now and * don't bother submitting more bios. */ - if (unlikely(data_race(dio->error))) { - ret = 0; + if (unlikely(data_race(dio->error))) goto out; - } - bio = iomap_dio_alloc_bio(iter, dio, - bio_iov_vecs_to_alloc(dio->submit.iter, - BIO_MAX_VECS), bio_opf); - fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, - GFP_KERNEL); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); - bio->bi_write_hint = inode->i_write_hint; - bio->bi_ioprio = dio->iocb->ki_ioprio; - bio->bi_private = dio; - bio->bi_end_io = iomap_dio_bio_end_io; - - ret = bio_iov_iter_get_pages(bio, dio->submit.iter, - alignment - 1); - if (unlikely(ret)) { + ret = iomap_dio_bio_iter_one(iter, dio, pos, alignment, bio_opf); + if (unlikely(ret < 0)) { /* * We have to stop part way through an IO. We must fall * through to the sub-block tail zeroing here, otherwise * this short IO may expose stale data in the tail of * the block we haven't written data to. 
*/ - bio_put(bio); - goto zero_tail; + break; } - - n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) { - /* - * An atomic write bio must cover the complete length, - * which it doesn't, so error. We may need to zero out - * the tail (complete FS block), similar to when - * bio_iov_iter_get_pages() returns an error, above. - */ - ret = -EINVAL; - bio_put(bio); - goto zero_tail; - } - if (dio->flags & IOMAP_DIO_WRITE) - task_io_account_write(n); - else if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); - - dio->size += n; - copied += n; - - /* - * We can only poll for single bio I/Os. - */ - if (iov_iter_count(dio->submit.iter)) - dio->iocb->ki_flags &= ~IOCB_HIPRI; - iomap_dio_submit_bio(iter, dio, bio, pos); - pos += n; + dio->size += ret; + copied += ret; + pos += ret; + ret = 0; } while (iov_iter_count(dio->submit.iter)); /* @@ -511,7 +519,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) * the block tail in the latter case, we can expose stale data via mmap * reads of the EOF block. */ -zero_tail: if (need_zeroout || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) { /* zero out from the end of the write to the end of the block */ From e2fcff5bb4c48bf602082e5a1428ff7328f7558f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:41 +0100 Subject: [PATCH 10/15] iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct Refactor the two per-bio completion handlers to share common code using a new helper. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. 
Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bb79519dec65..c1d5db85c8c7 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -211,16 +211,20 @@ static void iomap_dio_done(struct iomap_dio *dio) iomap_dio_complete_work(&dio->aio.work); } -void iomap_dio_bio_end_io(struct bio *bio) +static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - - if (atomic_dec_and_test(&dio->ref)) + if (atomic_dec_and_test(&dio->ref)) { + /* + * Avoid another context switch for the completion when already + * called from the ioend completion workqueue. + */ + if (inline_completion) + dio->flags &= ~IOMAP_DIO_COMP_WORK; iomap_dio_done(dio); + } if (should_dirty) { bio_check_pages_dirty(bio); @@ -229,33 +233,25 @@ void iomap_dio_bio_end_io(struct bio *bio) bio_put(bio); } } + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + __iomap_dio_bio_end_io(bio, false); +} EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) { struct iomap_dio *dio = ioend->io_bio.bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); u32 vec_count = ioend->io_bio.bi_vcnt; if (ioend->io_error) iomap_dio_set_error(dio, ioend->io_error); - - if (atomic_dec_and_test(&dio->ref)) { - /* - * Try to avoid another context switch for the completion given - * that we are already called from the ioend completion - * workqueue. 
- */ - dio->flags &= ~IOMAP_DIO_COMP_WORK; - iomap_dio_done(dio); - } - - if (should_dirty) { - bio_check_pages_dirty(&ioend->io_bio); - } else { - bio_release_pages(&ioend->io_bio, false); - bio_put(&ioend->io_bio); - } + __iomap_dio_bio_end_io(&ioend->io_bio, true); /* * Return the number of bvecs completed as even direct I/O completions From 45cec0de6c8973660da279e44b24d37af49daeb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:42 +0100 Subject: [PATCH 11/15] iomap: free the bio before completing the dio There are good arguments for processing the user completions ASAP vs. freeing resources ASAP, but freeing the bio first here removes potential use after free hazards when checking flags, and will simplify the upcoming bounce buffer support. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index c1d5db85c8c7..d4d52775ce25 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -214,7 +214,15 @@ static void iomap_dio_done(struct iomap_dio *dio) static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (dio->flags & IOMAP_DIO_DIRTY) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } + + /* Do not touch bio below, we just gave up our reference. 
*/ if (atomic_dec_and_test(&dio->ref)) { /* @@ -225,13 +233,6 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) dio->flags &= ~IOMAP_DIO_COMP_WORK; iomap_dio_done(dio); } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } } void iomap_dio_bio_end_io(struct bio *bio) From c96b8b220271024c04289d6d9779dc2ccbd12be2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:43 +0100 Subject: [PATCH 12/15] iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED Match the more descriptive iov_iter terminology instead of encoding what we do with them for reads only. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index d4d52775ce25..eca7adda595a 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -22,7 +22,7 @@ #define IOMAP_DIO_WRITE_THROUGH (1U << 28) #define IOMAP_DIO_NEED_SYNC (1U << 29) #define IOMAP_DIO_WRITE (1U << 30) -#define IOMAP_DIO_DIRTY (1U << 31) +#define IOMAP_DIO_USER_BACKED (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -215,7 +215,7 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; - if (dio->flags & IOMAP_DIO_DIRTY) { + if (dio->flags & IOMAP_DIO_USER_BACKED) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); @@ -333,7 +333,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(ret); - else if (dio->flags & IOMAP_DIO_DIRTY) + else if (dio->flags & IOMAP_DIO_USER_BACKED) bio_set_pages_dirty(bio); /* @@ -679,7 +679,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, goto out_free_dio; if (user_backed_iter(iter)) - dio->flags |= 
IOMAP_DIO_DIRTY; + dio->flags |= IOMAP_DIO_USER_BACKED; ret = kiocb_write_and_wait(iocb, iomi.len); if (ret) From d969bd72cf6835a4c915b326feb92c7597a46d98 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:44 +0100 Subject: [PATCH 13/15] iomap: support ioends for direct reads Support using the ioend structure to defer I/O completion for direct reads in addition to writes. This requires a check for the operation to not merge reads and writes in iomap_ioend_can_merge. This support will be used for bounce buffered direct I/O reads that need to copy data back to the user address space on read completion. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/ioend.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 86f44922ed3b..800d12f45438 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -299,6 +299,14 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { + /* + * There is no point in merging reads as there is no completion + * processing that can be easily batched up for them. + */ + if (bio_op(&ioend->io_bio) == REQ_OP_READ || + bio_op(&next->io_bio) == REQ_OP_READ) + return false; + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if (next->io_flags & IOMAP_IOEND_BOUNDARY) From c9d114846b380fec1093b7bca91ee5a8cd7b575d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:45 +0100 Subject: [PATCH 14/15] iomap: add a flag to bounce buffer direct I/O Add a new flag that requests bounce buffering for direct I/O. This is needed to provide the stable pages requirement requested by devices that need to calculate checksums or parity over the data and allows file systems to properly work with things like T10 protection information. 
The implementation just calls out to the new bio bounce buffering helpers to allocate a bounce buffer, which is used for I/O and to copy to/from it. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 30 ++++++++++++++++++++---------- include/linux/iomap.h | 9 +++++++++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index eca7adda595a..9c572de0d596 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -215,7 +215,11 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; - if (dio->flags & IOMAP_DIO_USER_BACKED) { + if (dio->flags & IOMAP_DIO_BOUNCE) { + bio_iov_iter_unbounce(bio, !!dio->error, + dio->flags & IOMAP_DIO_USER_BACKED); + bio_put(bio); + } else if (dio->flags & IOMAP_DIO_USER_BACKED) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); @@ -303,12 +307,16 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned int alignment, blk_opf_t op) { + unsigned int nr_vecs; struct bio *bio; ssize_t ret; - bio = iomap_dio_alloc_bio(iter, dio, - bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS), - op); + if (dio->flags & IOMAP_DIO_BOUNCE) + nr_vecs = bio_iov_bounce_nr_vecs(dio->submit.iter, op); + else + nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); + + bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op); fscrypt_set_bio_crypt_ctx(bio, iter->inode, pos >> iter->inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); @@ -317,7 +325,11 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); + if (dio->flags & IOMAP_DIO_BOUNCE) + ret = bio_iov_iter_bounce(bio, 
dio->submit.iter); + else + ret = bio_iov_iter_get_pages(bio, dio->submit.iter, + alignment - 1); if (unlikely(ret)) goto out_put_bio; ret = bio->bi_iter.bi_size; @@ -333,7 +345,8 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(ret); - else if (dio->flags & IOMAP_DIO_USER_BACKED) + else if ((dio->flags & IOMAP_DIO_USER_BACKED) && + !(dio->flags & IOMAP_DIO_BOUNCE)) bio_set_pages_dirty(bio); /* @@ -662,7 +675,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->i_size = i_size_read(inode); dio->dops = dops; dio->error = 0; - dio->flags = 0; + dio->flags = dio_flags & (IOMAP_DIO_FSBLOCK_ALIGNED | IOMAP_DIO_BOUNCE); dio->done_before = done_before; dio->submit.iter = iter; @@ -671,9 +684,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED) - dio->flags |= IOMAP_DIO_FSBLOCK_ALIGNED; - if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) goto out_free_dio; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 520e967cb501..cf152f638665 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -562,6 +562,15 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) +/* + * Bounce buffer instead of using zero copy access. + * + * This is needed if the device needs stable data to checksum or generate + * parity. The file system must hook into the I/O submission and offload + * completions to user context for reads when this is set. 
+ */ +#define IOMAP_DIO_BOUNCE (1 << 4) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); From 3373503df025ab6c9a8ad2ce6b7febd2eb3c99dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Jan 2026 06:53:46 +0100 Subject: [PATCH 15/15] xfs: use bounce buffering direct I/O when the device requires stable pages Fix direct I/O on devices that require stable pages by asking iomap to bounce buffer. To support this, ioends are used for direct reads in this case to provide a user context for copying data back from the bounce buffer. This fixes qemu when used on devices using T10 protection information and probably other cases like iSCSI using data digests. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- fs/xfs/xfs_aops.c | 8 ++++++-- fs/xfs/xfs_file.c | 41 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 56a544638491..c3c1e149fff4 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -103,7 +103,7 @@ xfs_ioend_put_open_zones( * IO write completion. 
*/ STATIC void -xfs_end_ioend( +xfs_end_ioend_write( struct iomap_ioend *ioend) { struct xfs_inode *ip = XFS_I(ioend->io_inode); @@ -202,7 +202,11 @@ xfs_end_io( io_list))) { list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); - xfs_end_ioend(ioend); + if (bio_op(&ioend->io_bio) == REQ_OP_READ) + iomap_finish_ioends(ioend, + blk_status_to_errno(ioend->io_bio.bi_status)); + else + xfs_end_ioend_write(ioend); cond_resched(); } } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7874cf745af3..f6cc63dcf961 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -224,12 +224,34 @@ xfs_ilock_iocb_for_write( return 0; } +/* + * Bounce buffering dio reads need a user context to copy back the data. + * Use an ioend to provide that. + */ +static void +xfs_dio_read_bounce_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + iomap_init_ioend(iter->inode, bio, file_offset, IOMAP_IOEND_DIRECT); + bio->bi_end_io = xfs_end_bio; + submit_bio(bio); +} + +static const struct iomap_dio_ops xfs_dio_read_bounce_ops = { + .submit_io = xfs_dio_read_bounce_submit_io, + .bio_set = &iomap_ioend_bioset, +}; + STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + unsigned int dio_flags = 0; + const struct iomap_dio_ops *dio_ops = NULL; ssize_t ret; trace_xfs_file_direct_read(iocb, to); @@ -242,7 +264,12 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0); + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) { + dio_ops = &xfs_dio_read_bounce_ops; + dio_flags |= IOMAP_DIO_BOUNCE; + } + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, dio_ops, dio_flags, + NULL, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -703,6 +730,8 @@ xfs_file_dio_write_aligned( xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } + if 
(mapping_stable_writes(iocb->ki_filp->f_mapping)) + dio_flags |= IOMAP_DIO_BOUNCE; trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: @@ -750,6 +779,7 @@ xfs_file_dio_write_atomic( { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret, ocount = iov_iter_count(from); + unsigned int dio_flags = 0; const struct iomap_ops *dops; /* @@ -777,8 +807,10 @@ retry: } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, - 0, NULL, 0); + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) + dio_flags |= IOMAP_DIO_BOUNCE; + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, dio_flags, + NULL, 0); /* * The retry mechanism is based on the ->iomap_begin method returning @@ -867,6 +899,9 @@ retry_exclusive: if (flags & IOMAP_DIO_FORCE_WAIT) inode_dio_wait(VFS_I(ip)); + if (mapping_stable_writes(iocb->ki_filp->f_mapping)) + flags |= IOMAP_DIO_BOUNCE; + trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, flags, NULL, 0);