for-7.0/block-stable-pages-20260206

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmmGPZwQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpjy5EAC8z4IFCz+ua+q3hqJIlGfTlkxR6kM+DMn/
 WKqaFYjnwzwApYe7kgBtlVcINnX5riCdNEk70tG1SCkAHqqdnzF4Ps1kQz0RflXS
 7DftN76hSTUbEfolQWTzqDAGMrcn7GUjjjwaRKjSVF30UBKjZ6U4fKfyzWChEwah
 UtnmLMd3Osl58C9RTcjQPN1qMeQagmLej9C8plyCu9iLauoLA8XlkjxWvXRCcYwc
 L+IY9F0s1rxmjGZ3eeaevs7V59RjOwJZvL4EPICajkx3oE7EAxS3VVt0p9LC3tPD
 F4U6SXL0UkIeinduKlbEGP17N6l/4a4Twetyu6rSu//APzKIPAOPeD2xqIbrNSlI
 rxHqKCsI8KW5JfNTvo9+JjiDOeDxRwt19ZCVCFUzXcsNfRq0EljtuY/4V5P1tPr9
 0rOe5SdYS94AncwrabeV/ZOLEGmujjY9YhsCcP3J49LDkFG+T3fBgCpmFWwlWLs7
 92MUHVcQmvb+j0z/fZVWRsqzhqtHBG4SO4yg2+Q0RQZeWnsVNTOR5cWfUEShI9G+
 hnfYLdyyBTy37n60WXJOq2VhiWbPDAetEjKr+ulbD9hvpPdh6QL7rFiWZsVlnc7V
 wUQoUjNltfHlyPI/YSwqa9YyyLPAl6YGKba2/qBKSwFTQmFLpSynJIa87W6jUx6B
 sofywm9ZZw==
 =faTj
 -----END PGP SIGNATURE-----

Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull bounce buffer dio for stable pages from Jens Axboe:
 "This adds support for bounce buffering of dio for stable pages. This
  was all done by Christoph. In his words:

  This series tries to address the problem that under I/O pages can be
  modified during direct I/O, even when the device or file system
  require stable pages during I/O to calculate checksums, parity or data
  operations. It does so by adding block layer helpers to bounce buffer
  an iov_iter into a bio, then wires that up in iomap and ultimately
  XFS.

  The reason that the file system even needs to know about it is that
  reads need a user context to copy the data back, and the
  infrastructure to defer ioends to a workqueue currently sits in XFS.
  I'm going to look into moving that into ioend and enabling it for
  other file systems. Additionally btrfs already has its own
  infrastructure for this, and actually an urgent need to bounce buffer,
  so this should be useful there and could be wired up easily. In fact
  the idea comes from patches by Qu that did this in btrfs.

  This patch fixes all but one of the xfstests failures on T10 PI
  capable devices (generic/095 seems to have issues with a mix of mmap
  and splice still, I'm looking into that separately), and makes qemu
  VMs running Windows, or Linux with swap enabled, work fine on an XFS
  file on a device using PI.

  Performance numbers on my (not exactly state of the art) NVMe PI test
  setup:

      Sequential reads using io_uring, QD=16.
      Bandwidth and CPU usage (usr/sys):

      | size |        zero copy         |          bounce          |
      +------+--------------------------+--------------------------+
      |   4k | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
      |  64K | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
      |   1M | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
      +------+--------------------------+--------------------------+

      Sequential writes using io_uring, QD=16.
      Bandwidth and CPU usage (usr/sys):

      | size |        zero copy         |          bounce          |
      +------+--------------------------+--------------------------+
      |   4k |  882MiB/s (11.83/33.88%) |  750MiB/s (10.53/34.08%) |
      |  64K | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
      |   1M | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
      +------+--------------------------+--------------------------+

  Note that the 64k read numbers look really odd to me for the baseline
  zero copy case, but are reproducible over many repeated runs.

  The bounce read numbers should further improve when moving the PI
  validation to the file system and removing the double context switch,
  which I have patches for that will be sent out soon"

* tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
  xfs: use bounce buffering direct I/O when the device requires stable pages
  iomap: add a flag to bounce buffer direct I/O
  iomap: support ioends for direct reads
  iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
  iomap: free the bio before completing the dio
  iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
  iomap: split out the per-bio logic from iomap_dio_bio_iter
  iomap: simplify iomap_dio_bio_iter
  iomap: fix submission side handling of completion side errors
  block: add helpers to bounce buffer an iov_iter into bios
  block: remove bio_release_page
  iov_iter: extract a iov_iter_extract_bvecs helper from bio code
  block: open code bio_add_page and fix handling of mismatching P2P ranges
  block: refactor get_contig_folio_len
  block: add a BIO_MAX_SIZE constant and use it
This commit is contained in:
Linus Torvalds 2026-02-09 18:14:52 -08:00
commit 4adc13ed7c
13 changed files with 508 additions and 241 deletions

View file

@ -1845,3 +1845,101 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i,
return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
/*
 * Measure the physically contiguous run at the front of @pages that stays
 * within a single folio.  Returns the byte length of that run (capped at
 * @left, starting @offset bytes into the first page) and stores the number
 * of page pointers it spans in @num_pages.
 */
static unsigned int get_contig_folio_len(struct page **pages,
		unsigned int *num_pages, size_t left, size_t offset)
{
	struct folio *first_folio = page_folio(pages[0]);
	size_t bytes = min_t(size_t, PAGE_SIZE - offset, left);
	size_t off_in_folio, run_len;
	unsigned int pg_limit, idx;

	/* How far could this run extend if the whole folio is usable? */
	off_in_folio = PAGE_SIZE * folio_page_idx(first_folio, pages[0]) + offset;
	run_len = min(folio_size(first_folio) - off_in_folio, left);

	/*
	 * We might COW a single page in the middle of a large folio, so we have
	 * to check that all pages belong to the same folio.
	 */
	left -= bytes;
	pg_limit = DIV_ROUND_UP(offset + run_len, PAGE_SIZE);
	for (idx = 1; idx < pg_limit; idx++) {
		if (page_folio(pages[idx]) != first_folio ||
		    pages[idx] != pages[idx - 1] + 1)
			break;
		bytes += min_t(size_t, PAGE_SIZE, left);
		left -= min_t(size_t, PAGE_SIZE, left);
	}

	*num_pages = idx;
	return bytes;
}
/* How many page pointers fit in the space of one bio_vec. */
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))

/**
 * iov_iter_extract_bvecs - Extract bvecs from an iterator
 * @iter: the iterator to extract from
 * @bv: bvec return array
 * @max_size: maximum size to extract from @iter
 * @nr_vecs: number of vectors in @bv (on input and output)
 * @max_vecs: maximum vectors in @bv, including those filled before calling
 * @extraction_flags: flags to qualify request
 *
 * Like iov_iter_extract_pages(), but returns physically contiguous ranges
 * contained in a single folio as a single bvec instead of multiple entries.
 *
 * Returns the number of bytes extracted when successful, or a negative errno.
 * If @nr_vecs was non-zero on entry, the number of successfully extracted bytes
 * can be 0.
 */
ssize_t iov_iter_extract_bvecs(struct iov_iter *iter, struct bio_vec *bv,
		size_t max_size, unsigned short *nr_vecs,
		unsigned short max_vecs, iov_iter_extraction_t extraction_flags)
{
	unsigned short entries_left = max_vecs - *nr_vecs;
	unsigned short nr_pages, i = 0;
	size_t left, offset, len;
	struct page **pages;
	ssize_t size;

	/*
	 * Move page array up in the allocated memory for the bio vecs as far as
	 * possible so that we can start filling biovecs from the beginning
	 * without overwriting the temporary page array.
	 */
	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
	pages = (struct page **)(bv + *nr_vecs) +
		entries_left * (PAGE_PTRS_PER_BVEC - 1);

	size = iov_iter_extract_pages(iter, &pages, max_size, entries_left,
			extraction_flags, &offset);
	if (unlikely(size <= 0))
		return size ? size : -EFAULT;

	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);

	/* Fold the extracted pages into bvecs, one contiguous folio run each. */
	for (left = size; left > 0; left -= len) {
		unsigned int nr_to_add;

		/*
		 * Stop early rather than place pages from a different device
		 * pgmap next to the previous bvec's page; the leftover bytes
		 * are reverted below.
		 */
		if (*nr_vecs > 0 &&
		    !zone_device_pages_have_same_pgmap(bv[*nr_vecs - 1].bv_page,
						       pages[i]))
			break;
		len = get_contig_folio_len(&pages[i], &nr_to_add, left, offset);
		bvec_set_page(&bv[*nr_vecs], pages[i], len, offset);
		i += nr_to_add;
		(*nr_vecs)++;
		/* only the first bvec starts mid-page */
		offset = 0;
	}

	/* Hand back what we could not fit into @bv ... */
	iov_iter_revert(iter, left);
	/* ... and drop the pins on the pages we are not returning. */
	if (iov_iter_extract_will_pin(iter)) {
		while (i < nr_pages)
			unpin_user_page(pages[i++]);
	}
	return size - left;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_bvecs);