Merge tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull bounce buffer dio for stable pages from Jens Axboe:
"This adds support for bounce buffering of dio for stable pages. This
was all done by Christoph. In his words:
This series tries to address the problem that pages under I/O can be
modified during direct I/O, even when the device or file system
requires stable pages during I/O to calculate checksums or parity, or
to perform other data operations. It does so by adding block layer helpers to bounce buffer
an iov_iter into a bio, then wires that up in iomap and ultimately
XFS.
The reason that the file system even needs to know about it is
because reads need a user context to copy the data back, and the
infrastructure to defer ioends to a workqueue currently sits in XFS.
I'm going to look into moving that into the ioend code and enabling it
for other file systems. Additionally, btrfs already has its own
infrastructure for this, and actually an urgent need to bounce buffer,
so this should be useful there and could be wired up easily. In fact
the idea comes from patches by Qu that did this in btrfs.
This series fixes all but one xfstests failure on T10 PI capable
devices (generic/095 still seems to have issues with a mix of mmap and
splice; I'm looking into that separately), and makes qemu VMs running
Windows, or Linux with swap enabled, work fine on an XFS file on a
device using PI.
Performance numbers on my (not exactly state of the art) NVMe PI test
setup:
Sequential reads using io_uring, QD=16.
Bandwidth and CPU usage (usr/sys):
| size | zero copy | bounce |
+------+--------------------------+--------------------------+
| 4k | 1316MiB/s (12.65/55.40%) | 1081MiB/s (11.76/49.78%) |
| 64K | 3370MiB/s ( 5.46/18.20%) | 3365MiB/s ( 4.47/15.68%) |
| 1M | 3401MiB/s ( 0.76/23.05%) | 3400MiB/s ( 0.80/09.06%) |
+------+--------------------------+--------------------------+
Sequential writes using io_uring, QD=16.
Bandwidth and CPU usage (usr/sys):
| size | zero copy | bounce |
+------+--------------------------+--------------------------+
| 4k | 882MiB/s (11.83/33.88%) | 750MiB/s (10.53/34.08%) |
| 64K | 2009MiB/s ( 7.33/15.80%) | 2007MiB/s ( 7.47/24.71%) |
| 1M | 1992MiB/s ( 7.26/ 9.13%) | 1992MiB/s ( 9.21/19.11%) |
+------+--------------------------+--------------------------+
Note that the 64k read numbers look really odd to me for the baseline
zero copy case, but are reproducible over many repeated runs.
The bounce read numbers should further improve when moving the PI
validation to the file system and removing the double context switch,
which I have patches for that will be sent out soon"
* tag 'for-7.0/block-stable-pages-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
xfs: use bounce buffering direct I/O when the device requires stable pages
iomap: add a flag to bounce buffer direct I/O
iomap: support ioends for direct reads
iomap: rename IOMAP_DIO_DIRTY to IOMAP_DIO_USER_BACKED
iomap: free the bio before completing the dio
iomap: share code between iomap_dio_bio_end_io and iomap_finish_ioend_direct
iomap: split out the per-bio logic from iomap_dio_bio_iter
iomap: simplify iomap_dio_bio_iter
iomap: fix submission side handling of completion side errors
block: add helpers to bounce buffer an iov_iter into bios
block: remove bio_release_page
iov_iter: extract a iov_iter_extract_bvecs helper from bio code
block: open code bio_add_page and fix handling of mismatching P2P ranges
block: refactor get_contig_folio_len
block: add a BIO_MAX_SIZE constant and use it
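As a rough illustration of the write-side bounce buffering idea described above, the following is a minimal, hypothetical sketch (not the actual helpers added by this merge): copy the payload of an iov_iter into freshly allocated kernel pages and attach those to the bio, so the data the device reads for PI/checksum calculation can no longer change while under I/O. The helper name is made up; error unwinding of pages already added to the bio, freeing the pages on completion, and the read-side copy back to user space are all omitted.

#include <linux/bio.h>
#include <linux/gfp.h>
#include <linux/minmax.h>
#include <linux/uio.h>

/*
 * Hypothetical sketch only: take a stable copy of the iov_iter payload
 * into newly allocated pages before submission instead of pinning the
 * user pages (zero copy).
 */
static int bounce_iter_into_bio(struct bio *bio, struct iov_iter *iter,
		size_t len)
{
	while (len) {
		size_t this_len = min_t(size_t, len, PAGE_SIZE);
		struct page *page = alloc_page(GFP_KERNEL);

		if (!page)
			return -ENOMEM;
		/* Copy the user data now, so later changes can't be seen. */
		if (copy_page_from_iter(page, 0, this_len, iter) != this_len) {
			__free_page(page);
			return -EFAULT;
		}
		if (bio_add_page(bio, page, this_len, 0) != this_len) {
			__free_page(page);
			return -EINVAL;
		}
		len -= this_len;
	}
	return 0;
}

The actual series drives this from iomap rather than open coding it per caller, and for reads it defers the ioend to a workqueue so the copy back to the user buffer can happen in process context, as described in the message above.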
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2025 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"

struct xfs_writepage_ctx {
	struct iomap_writepage_ctx ctx;
	unsigned int		data_seq;
	unsigned int		cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_fsize_t		isize;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_disk_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

static void
xfs_ioend_put_open_zones(
	struct iomap_ioend	*ioend)
{
	struct iomap_ioend	*tmp;

	/*
	 * Put the open zone for all ioends merged into this one (if any).
	 */
	list_for_each_entry(tmp, &ioend->io_list, io_list)
		xfs_open_zone_put(tmp->io_private);

	/*
	 * The main ioend might not have an open zone if the submission failed
	 * before xfs_zone_alloc_and_submit got called.
	 */
	if (ioend->io_private)
		xfs_open_zone_put(ioend->io_private);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend_write(
	struct iomap_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	bool			is_zoned = xfs_is_zoned_inode(ip);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	unsigned int		nofs_flag;
	int			error;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (xfs_is_shutdown(mp)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up all COW blocks and underlying data fork delalloc blocks on
	 * I/O error. The delalloc punch is required because this ioend was
	 * mapped to blocks in the COW fork and the associated pages are no
	 * longer dirty. If we don't remove delalloc blocks here, they become
	 * stale and can corrupt free space accounting on unmount.
	 */
	error = blk_status_to_errno(ioend->io_bio.bi_status);
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_IOEND_SHARED) {
			ASSERT(!is_zoned);
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
					offset + size, NULL);
		}
		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	if (is_zoned)
		error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
				ioend->io_private, NULLFSBLOCK);
	else if (ioend->io_flags & IOMAP_IOEND_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
	else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);

	if (!error &&
	    !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
	    xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, offset, size);
done:
	if (is_zoned)
		xfs_ioend_put_open_zones(ioend);
	iomap_finish_ioends(ioend, error);
	memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physical and logically contiguous ioends before completion to
 * minimise the number of transactions we need to perform during IO completion.
 * Both unwritten extent conversion and COW remapping need to iterate and modify
 * one physical extent at a time, so we gain nothing by merging physically
 * discontiguous extents here.
 *
 * The ioend chain length that we can be processing here is largely unbound in
 * length and we may have to perform significant amounts of work on each ioend
 * to complete it. Hence we have to be careful about holding the CPU for too
 * long in this loop.
 */
void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_inode	*ip =
		container_of(work, struct xfs_inode, i_ioend_work);
	struct iomap_ioend	*ioend;
	struct list_head	tmp;
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	list_replace_init(&ip->i_ioend_list, &tmp);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

	iomap_sort_ioends(&tmp);
	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
			io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, &tmp);
		if (bio_op(&ioend->io_bio) == REQ_OP_READ)
			iomap_finish_ioends(ioend,
				blk_status_to_errno(ioend->io_bio.bi_status));
		else
			xfs_end_ioend_write(ioend);
		cond_resched();
	}
}

void
xfs_end_bio(
	struct bio		*bio)
{
	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	unsigned long		flags;

	/*
	 * For Appends record the actually written block number and set the
	 * boundary flag if needed.
	 */
	if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
		ioend->io_sector = bio->bi_iter.bi_sector;
		xfs_mark_rtg_boundary(ioend);
	}

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
				&ip->i_ioend_work));
	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * We cannot cancel the ioend directly on error. We may have already set other
 * pages under writeback and hence we have to run I/O completion to mark the
 * error state of the pages under writeback appropriately.
 *
 * If the folio has delalloc blocks on it, the caller is asking us to punch them
 * out. If we don't, we can leave a stale delalloc mapping covered by a clean
 * page that needs to be dirtied again before the delalloc mapping can be
 * converted. This stale delalloc mapping can trip up a later direct I/O read
 * operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the folio. Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why
 * we see a ENOSPC in writeback).
 */
static void
xfs_discard_folio(
	struct folio		*folio,
	loff_t			pos)
{
	struct xfs_inode	*ip = XFS_I(folio->mapping->host);
	struct xfs_mount	*mp = ip->i_mount;

	if (xfs_is_shutdown(mp))
		return;

	xfs_alert_ratelimited(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
			folio, ip->i_ino, pos);

	/*
	 * The end of the punch range is always the offset of the first
	 * byte of the next folio. Hence the end offset is only dependent on the
	 * folio itself and not the start offset that is passed in.
	 */
	xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
			folio_next_pos(folio), NULL);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the current
 * mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
	struct iomap_writepage_ctx	*wpc,
	struct xfs_inode		*ip,
	loff_t				offset)
{
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length)
		return false;
	/*
	 * If this is a COW mapping, it is sufficient to check that the mapping
	 * covers the offset. Be careful to check this first because the caller
	 * can revalidate a COW mapping without updating the data seqno.
	 */
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		return true;

	/*
	 * This is not a COW mapping. Check the sequence number of the data fork
	 * because concurrent changes could have invalidated the extent. Check
	 * the COW fork because concurrent changes since the last time we
	 * checked (and found nothing at this offset) could have added
	 * overlapping blocks.
	 */
	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
		trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
		return false;
	}
	if (xfs_inode_has_cow_data(ip) &&
	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
		trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
				XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
		return false;
	}
	return true;
}

static int
xfs_map_blocks(
	struct iomap_writepage_ctx *wpc,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(wpc->inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(wpc->inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_fileoff_t		cow_fsb;
	int			whichfork;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;
	int			retries = 0;
	int			error = 0;
	unsigned int		*seq;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared. COW I/O always takes precedent, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK because
	 * we've indirectly protected against concurrent updates: writeback has
	 * the page locked, which prevents concurrent invalidations by reflink
	 * and directio and prevents concurrent buffered writes to the same
	 * page. Changes to if_seq always happen under i_lock, which protects
	 * against concurrent updates and provides a memory barrier on the way
	 * out that ensures that we always see the current value.
	 */
	if (xfs_imap_valid(wpc, ip, offset))
		return 0;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents. If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
retry:
	cow_fsb = NULLFILEOFF;
	whichfork = XFS_DATA_FORK;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(!xfs_need_iread_extents(&ip->i_df));

	/*
	 * Check if this is offset is covered by a COW extents, and if yes use
	 * it directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);

		whichfork = XFS_COW_FORK;
		goto allocate_blocks;
	}

	/*
	 * No COW extent overlap. Revalidate now that we may have updated
	 * ->cow_seq. If the data mapping is still valid, we're done.
	 */
	if (xfs_imap_valid(wpc, ip, offset)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb; /* fake a hole past EOF */
	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/* landed in a hole or beyond EOF? */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
	}

	/*
	 * Truncate to the next COW extent if there is one. This is the only
	 * opportunity to do this because we can skip COW fork lookups for the
	 * subsequent blocks in the mapping; however, the requirement to treat
	 * the COW range separately remains.
	 */
	if (cow_fsb != NULLFILEOFF &&
	    cow_fsb < imap.br_startoff + imap.br_blockcount)
		imap.br_blockcount = cow_fsb - imap.br_startoff;

	/* got a delalloc extent? */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    isnullstartblock(imap.br_startblock))
		goto allocate_blocks;

	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
	return 0;
allocate_blocks:
	/*
	 * Convert a dellalloc extent to a real one. The current page is held
	 * locked so nothing could have removed the block backing offset_fsb,
	 * although it could have moved from the COW to the data fork by another
	 * thread.
	 */
	if (whichfork == XFS_COW_FORK)
		seq = &XFS_WPC(wpc)->cow_seq;
	else
		seq = &XFS_WPC(wpc)->data_seq;

	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
			&wpc->iomap, seq);
	if (error) {
		/*
		 * If we failed to find the extent in the COW fork we might have
		 * raced with a COW to data fork conversion or truncate.
		 * Restart the lookup to catch the extent in the data fork for
		 * the former case, but prevent additional retries to avoid
		 * looping forever for the latter case.
		 */
		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
			goto retry;
		ASSERT(error != -EAGAIN);
		return error;
	}

	/*
	 * Due to merging the return real extent might be larger than the
	 * original delalloc one. Trim the return extent to the next COW
	 * boundary again to force a re-lookup.
	 */
	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
			wpc->iomap.length = cow_offset - wpc->iomap.offset;
	}

	ASSERT(wpc->iomap.offset <= offset);
	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
	return 0;
}

static ssize_t
xfs_writeback_range(
	struct iomap_writepage_ctx *wpc,
	struct folio		*folio,
	u64			offset,
	unsigned int		len,
	u64			end_pos)
{
	ssize_t			ret;

	ret = xfs_map_blocks(wpc, offset, len);
	if (!ret)
		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
	if (ret < 0)
		xfs_discard_folio(folio, offset);
	return ret;
}

static bool
xfs_ioend_needs_wq_completion(
	struct iomap_ioend	*ioend)
{
	/* Changing inode size requires a transaction. */
	if (xfs_ioend_is_append(ioend))
		return true;

	/* Extent manipulation requires a transaction. */
	if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
		return true;

	/* Page cache invalidation cannot be done in irq context. */
	if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
		return true;

	return false;
}

static int
xfs_writeback_submit(
	struct iomap_writepage_ctx	*wpc,
	int				error)
{
	struct iomap_ioend		*ioend = wpc->wb_ctx;

	/*
	 * Convert CoW extents to regular.
	 *
	 * We can allocate memory here while doing writeback on behalf of memory
	 * reclaim. To avoid memory allocation deadlocks, set the task-wide
	 * nofs context.
	 */
	if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
		unsigned int nofs_flag;

		nofs_flag = memalloc_nofs_save();
		error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/*
	 * Send ioends that might require a transaction to the completion wq.
	 */
	if (xfs_ioend_needs_wq_completion(ioend))
		ioend->io_bio.bi_end_io = xfs_end_bio;

	return iomap_ioend_writeback_submit(wpc, error);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
	.writeback_range = xfs_writeback_range,
	.writeback_submit = xfs_writeback_submit,
};

struct xfs_zoned_writepage_ctx {
	struct iomap_writepage_ctx ctx;
	struct xfs_open_zone	*open_zone;
};

static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}

static int
xfs_zoned_map_blocks(
	struct iomap_writepage_ctx *wpc,
	loff_t			offset,
	unsigned int		len)
{
	struct xfs_inode	*ip = XFS_I(wpc->inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + len);
	xfs_filblks_t		count_fsb;
	struct xfs_bmbt_irec	imap, del;
	struct xfs_iext_cursor	icur;

	if (xfs_is_shutdown(mp))
		return -EIO;

	XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

	/*
	 * All dirty data must be covered by delalloc extents. But truncate can
	 * remove delalloc extents underneath us or reduce their size.
	 * Returning a hole tells iomap to not write back any data from this
	 * range, which is the right thing to do in that case.
	 *
	 * Otherwise just tell iomap to treat ranges previously covered by a
	 * delalloc extent as mapped. The actual block allocation will be done
	 * just before submitting the bio.
	 *
	 * This implies we never map outside folios that are locked or marked
	 * as under writeback, and thus there is no need check the fork sequence
	 * count here.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb; /* fake a hole past EOF */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
		return 0;
	}
	end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
	count_fsb = end_fsb - offset_fsb;

	del = imap;
	xfs_trim_extent(&del, offset_fsb, count_fsb);
	xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
			XFS_BMAPI_REMAP);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	wpc->iomap.type = IOMAP_MAPPED;
	wpc->iomap.flags = IOMAP_F_DIRTY;
	wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
	wpc->iomap.offset = offset;
	wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
	wpc->iomap.flags = IOMAP_F_ANON_WRITE;

	trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
	return 0;
}

static ssize_t
xfs_zoned_writeback_range(
	struct iomap_writepage_ctx *wpc,
	struct folio		*folio,
	u64			offset,
	unsigned int		len,
	u64			end_pos)
{
	ssize_t			ret;

	ret = xfs_zoned_map_blocks(wpc, offset, len);
	if (!ret)
		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
	if (ret < 0)
		xfs_discard_folio(folio, offset);
	return ret;
}

static int
xfs_zoned_writeback_submit(
	struct iomap_writepage_ctx	*wpc,
	int				error)
{
	struct iomap_ioend		*ioend = wpc->wb_ctx;

	ioend->io_bio.bi_end_io = xfs_end_bio;
	if (error) {
		ioend->io_bio.bi_status = errno_to_blk_status(error);
		bio_endio(&ioend->io_bio);
		return error;
	}
	xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
	return 0;
}

static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
	.writeback_range = xfs_zoned_writeback_range,
	.writeback_submit = xfs_zoned_writeback_submit,
};

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	if (xfs_is_zoned_inode(ip)) {
		struct xfs_zoned_writepage_ctx	xc = {
			.ctx = {
				.inode = mapping->host,
				.wbc = wbc,
				.ops = &xfs_zoned_writeback_ops
			},
		};
		int			error;

		error = iomap_writepages(&xc.ctx);
		if (xc.open_zone)
			xfs_open_zone_put(xc.open_zone);
		return error;
	} else {
		struct xfs_writepage_ctx	wpc = {
			.ctx = {
				.inode = mapping->host,
				.wbc = wbc,
				.ops = &xfs_writeback_ops
			},
		};

		return iomap_writepages(&wpc.ctx);
	}
}

STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O. We really can't allow
	 * that on reflinks inodes, so we have to skip out here. And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_read_folio(
	struct file		*unused,
	struct folio		*folio)
{
	iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
	return 0;
}

STATIC void
xfs_vm_readahead(
	struct readahead_control	*rac)
{
	iomap_bio_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_vm_swap_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	struct xfs_inode		*ip = XFS_I(file_inode(swap_file));

	if (xfs_is_zoned_inode(ip))
		return -EINVAL;

	/*
	 * Swap file activation can race against concurrent shared extent
	 * removal in files that have been cloned. If this happens,
	 * iomap_swapfile_iter() can fail because it encountered a shared
	 * extent even though an operation is in progress to remove those
	 * shared extents.
	 *
	 * This race becomes problematic when we defer extent removal
	 * operations beyond the end of a syscall (i.e. use async background
	 * processing algorithms). Users think the extents are no longer
	 * shared, but iomap_swapfile_iter() still sees them as shared
	 * because the refcountbt entries for the extents being removed have
	 * not yet been updated. Hence the swapon call fails unexpectedly.
	 *
	 * The race condition is currently most obvious from the unlink()
	 * operation as extent removal is deferred until after the last
	 * reference to the inode goes away. We then process the extent
	 * removal asynchronously, hence triggers the "syscall completed but
	 * work not done" condition mentioned above. To close this race
	 * window, we need to flush any pending inodegc operations to ensure
	 * they have updated the refcountbt records before we try to map the
	 * swapfile.
	 */
	xfs_inodegc_flush(ip->i_mount);

	/*
	 * Direct the swap code to the correct block device when this file
	 * sits on the RT device.
	 */
	sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;

	return iomap_swapfile_activate(sis, swap_file, span,
			&xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.read_folio = xfs_vm_read_folio,
	.readahead = xfs_vm_readahead,
	.writepages = xfs_vm_writepages,
	.dirty_folio = iomap_dirty_folio,
	.release_folio = iomap_release_folio,
	.invalidate_folio = iomap_invalidate_folio,
	.bmap = xfs_vm_bmap,
	.migrate_folio = filemap_migrate_folio,
	.is_partially_uptodate = iomap_is_partially_uptodate,
	.error_remove_folio = generic_error_remove_folio,
	.swap_activate = xfs_vm_swap_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages = xfs_dax_writepages,
	.dirty_folio = noop_dirty_folio,
	.swap_activate = xfs_vm_swap_activate,
};