mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 01:24:47 +01:00
Chris Mason reports that his AI tools noticed that we were using
xfs_perag_put and xfs_group_put to release the group reference returned
by xfs_group_next_range. However, that iterator returns an object with
an *active* refcount, so we must release it with the matching _rele
functions instead.
Fixes: b8accfd65d ("xfs: add media verification ioctl")
Reported-by: Chris Mason <clm@meta.com>
Link: https://lore.kernel.org/linux-xfs/20260206030527.2506821-1-clm@meta.com/
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
445 lines
10 KiB
C
445 lines
10 KiB
C
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
/*
|
|
* Copyright (c) 2026 Oracle. All Rights Reserved.
|
|
* Author: Darrick J. Wong <djwong@kernel.org>
|
|
*/
|
|
#include "xfs_platform.h"
|
|
#include "xfs_shared.h"
|
|
#include "xfs_format.h"
|
|
#include "xfs_log_format.h"
|
|
#include "xfs_trans_resv.h"
|
|
#include "xfs_mount.h"
|
|
#include "xfs_bit.h"
|
|
#include "xfs_btree.h"
|
|
#include "xfs_inode.h"
|
|
#include "xfs_icache.h"
|
|
#include "xfs_trans.h"
|
|
#include "xfs_alloc.h"
|
|
#include "xfs_ag.h"
|
|
#include "xfs_rmap.h"
|
|
#include "xfs_rmap_btree.h"
|
|
#include "xfs_rtgroup.h"
|
|
#include "xfs_rtrmap_btree.h"
|
|
#include "xfs_health.h"
|
|
#include "xfs_healthmon.h"
|
|
#include "xfs_trace.h"
|
|
#include "xfs_verify_media.h"
|
|
|
|
#include <linux/fserror.h>
|
|
|
|
/*
 * Portion of a group that was lost to a media error, in group block units.
 * Passed to the rmap query callback so it can clamp each rmap record to the
 * range that was actually lost.
 */
struct xfs_group_data_lost {
	xfs_agblock_t		startblock;	/* first lost block in group */
	xfs_extlen_t		blockcount;	/* number of lost blocks */
};
|
|
|
|
/*
 * Report lost file data from rmap records.
 *
 * Called for each reverse mapping that intersects the lost range.  Grabs the
 * owning inode, marks the appropriate metadata sick, or reports the affected
 * byte range of the file.  Always returns 0 so that the rmap query keeps
 * going even if one owner cannot be processed.
 */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
		(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	/* Exclusive ends of the lost range and of this rmap record. */
	const xfs_agblock_t		lost_end =
		lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
		rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	/* Non-inode owners have no file inode to notify. */
	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
		return 0;

	/* If we can't grab the owner, skip it rather than fail the scan. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	/* A lost bmbt block means the whole fork mapping is suspect. */
	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	/* Attr fork blocks aren't file data; mark the xattrs sick instead. */
	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	/*
	 * Trim this record to its intersection with the lost range so we only
	 * report the file bytes that were actually affected.
	 */
	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	/* Report the lost byte range against the owner's VFS inode. */
	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}
|
|
|
|
/*
 * Walk reverse mappings to look for all file data loss.
 *
 * @daddr/@bblen describe the bad range in 512b units on either the data
 * device (XG_TYPE_AG) or the realtime device (XG_TYPE_RTG).  For each group
 * touched by the range, query the (rt)rmap btree and report affected owners
 * via xfs_verify_report_data_lost.  Errors abort the walk but are not
 * propagated; this is best-effort reporting.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* Translate the daddr range into block numbers on the right device. */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	/* Empty transaction; we only read metadata here. */
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	/*
	 * The iterator hands back each group with an active reference; on any
	 * early exit we must drop it ourselves with the _rele functions.
	 */
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;

		/* Set up an rmap cursor for this AG or rtgroup. */
		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* Drop the active group ref before bailing. */
				xfs_perag_rele(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] range in which we look for the files or
		 * metadata.  Middle groups are queried in full; the first and
		 * last groups are clamped to the affected range.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
					xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
					xfs_fsb_to_gbno(mp, end_bno, type);

		/* Clamp the lost extent to the end of this group. */
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = min(xg->xg_block_count,
				      ri_high.rm_startblock + 1) -
				  ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* Release the active reference held by the iterator. */
			xfs_group_rele(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	return 0;
}
|
|
|
|
/*
|
|
* Compute the desired verify IO size.
|
|
*
|
|
* To minimize command overhead, we'd like to create bios that are 1MB, though
|
|
* we allow the user to ask for a smaller size.
|
|
*/
|
|
static unsigned int
|
|
xfs_verify_iosize(
|
|
const struct xfs_verify_media *me,
|
|
struct xfs_buftarg *btp,
|
|
uint64_t bbcount)
|
|
{
|
|
unsigned int iosize =
|
|
min_not_zero(SZ_1M, me->me_max_io_size);
|
|
|
|
BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
|
|
ASSERT(BBTOB(bbcount) >= bdev_logical_block_size(btp->bt_bdev));
|
|
|
|
return clamp(iosize, bdev_logical_block_size(btp->bt_bdev),
|
|
BBTOB(bbcount));
|
|
}
|
|
|
|
/* Allocate as much memory as we can get for verification buffer. */
|
|
static struct folio *
|
|
xfs_verify_alloc_folio(
|
|
const unsigned int iosize)
|
|
{
|
|
unsigned int order = get_order(iosize);
|
|
|
|
while (order > 0) {
|
|
struct folio *folio =
|
|
folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
|
|
|
|
if (folio)
|
|
return folio;
|
|
order--;
|
|
}
|
|
|
|
return folio_alloc(GFP_KERNEL, 0);
|
|
}
|
|
|
|
/*
 * Report any kind of problem verifying media.
 *
 * Records an operational errno for the caller if nothing was verified at
 * all, then classifies the bio status: only statuses that indicate actual
 * data loss are forwarded to healthmon and (when rmap is available) mapped
 * back to affected file owners.
 */
static void
xfs_verify_media_error(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me,
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	unsigned int		bio_bbcount,
	blk_status_t		bio_status)
{
	trace_xfs_verify_media_error(mp, me, btp->bt_bdev->bd_dev, daddr,
			bio_bbcount, bio_status);

	/*
	 * Pass any error, I/O or otherwise, up to the caller if we didn't
	 * successfully verify any bytes at all.
	 */
	if (me->me_start_daddr == daddr)
		me->me_ioerror = -blk_status_to_errno(bio_status);

	/*
	 * PI validation failures, medium errors, or general IO errors are
	 * treated as indicators of data loss.  Everything else is (hopefully)
	 * a transient error and is not reported to healthmon or fsnotify.
	 */
	switch (bio_status) {
	case BLK_STS_PROTECTION:
	case BLK_STS_IOERR:
	case BLK_STS_MEDIUM:
		break;
	default:
		return;
	}

	/* The caller may opt out of loss reporting entirely. */
	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
		return;

	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);

	/* Without rmap we cannot map bad blocks back to their owners. */
	if (!xfs_has_rmapbt(mp))
		return;

	/* Walk the relevant rmap btrees; the log device has no rmap. */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
		break;
	case XFS_DEV_RT:
		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
		break;
	}
}
|
|
|
|
/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * Reads [me_start_daddr, me_end_daddr) in chunks, reusing one bio and one
 * folio.  On success (or on a read failure after some progress has been
 * made) returns 0 and advances me_start_daddr to the first unverified
 * sector; read failures are recorded in me_ioerror and reported through
 * xfs_verify_media_error.  Only operational problems (bad arguments, OOM,
 * fatal signal) produce a nonzero return.
 */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	/* Pick the buffer target for the requested device. */
	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		/* An internal log lives on the data device; don't re-scan. */
		if (mp->m_logdev_targp->bt_bdev != mp->m_ddev_targp->bt_bdev)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
				bdev_logical_block_size(btp->bt_bdev)))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 * - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *   user wants to verify.  end_daddr can be beyond the end of the
	 *   disk; we'll constrain it to the end if necessary.
	 *
	 * - [daddr, me->me_end_daddr) is the range that we have not yet
	 *   verified.  We update daddr after each successful read.
	 *   me->me_start_daddr is set to daddr before returning.
	 *
	 * - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *   verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_bdev->bd_dev, daddr, bbcount,
			folio);

	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		/* Reuse the single bio; reset it for each chunk. */
		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			/*
			 * Read failed: report it and stop.  The read error is
			 * conveyed via me_ioerror, not the return value.
			 */
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		if (bbcount == 0)
			break;

		/* Optional pause between reads to limit load on the device. */
		if (me->me_rest_us) {
			ktime_t	expires;

			expires = ktime_add_ns(ktime_get(),
					me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		/* Let a fatal signal abort a long-running scan. */
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_bdev->bd_dev);
	return 0;
}
|
|
|
|
int
|
|
xfs_ioc_verify_media(
|
|
struct file *file,
|
|
struct xfs_verify_media __user *arg)
|
|
{
|
|
struct xfs_verify_media me;
|
|
struct xfs_inode *ip = XFS_I(file_inode(file));
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
int error;
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
if (copy_from_user(&me, arg, sizeof(me)))
|
|
return -EFAULT;
|
|
|
|
if (me.me_pad)
|
|
return -EINVAL;
|
|
if (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS)
|
|
return -EINVAL;
|
|
|
|
switch (me.me_dev) {
|
|
case XFS_DEV_DATA:
|
|
case XFS_DEV_LOG:
|
|
case XFS_DEV_RT:
|
|
break;
|
|
default:
|
|
return -EINVAL;
|
|
}
|
|
|
|
error = xfs_verify_media(mp, &me);
|
|
if (error)
|
|
return error;
|
|
|
|
if (copy_to_user(arg, &me, sizeof(me)))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
|