From 00f6c1b4d15d35fadb7f34768a1831c81aaa8936 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Mon, 15 Dec 2025 20:44:12 +0800
Subject: [PATCH 01/18] md: Fix static checker warning in analyze_sbs

The following warn is reported:

 drivers/md/md.c:3912 analyze_sbs()
 warn: iterator 'i' not incremented

Fixes: d8730f0cf4ef ("md: Remove deprecated CONFIG_MD_MULTIPATH")
Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
Closes: https://lore.kernel.org/linux-raid/7e2e95ce-3740-09d8-a561-af6bfb767f18@huaweicloud.com/T/#t
Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/linux-raid/20251215124412.4015572-1-linan666@huaweicloud.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e5922a682953..03433c88fb54 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3882,7 +3882,6 @@ out_free_rdev:
 
 static int analyze_sbs(struct mddev *mddev)
 {
-	int i;
 	struct md_rdev *rdev, *freshest, *tmp;
 
 	freshest = NULL;
@@ -3909,11 +3908,9 @@ static int analyze_sbs(struct mddev *mddev)
 	super_types[mddev->major_version].
 		validate_super(mddev, NULL/*freshest*/, freshest);
 
-	i = 0;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		if (mddev->max_disks &&
-		    (rdev->desc_nr >= mddev->max_disks ||
-		     i > mddev->max_disks)) {
+		    rdev->desc_nr >= mddev->max_disks) {
 			pr_warn("md: %s: %pg: only %d devices permitted\n",
 				mdname(mddev), rdev->bdev,
 				mddev->max_disks);

From 7ad6ef91d8745d04aff9cce7bdbc6320d8e05fe9 Mon Sep 17 00:00:00 2001
From: Tuo Li <islituo@gmail.com>
Date: Thu, 25 Dec 2025 21:03:26 +0800
Subject: [PATCH 02/18] md/raid5: fix possible null-pointer dereferences in
 raid5_store_group_thread_cnt()

The variable mddev->private is first assigned to conf and then checked:

  conf = mddev->private;
  if (!conf) ...

If conf is NULL, then mddev->private is also NULL. In this case,
null-pointer dereferences can occur when calling raid5_quiesce():

  raid5_quiesce(mddev, true);
  raid5_quiesce(mddev, false);

since mddev->private is assigned to conf again in raid5_quiesce(), and conf
is dereferenced in several places, for example:

  conf->quiesce = 0;
  wake_up(&conf->wait_for_quiescent);

To fix this issue, the function should unlock mddev and return before
invoking raid5_quiesce() when conf is NULL, following the existing pattern
in raid5_change_consistency_policy().

Fixes: fa1944bbe622 ("md/raid5: Wait sync io to finish before changing group cnt")
Signed-off-by: Tuo Li <islituo@gmail.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Reviewed-by: Paul Menzel <pmenzel@molgen.mpg.de>
Link: https://lore.kernel.org/linux-raid/20251225130326.67780-1-islituo@gmail.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/raid5.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e57ce3295292..8dc98f545969 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7187,12 +7187,14 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
 	err = mddev_suspend_and_lock(mddev);
 	if (err)
 		return err;
+	conf = mddev->private;
+	if (!conf) {
+		mddev_unlock_and_resume(mddev);
+		return -ENODEV;
+	}
 	raid5_quiesce(mddev, true);
 
-	conf = mddev->private;
-	if (!conf)
-		err = -ENODEV;
-	else if (new != conf->worker_cnt_per_group) {
+	if (new != conf->worker_cnt_per_group) {
 		old_groups = conf->worker_groups;
 		if (old_groups)
 			flush_workqueue(raid5_wq);

From 2cc583653bbe050bacd1cadcc9776d39bf449740 Mon Sep 17 00:00:00 2001
From: FengWei Shih <dannyshih@synology.com>
Date: Fri, 26 Dec 2025 18:18:16 +0800
Subject: [PATCH 03/18] md: suspend array while updating raid_disks via sysfs

In raid1_reshape(), freeze_array() is called before modifying the r1bio
memory pool (conf->r1bio_pool) and conf->raid_disks, and
unfreeze_array() is called after the update is completed.

However, freeze_array() only waits until nr_sync_pending and
(nr_pending - nr_queued) of all buckets reaches zero. When an I/O error
occurs, nr_queued is increased and the corresponding r1bio is queued to
either retry_list or bio_end_io_list. As a result, freeze_array() may
unblock before these r1bios are released.

This can lead to a situation where conf->raid_disks and the mempool have
already been updated while queued r1bios, allocated with the old
raid_disks value, are later released. Consequently, free_r1bio() may
access memory out of bounds in put_all_bios() and release r1bios of the
wrong size to the new mempool, potentially causing issues with the
mempool as well.

Since only normal I/O might increase nr_queued while an I/O error occurs,
suspending the array avoids this issue.

Note: Updating raid_disks via ioctl SET_ARRAY_INFO already suspends
the array. Therefore, we suspend the array when updating raid_disks
via sysfs to avoid this issue too.

Signed-off-by: FengWei Shih <dannyshih@synology.com>
Link: https://lore.kernel.org/linux-raid/20251226101816.4506-1-dannyshih@synology.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 03433c88fb54..dda272e87a1b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4404,7 +4404,7 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err < 0)
 		return err;
 
-	err = mddev_lock(mddev);
+	err = mddev_suspend_and_lock(mddev);
 	if (err)
 		return err;
 	if (mddev->pers)
@@ -4429,7 +4429,7 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 	} else
 		mddev->raid_disks = n;
 out_unlock:
-	mddev_unlock(mddev);
+	mddev_unlock_and_resume(mddev);
 	return err ? err : len;
 }
 static struct md_sysfs_entry md_raid_disks =

From 864466c38c4a0446088a8e866538c83c3f2018cb Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Fri, 26 Dec 2025 10:42:20 +0800
Subject: [PATCH 04/18] md: Fix logical_block_size configuration being
 overwritten

In super_1_validate(), mddev->logical_block_size is directly overwritten
with the value from metadata. This causes the previously configured lbs
to be lost, making the configuration ineffective. Fix it.

Fixes: 62ed1b582246 ("md: allow configuring logical block size")
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Reviewed-by: Xiao Ni <xni@redhat.com>
Link: https://lore.kernel.org/linux-raid/20251226024221.724201-1-linan666@huaweicloud.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index dda272e87a1b..a13f92a64df6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1999,7 +1999,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
 		mddev->layout = le32_to_cpu(sb->layout);
 		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
 		mddev->dev_sectors = le64_to_cpu(sb->size);
-		mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.space = 0;
@@ -2015,6 +2014,9 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
 
 		mddev->max_disks =  (4096-256)/2;
 
+		if (!mddev->logical_block_size)
+			mddev->logical_block_size = le32_to_cpu(sb->logical_block_size);
+
 		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
 		    mddev->bitmap_info.file == NULL) {
 			mddev->bitmap_info.offset =

From a4166f1c4893a9a620507255b9d1ccab44fab189 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Fri, 26 Dec 2025 10:42:21 +0800
Subject: [PATCH 05/18] md: Fix forward incompatibility from configurable
 logical block size

Commit 62ed1b582246 ("md: allow configuring logical block size") used
reserved pad to add 'logical_block_size' to metadata. RAID rejects
non-zero reserved pad, so arrays fail when rolling back to old kernels
after booting new ones.

Set 'logical_block_size' only for newly created arrays to support rollback
to old kernels. Importantly new arrays still won't work on old kernels to
prevent data loss issue from LBS changes.

For arrays created on old kernels which confirmed not to rollback,
configure LBS by echo current LBS (queue/logical_block_size) to
md/logical_block_size.

Fixes: 62ed1b582246 ("md: allow configuring logical block size")
Reported-by: BugReports <bugreports61@gmail.com>
Closes: https://lore.kernel.org/linux-raid/825e532d-d1e1-44bb-5581-692b7c091796@huaweicloud.com/T/#t
Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/linux-raid/20251226024221.724201-2-linan666@huaweicloud.com
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 drivers/md/md.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a13f92a64df6..6d73f6e196a9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5980,13 +5980,33 @@ lbs_store(struct mddev *mddev, const char *buf, size_t len)
 	if (mddev->major_version == 0)
 		return -EINVAL;
 
-	if (mddev->pers)
-		return -EBUSY;
-
 	err = kstrtouint(buf, 10, &lbs);
 	if (err < 0)
 		return -EINVAL;
 
+	if (mddev->pers) {
+		unsigned int curr_lbs;
+
+		if (mddev->logical_block_size)
+			return -EBUSY;
+		/*
+		 * To fix forward compatibility issues, LBS is not
+		 * configured for arrays from old kernels (<=6.18) by default.
+		 * If the user confirms no rollback to old kernels,
+		 * enable LBS by writing current LBS — to prevent data
+		 * loss from LBS changes.
+		 */
+		curr_lbs = queue_logical_block_size(mddev->gendisk->queue);
+		if (lbs != curr_lbs)
+			return -EINVAL;
+
+		mddev->logical_block_size = curr_lbs;
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+		pr_info("%s: logical block size configured successfully, array will not be assembled in old kernels (<= 6.18)\n",
+			mdname(mddev));
+		return len;
+	}
+
 	err = mddev_lock(mddev);
 	if (err)
 		goto unlock;
@@ -6162,7 +6182,27 @@ int mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim,
 			mdname(mddev));
 		return -EINVAL;
 	}
-	mddev->logical_block_size = lim->logical_block_size;
+
+	/* Only 1.x meta needs to set logical block size */
+	if (mddev->major_version == 0)
+		return 0;
+
+	/*
+	 * Fix forward compatibility issue. Only set LBS by default for
+	 * new arrays, mddev->events == 0 indicates the array was just
+	 * created. When assembling an array, read LBS from the superblock
+	 * instead — LBS is 0 in superblocks created by old kernels.
+	 */
+	if (!mddev->events) {
+		pr_info("%s: array will not be assembled in old kernels that lack configurable LBS support (<= 6.18)\n",
+			mdname(mddev));
+		mddev->logical_block_size = lim->logical_block_size;
+	}
+
+	if (!mddev->logical_block_size)
+		pr_warn("%s: echo current LBS to md/logical_block_size to prevent data loss issues from LBS changes.\n"
+			"\tNote: After setting, array will not be assembled in old kernels (<= 6.18)\n",
+			mdname(mddev));
 
 	return 0;
 }

From 04bdb1a04d8a2a89df504c1e34250cd3c6e31a1c Mon Sep 17 00:00:00 2001
From: shechenglong <shechenglong@xfusion.com>
Date: Sun, 28 Dec 2025 21:04:26 +0800
Subject: [PATCH 06/18] block,bfq: fix aux stat accumulation destination

Route bfqg_stats_add_aux() time accumulation into the destination
stats object instead of the source, aligning with other stat fields.

Reviewed-by: Yu Kuai <yukuai@fnnas.com>
Signed-off-by: shechenglong <shechenglong@xfusion.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fb9f3533150..6a75fe1c7a5c 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -380,7 +380,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
 	blkg_rwstat_add_aux(&to->merged, &from->merged);
 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	bfq_stat_add_aux(&from->time, &from->time);
+	bfq_stat_add_aux(&to->time, &from->time);
 	bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 	bfq_stat_add_aux(&to->avg_queue_size_samples,
 			  &from->avg_queue_size_samples);

From 7fc4da6a304bdcd3de14fc946dc2c19437a9cc5a Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 23 Dec 2025 11:27:40 +0800
Subject: [PATCH 07/18] ublk: scan partition in async way

Implement async partition scan to avoid IO hang when reading partition
tables. Similar to nvme_partition_scan_work(), partition scanning is
deferred to a work queue to prevent deadlocks.

When partition scan happens synchronously during add_disk(), IO errors
can cause the partition scan to wait while holding ub->mutex, which
can deadlock with other operations that need the mutex.

Changes:
- Add partition_scan_work to ublk_device structure
- Implement ublk_partition_scan_work() to perform async scan
- Always suppress sync partition scan during add_disk()
- Schedule async work after add_disk() for trusted daemons
- Add flush_work() in ublk_stop_dev() before grabbing ub->mutex

Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
Reported-by: Yoav Cohen <yoav@nvidia.com>
Closes: https://lore.kernel.org/linux-block/DM4PR12MB63280C5637917C071C2F0D65A9A8A@DM4PR12MB6328.namprd12.prod.outlook.com/
Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver")
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 49c208457198..837fedb02e0d 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -237,6 +237,7 @@ struct ublk_device {
 	bool canceling;
 	pid_t 	ublksrv_tgid;
 	struct delayed_work	exit_work;
+	struct work_struct	partition_scan_work;
 
 	struct ublk_queue       *queues[];
 };
@@ -254,6 +255,20 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
 		u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
 static inline unsigned int ublk_req_build_flags(struct request *req);
 
+static void ublk_partition_scan_work(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, partition_scan_work);
+
+	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
+					     &ub->ub_disk->state)))
+		return;
+
+	mutex_lock(&ub->ub_disk->open_mutex);
+	bdev_disk_changed(ub->ub_disk, false);
+	mutex_unlock(&ub->ub_disk->open_mutex);
+}
+
 static inline struct ublksrv_io_desc *
 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
 {
@@ -2026,6 +2041,7 @@ static void ublk_stop_dev(struct ublk_device *ub)
 	mutex_lock(&ub->mutex);
 	ublk_stop_dev_unlocked(ub);
 	mutex_unlock(&ub->mutex);
+	flush_work(&ub->partition_scan_work);
 	ublk_cancel_dev(ub);
 }
 
@@ -2954,9 +2970,17 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
 
 	ublk_apply_params(ub);
 
-	/* don't probe partitions if any daemon task is un-trusted */
-	if (ub->unprivileged_daemons)
-		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+	/*
+	 * Suppress partition scan to avoid potential IO hang.
+	 *
+	 * If ublk server error occurs during partition scan, the IO may
+	 * wait while holding ub->mutex, which can deadlock with other
+	 * operations that need the mutex. Defer partition scan to async
+	 * work.
+	 * For unprivileged daemons, keep GD_SUPPRESS_PART_SCAN set
+	 * permanently.
+	 */
+	set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
 
 	ublk_get_device(ub);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
@@ -2973,6 +2997,10 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
 
 	set_bit(UB_STATE_USED, &ub->state);
 
+	/* Schedule async partition scan for trusted daemons */
+	if (!ub->unprivileged_daemons)
+		schedule_work(&ub->partition_scan_work);
+
 out_put_cdev:
 	if (ret) {
 		ublk_detach_disk(ub);
@@ -3138,6 +3166,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
 	mutex_init(&ub->mutex);
 	spin_lock_init(&ub->lock);
 	mutex_init(&ub->cancel_mutex);
+	INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
 
 	ret = ublk_alloc_dev_number(ub, header->dev_id);
 	if (ret < 0)

From 60cf863720308ab89ce2fdafea7fcb2cefd9c144 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 23 Dec 2025 11:27:41 +0800
Subject: [PATCH 08/18] selftests/ublk: add test for async partition scan

Add test_generic_15.sh to verify that async partition scan prevents
IO hang when reading partition tables.

The test creates ublk devices with fault_inject target and very large
delay (60s) to simulate blocked partition table reads, then kills the
daemon to verify proper state transitions without hanging:

1. Without recovery support:
   - Create device with fault_inject and 60s delay
   - Kill daemon while partition scan may be blocked
   - Verify device transitions to DEAD state

2. With recovery support (-r 1):
   - Create device with fault_inject, 60s delay, and recovery
   - Kill daemon while partition scan may be blocked
   - Verify device transitions to QUIESCED state

Before the async partition scan fix, killing the daemon during
partition scan would cause deadlock as partition scan held ub->mutex
while waiting for IO. With the async fix, partition scan happens in
a work function and flush_work() ensures proper synchronization.

Add _add_ublk_dev_no_settle() helper function to skip udevadm settle,
which would otherwise hang waiting for partition scan events to
complete when partition table read is delayed.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile         |  1 +
 tools/testing/selftests/ublk/test_common.sh   | 16 +++--
 .../testing/selftests/ublk/test_generic_15.sh | 68 +++++++++++++++++++
 3 files changed, 81 insertions(+), 4 deletions(-)
 create mode 100755 tools/testing/selftests/ublk/test_generic_15.sh

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index 837977b62417..eb0e6cfb00ad 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -22,6 +22,7 @@ TEST_PROGS += test_generic_11.sh
 TEST_PROGS += test_generic_12.sh
 TEST_PROGS += test_generic_13.sh
 TEST_PROGS += test_generic_14.sh
+TEST_PROGS += test_generic_15.sh
 
 TEST_PROGS += test_null_01.sh
 TEST_PROGS += test_null_02.sh
diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh
index 6f1c042de40e..ea9a5f3eb70a 100755
--- a/tools/testing/selftests/ublk/test_common.sh
+++ b/tools/testing/selftests/ublk/test_common.sh
@@ -178,8 +178,9 @@ _have_feature()
 _create_ublk_dev() {
 	local dev_id;
 	local cmd=$1
+	local settle=$2
 
-	shift 1
+	shift 2
 
 	if [ ! -c /dev/ublk-control ]; then
 		return ${UBLK_SKIP_CODE}
@@ -194,7 +195,10 @@ _create_ublk_dev() {
 		echo "fail to add ublk dev $*"
 		return 255
 	fi
-	udevadm settle
+
+	if [ "$settle" = "yes" ]; then
+		udevadm settle
+	fi
 
 	if [[ "$dev_id" =~ ^[0-9]+$ ]]; then
 		echo "${dev_id}"
@@ -204,14 +208,18 @@ _create_ublk_dev() {
 }
 
 _add_ublk_dev() {
-	_create_ublk_dev "add" "$@"
+	_create_ublk_dev "add" "yes" "$@"
+}
+
+_add_ublk_dev_no_settle() {
+	_create_ublk_dev "add" "no" "$@"
 }
 
 _recover_ublk_dev() {
 	local dev_id
 	local state
 
-	dev_id=$(_create_ublk_dev "recover" "$@")
+	dev_id=$(_create_ublk_dev "recover" "yes" "$@")
 	for ((j=0;j<20;j++)); do
 		state=$(_get_ublk_dev_state "${dev_id}")
 		[ "$state" == "LIVE" ] && break
diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh
new file mode 100755
index 000000000000..76379362e0a2
--- /dev/null
+++ b/tools/testing/selftests/ublk/test_generic_15.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh
+
+TID="generic_15"
+ERR_CODE=0
+
+_test_partition_scan_no_hang()
+{
+	local recovery_flag=$1
+	local expected_state=$2
+	local dev_id
+	local state
+	local daemon_pid
+	local start_time
+	local elapsed
+
+	# Create ublk device with fault_inject target and very large delay
+	# to simulate hang during partition table read
+	# --delay_us 60000000 = 60 seconds delay
+	# Use _add_ublk_dev_no_settle to avoid udevadm settle hang waiting
+	# for partition scan events to complete
+	if [ "$recovery_flag" = "yes" ]; then
+		echo "Testing partition scan with recovery support..."
+		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000 -r 1)
+	else
+		echo "Testing partition scan without recovery..."
+		dev_id=$(_add_ublk_dev_no_settle -t fault_inject -q 1 -d 1 --delay_us 60000000)
+	fi
+
+	_check_add_dev "$TID" $?
+
+	# The add command should return quickly because partition scan is async.
+	# Now sleep briefly to let the async partition scan work start and hit
+	# the delay in the fault_inject handler.
+	sleep 1
+
+	# Kill the ublk daemon while partition scan is potentially blocked
+	# And check state transitions properly
+	start_time=${SECONDS}
+	daemon_pid=$(_get_ublk_daemon_pid "${dev_id}")
+	state=$(__ublk_kill_daemon "${dev_id}" "${expected_state}")
+	elapsed=$((SECONDS - start_time))
+
+	# Verify the device transitioned to expected state
+	if [ "$state" != "${expected_state}" ]; then
+		echo "FAIL: Device state is $state, expected ${expected_state}"
+		ERR_CODE=255
+		${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+		return
+	fi
+	echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging"
+
+	# Clean up the device
+	${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1
+}
+
+_prep_test "partition_scan" "verify async partition scan prevents IO hang"
+
+# Test 1: Without recovery support - should transition to DEAD
+_test_partition_scan_no_hang "no" "DEAD"
+
+# Test 2: With recovery support - should transition to QUIESCED
+_test_partition_scan_no_hang "yes" "QUIESCED"
+
+_cleanup_test "partition_scan"
+_show_result $TID $ERR_CODE

From a2ce133969175d36d708b7c76536b375d0522e53 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 23 Dec 2025 11:27:42 +0800
Subject: [PATCH 09/18] selftests/ublk: fix Makefile to rebuild on header
 changes

Add header dependencies to kublk build rule so that changes to
kublk.h, ublk_dep.h, or utils.h trigger a rebuild.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 tools/testing/selftests/ublk/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile
index eb0e6cfb00ad..06ba6fde098d 100644
--- a/tools/testing/selftests/ublk/Makefile
+++ b/tools/testing/selftests/ublk/Makefile
@@ -51,10 +51,10 @@ TEST_PROGS += test_stress_07.sh
 
 TEST_GEN_PROGS_EXTENDED = kublk
 
+LOCAL_HDRS += $(wildcard *.h)
 include ../lib.mk
 
-$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c \
-	fault_inject.c
+$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c)
 
 check:
 	shellcheck -x -f gcc *.sh

From 10845a105bbcb030647a729f1716c2309da71d33 Mon Sep 17 00:00:00 2001
From: Cong Zhang <cong.zhang@oss.qualcomm.com>
Date: Tue, 30 Dec 2025 17:17:05 +0800
Subject: [PATCH 10/18] blk-mq: skip CPU offline notify on unmapped hctx

If an hctx has no software ctx mapped, blk_mq_map_swqueue() never
allocates tags and leaves hctx->tags NULL. The CPU hotplug offline
notifier can still run for that hctx, return early since hctx cannot
hold any requests.

Signed-off-by: Cong Zhang <cong.zhang@oss.qualcomm.com>
Fixes: bf0beec0607d ("blk-mq: drain I/O when all CPUs in a hctx are offline")
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1978eef95dca..eff4f72ce83b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3721,7 +3721,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
 			struct blk_mq_hw_ctx, cpuhp_online);
 	int ret = 0;
 
-	if (blk_mq_hctx_has_online_cpu(hctx, cpu))
+	if (!hctx->nr_ctx || blk_mq_hctx_has_online_cpu(hctx, cpu))
 		return 0;
 
 	/*

From 69153e8b97ebe2afc0dd101767a9805130305500 Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@inria.fr>
Date: Wed, 31 Dec 2025 18:22:07 +0100
Subject: [PATCH 11/18] block, bfq: update outdated comment

The function bfq_bfqq_may_idle() was renamed as bfq_better_to_idle()
in commit 277a4a9b56cd ("block, bfq: give a better name to
bfq_bfqq_may_idle").  Update the comment accordingly.

Signed-off-by: Julia Lawall <Julia.Lawall@inria.fr>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 34a498e6b2a5..355a731e2c04 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -984,7 +984,7 @@ struct bfq_group_data {
  *                   unused for the root group. Used to know whether there
  *                   are groups with more than one active @bfq_entity
  *                   (see the comments to the function
- *                   bfq_bfqq_may_idle()).
+ *                   bfq_better_to_idle()).
  * @rq_pos_tree: rbtree sorted by next_request position, used when
  *               determining if two or more queues have interleaving
  *               requests (see bfq_find_close_cooperator()).

From 08e136ebd193eae7d5eff4c66d576c4a2dabdc3f Mon Sep 17 00:00:00 2001
From: Raphael Pinsonneault-Thibeault <rpthibeault@gmail.com>
Date: Wed, 17 Dec 2025 14:00:40 -0500
Subject: [PATCH 12/18] loop: don't change loop device under exclusive opener
 in loop_set_status

loop_set_status() is allowed to change the loop device while there
are other openers of the device, even exclusive ones.

In this case, it causes a KASAN: slab-out-of-bounds Read in
ext4_search_dir(), since when looking for an entry in an inlined
directory, e_value_offs is changed underneath the filesystem by
loop_set_status().

Fix the problem by forbidding loop_set_status() from modifying the loop
device while there are exclusive openers of the device. This is similar
to the fix in loop_configure() by commit 33ec3e53e7b1 ("loop: Don't
change loop device under exclusive opener") alongside commit ecbe6bc0003b
("block: use bd_prepare_to_claim directly in the loop driver").

Reported-by: syzbot+3ee481e21fd75e14c397@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=3ee481e21fd75e14c397
Tested-by: syzbot+3ee481e21fd75e14c397@syzkaller.appspotmail.com
Tested-by: Yongpeng Yang <yangyongpeng@xiaomi.com>
Signed-off-by: Raphael Pinsonneault-Thibeault <rpthibeault@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 32a3a5b13802..ca74cc31bf07 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1225,13 +1225,24 @@ static int loop_clr_fd(struct loop_device *lo)
 }
 
 static int
-loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
+loop_set_status(struct loop_device *lo, blk_mode_t mode,
+		struct block_device *bdev, const struct loop_info64 *info)
 {
 	int err;
 	bool partscan = false;
 	bool size_changed = false;
 	unsigned int memflags;
 
+	/*
+	 * If we don't hold exclusive handle for the device, upgrade to it
+	 * here to avoid changing device under exclusive owner.
+	 */
+	if (!(mode & BLK_OPEN_EXCL)) {
+		err = bd_prepare_to_claim(bdev, loop_set_status, NULL);
+		if (err)
+			goto out_reread_partitions;
+	}
+
 	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
 		return err;
@@ -1273,6 +1284,9 @@ out_unfreeze:
 	}
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+	if (!(mode & BLK_OPEN_EXCL))
+		bd_abort_claiming(bdev, loop_set_status);
+out_reread_partitions:
 	if (partscan)
 		loop_reread_partitions(lo);
 
@@ -1352,7 +1366,9 @@ loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
 }
 
 static int
-loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
+loop_set_status_old(struct loop_device *lo, blk_mode_t mode,
+		    struct block_device *bdev,
+		    const struct loop_info __user *arg)
 {
 	struct loop_info info;
 	struct loop_info64 info64;
@@ -1360,17 +1376,19 @@ loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
 	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
 		return -EFAULT;
 	loop_info64_from_old(&info, &info64);
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, mode, bdev, &info64);
 }
 
 static int
-loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
+loop_set_status64(struct loop_device *lo, blk_mode_t mode,
+		  struct block_device *bdev,
+		  const struct loop_info64 __user *arg)
 {
 	struct loop_info64 info64;
 
 	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
 		return -EFAULT;
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, mode, bdev, &info64);
 }
 
 static int
@@ -1549,14 +1567,14 @@ static int lo_ioctl(struct block_device *bdev, blk_mode_t mode,
 	case LOOP_SET_STATUS:
 		err = -EPERM;
 		if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN))
-			err = loop_set_status_old(lo, argp);
+			err = loop_set_status_old(lo, mode, bdev, argp);
 		break;
 	case LOOP_GET_STATUS:
 		return loop_get_status_old(lo, argp);
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
 		if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN))
-			err = loop_set_status64(lo, argp);
+			err = loop_set_status64(lo, mode, bdev, argp);
 		break;
 	case LOOP_GET_STATUS64:
 		return loop_get_status64(lo, argp);
@@ -1650,8 +1668,9 @@ loop_info64_to_compat(const struct loop_info64 *info64,
 }
 
 static int
-loop_set_status_compat(struct loop_device *lo,
-		       const struct compat_loop_info __user *arg)
+loop_set_status_compat(struct loop_device *lo, blk_mode_t mode,
+		    struct block_device *bdev,
+		    const struct compat_loop_info __user *arg)
 {
 	struct loop_info64 info64;
 	int ret;
@@ -1659,7 +1678,7 @@ loop_set_status_compat(struct loop_device *lo,
 	ret = loop_info64_from_compat(arg, &info64);
 	if (ret < 0)
 		return ret;
-	return loop_set_status(lo, &info64);
+	return loop_set_status(lo, mode, bdev, &info64);
 }
 
 static int
@@ -1685,7 +1704,7 @@ static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode,
 
 	switch(cmd) {
 	case LOOP_SET_STATUS:
-		err = loop_set_status_compat(lo,
+		err = loop_set_status_compat(lo, mode, bdev,
 			     (const struct compat_loop_info __user *)arg);
 		break;
 	case LOOP_GET_STATUS:

From 7d121d701d58a92f26decb10da1d04a88b74519d Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 6 Jan 2026 06:26:57 -0800
Subject: [PATCH 13/18] blk-rq-qos: Remove unlikely() hints from QoS checks

The unlikely() annotations on QUEUE_FLAG_QOS_ENABLED checks are
counterproductive. Writeback throttling (WBT) might be enabled by
default, mainly because CONFIG_BLK_WBT_MQ defaults to 'y'.

Branch profiling on Meta servers, which have WBT enabled, confirms 100%
misprediction rates on these checks.

Remove the unlikely() annotations to let the CPU's branch predictor
learn the actual behavior, potentially improving I/O path performance.

Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-rq-qos.h | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index b538f2c0febc..a747a504fe42 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -112,29 +112,26 @@ void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
 
 static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos)
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
 		__rq_qos_cleanup(q->rq_qos, bio);
 }
 
 static inline void rq_qos_done(struct request_queue *q, struct request *rq)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos && !blk_rq_is_passthrough(rq))
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) &&
+	    q->rq_qos && !blk_rq_is_passthrough(rq))
 		__rq_qos_done(q->rq_qos, rq);
 }
 
 static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos)
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
 		__rq_qos_issue(q->rq_qos, rq);
 }
 
 static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos)
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
 		__rq_qos_requeue(q->rq_qos, rq);
 }
 
@@ -162,8 +159,7 @@ static inline void rq_qos_done_bio(struct bio *bio)
 
 static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos) {
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) {
 		bio_set_flag(bio, BIO_QOS_THROTTLED);
 		__rq_qos_throttle(q->rq_qos, bio);
 	}
@@ -172,16 +168,14 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
 static inline void rq_qos_track(struct request_queue *q, struct request *rq,
 				struct bio *bio)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos)
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
 		__rq_qos_track(q->rq_qos, rq, bio);
 }
 
 static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
 				struct bio *bio)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos) {
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos) {
 		bio_set_flag(bio, BIO_QOS_MERGED);
 		__rq_qos_merge(q->rq_qos, rq, bio);
 	}
@@ -189,8 +183,7 @@ static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
 
 static inline void rq_qos_queue_depth_changed(struct request_queue *q)
 {
-	if (unlikely(test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags)) &&
-			q->rq_qos)
+	if (test_bit(QUEUE_FLAG_QOS_ENABLED, &q->queue_flags) && q->rq_qos)
 		__rq_qos_queue_depth_changed(q->rq_qos);
 }
 

From 6acd4ac5f8f0ec9b946875553e52907700bcfc77 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Tue, 6 Jan 2026 13:08:37 -0700
Subject: [PATCH 14/18] block: don't merge bios with different app_tags

nvme_set_app_tag() uses the app_tag value from the bio_integrity_payload
of the struct request's first bio. This assumes all the request's bios
have the same app_tag. However, it is possible for bios with different
app_tag values to be merged into a single request.
Add a check in blk_integrity_merge_{bio,rq}() to prevent the merging of
bios/requests with different app_tag values if BIP_CHECK_APPTAG is set.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-integrity.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9b27963680dc..964eebbee14d 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -140,14 +140,21 @@ EXPORT_SYMBOL_GPL(blk_rq_integrity_map_user);
 bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 			    struct request *next)
 {
+	struct bio_integrity_payload *bip, *bip_next;
+
 	if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0)
 		return true;
 
 	if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0)
 		return false;
 
-	if (bio_integrity(req->bio)->bip_flags !=
-	    bio_integrity(next->bio)->bip_flags)
+	bip = bio_integrity(req->bio);
+	bip_next = bio_integrity(next->bio);
+	if (bip->bip_flags != bip_next->bip_flags)
+		return false;
+
+	if (bip->bip_flags & BIP_CHECK_APPTAG &&
+	    bip->app_tag != bip_next->app_tag)
 		return false;
 
 	if (req->nr_integrity_segments + next->nr_integrity_segments >
@@ -163,15 +170,21 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
 bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
 			     struct bio *bio)
 {
+	struct bio_integrity_payload *bip, *bip_bio = bio_integrity(bio);
 	int nr_integrity_segs;
 
-	if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL)
+	if (blk_integrity_rq(req) == 0 && bip_bio == NULL)
 		return true;
 
-	if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL)
+	if (blk_integrity_rq(req) == 0 || bip_bio == NULL)
 		return false;
 
-	if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags)
+	bip = bio_integrity(req->bio);
+	if (bip->bip_flags != bip_bio->bip_flags)
+		return false;
+
+	if (bip->bip_flags & BIP_CHECK_APPTAG &&
+	    bip->app_tag != bip_bio->app_tag)
 		return false;
 
 	nr_integrity_segs = blk_rq_count_integrity_sg(q, bio);

From 2704024d83fa9eb8e5f16925aae340fd9d246694 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 7 Jan 2026 19:41:43 +0900
Subject: [PATCH 15/18] loop: add missing bd_abort_claiming in loop_set_status

Commit 08e136ebd193 ("loop: don't change loop device under exclusive
opener in loop_set_status") forgot to call bd_abort_claiming() when
mutex_lock_killable() failed.

Fixes: 08e136ebd193 ("loop: don't change loop device under exclusive opener in loop_set_status")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ca74cc31bf07..bd59c0e9508b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1245,7 +1245,8 @@ loop_set_status(struct loop_device *lo, blk_mode_t mode,
 
 	err = mutex_lock_killable(&lo->lo_mutex);
 	if (err)
-		return err;
+		goto out_abort_claiming;
+
 	if (lo->lo_state != Lo_bound) {
 		err = -ENXIO;
 		goto out_unlock;
@@ -1284,6 +1285,7 @@ out_unfreeze:
 	}
 out_unlock:
 	mutex_unlock(&lo->lo_mutex);
+out_abort_claiming:
 	if (!(mode & BLK_OPEN_EXCL))
 		bd_abort_claiming(bdev, loop_set_status);
 out_reread_partitions:

From 9670db22e7ab4aefe2b2619589a47fef9d3e0c7e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 6 Jan 2026 16:56:07 +0100
Subject: [PATCH 16/18] blk-mq: avoid stall during boot due to
 synchronize_rcu_expedited

On the kernel 6.19-rc, I am experiencing 15-second boot stall in a
virtual machine when probing a virtio-scsi disk:
[    1.011641] SCSI subsystem initialized
[    1.013972] virtio_scsi virtio6: 16/0/0 default/read/poll queues
[    1.015983] scsi host0: Virtio SCSI HBA
[    1.019578] ACPI: \_SB_.GSIA: Enabled at IRQ 16
[    1.020225] ahci 0000:00:1f.2: AHCI vers 0001.0000, 32 command slots, 1.5 Gbps, SATA mode
[    1.020228] ahci 0000:00:1f.2: 6/6 ports implemented (port mask 0x3f)
[    1.020230] ahci 0000:00:1f.2: flags: 64bit ncq only
[    1.024688] scsi host1: ahci
[    1.025432] scsi host2: ahci
[    1.025966] scsi host3: ahci
[    1.026511] scsi host4: ahci
[    1.028371] scsi host5: ahci
[    1.028918] scsi host6: ahci
[    1.029266] ata1: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23100 irq 16 lpm-pol 1
[    1.029305] ata2: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23180 irq 16 lpm-pol 1
[    1.029316] ata3: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23200 irq 16 lpm-pol 1
[    1.029327] ata4: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23280 irq 16 lpm-pol 1
[    1.029341] ata5: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23300 irq 16 lpm-pol 1
[    1.029356] ata6: SATA max UDMA/133 abar m4096@0xfea23000 port 0xfea23380 irq 16 lpm-pol 1
[    1.118111] scsi 0:0:0:0: Direct-Access     QEMU     QEMU HARDDISK 2.5+ PQ: 0 ANSI: 5
[    1.348916] ata1: SATA link down (SStatus 0 SControl 300)
[    1.350713] ata2: SATA link down (SStatus 0 SControl 300)
[    1.351025] ata6: SATA link down (SStatus 0 SControl 300)
[    1.351160] ata5: SATA link down (SStatus 0 SControl 300)
[    1.351326] ata3: SATA link down (SStatus 0 SControl 300)
[    1.351536] ata4: SATA link down (SStatus 0 SControl 300)
[    1.449153] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input2
[   16.483477] sd 0:0:0:0: Power-on or device reset occurred
[   16.483691] sd 0:0:0:0: [sda] 2097152 512-byte logical blocks: (1.07 GB/1.00 GiB)
[   16.483762] sd 0:0:0:0: [sda] Write Protect is off
[   16.483877] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
[   16.569225] sd 0:0:0:0: [sda] Attached SCSI disk

I bisected it and it is caused by the commit 89e1fb7ceffd which
introduces calls to synchronize_rcu_expedited.

This commit replaces synchronize_rcu_expedited and kfree with a call to
kfree_rcu_mightsleep, avoiding the 15-second delay.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Fixes: 89e1fb7ceffd ("blk-mq: fix potential uaf for 'queue_hw_ctx'")
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index eff4f72ce83b..a29d8ac9d3e3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4553,8 +4553,7 @@ static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		 * Make sure reading the old queue_hw_ctx from other
 		 * context concurrently won't trigger uaf.
 		 */
-		synchronize_rcu_expedited();
-		kfree(hctxs);
+		kfree_rcu_mightsleep(hctxs);
 		hctxs = new_hctxs;
 	}
 

From f0d385f6689f37a2828c686fb279121df006b4cb Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 9 Jan 2026 20:14:54 +0800
Subject: [PATCH 17/18] ublk: fix use-after-free in ublk_partition_scan_work

A race condition exists between the async partition scan work and device
teardown that can lead to a use-after-free of ub->ub_disk:

1. ublk_ctrl_start_dev() schedules partition_scan_work after add_disk()
2. ublk_stop_dev() calls ublk_stop_dev_unlocked() which does:
   - del_gendisk(ub->ub_disk)
   - ublk_detach_disk() sets ub->ub_disk = NULL
   - put_disk() which may free the disk
3. The worker ublk_partition_scan_work() then dereferences ub->ub_disk
   leading to UAF

Fix this by using ublk_get_disk()/ublk_put_disk() in the worker to hold
a reference to the disk during the partition scan. The spinlock in
ublk_get_disk() synchronizes with ublk_detach_disk() ensuring the worker
either gets a valid reference or sees NULL and exits early.

Also change flush_work() to cancel_work_sync() to avoid running the
partition scan work unnecessarily when the disk is already detached.

Fixes: 7fc4da6a304b ("ublk: scan partition in async way")
Reported-by: Ruikai Peng <ruikai@pwno.io>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ublk_drv.c | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 837fedb02e0d..f6e5a0766721 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -255,20 +255,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
 		u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
 static inline unsigned int ublk_req_build_flags(struct request *req);
 
-static void ublk_partition_scan_work(struct work_struct *work)
-{
-	struct ublk_device *ub =
-		container_of(work, struct ublk_device, partition_scan_work);
-
-	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
-					     &ub->ub_disk->state)))
-		return;
-
-	mutex_lock(&ub->ub_disk->open_mutex);
-	bdev_disk_changed(ub->ub_disk, false);
-	mutex_unlock(&ub->ub_disk->open_mutex);
-}
-
 static inline struct ublksrv_io_desc *
 ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
 {
@@ -1597,6 +1583,27 @@ static void ublk_put_disk(struct gendisk *disk)
 		put_device(disk_to_dev(disk));
 }
 
+static void ublk_partition_scan_work(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, partition_scan_work);
+	/* Hold disk reference to prevent UAF during concurrent teardown */
+	struct gendisk *disk = ublk_get_disk(ub);
+
+	if (!disk)
+		return;
+
+	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
+					     &disk->state)))
+		goto out;
+
+	mutex_lock(&disk->open_mutex);
+	bdev_disk_changed(disk, false);
+	mutex_unlock(&disk->open_mutex);
+out:
+	ublk_put_disk(disk);
+}
+
 /*
  * Use this function to ensure that ->canceling is consistently set for
  * the device and all queues. Do not set these flags directly.
@@ -2041,7 +2048,7 @@ static void ublk_stop_dev(struct ublk_device *ub)
 	mutex_lock(&ub->mutex);
 	ublk_stop_dev_unlocked(ub);
 	mutex_unlock(&ub->mutex);
-	flush_work(&ub->partition_scan_work);
+	cancel_work_sync(&ub->partition_scan_work);
 	ublk_cancel_dev(ub);
 }
 

From ca22c566b89164f6e670af56ecc45f47ef3df819 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos <csander@purestorage.com>
Date: Thu, 8 Jan 2026 10:22:10 -0700
Subject: [PATCH 18/18] block: zero non-PI portion of auto integrity buffer

The auto-generated integrity buffer for writes needs to be fully
initialized before being passed to the underlying block device,
otherwise the uninitialized memory can be read back by userspace or
anyone with physical access to the storage device. If protection
information is generated, that portion of the integrity buffer is
already initialized. The integrity data is also zeroed if PI generation
is disabled via sysfs or the PI tuple size is 0. However, this misses
the case where PI is generated and the PI tuple size is nonzero, but the
metadata size is larger than the PI tuple. In this case, the remainder
("opaque") of the metadata is left uninitialized.
Generalize the BLK_INTEGRITY_CSUM_NONE check to cover any case when the
metadata is larger than just the PI tuple.

Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Fixes: c546d6f43833 ("block: only zero non-PI metadata tuples in bio_integrity_prep")
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity-auto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c
index 9850c338548d..cff025b06be1 100644
--- a/block/bio-integrity-auto.c
+++ b/block/bio-integrity-auto.c
@@ -140,7 +140,7 @@ bool bio_integrity_prep(struct bio *bio)
 				return true;
 			set_flags = false;
 			gfp |= __GFP_ZERO;
-		} else if (bi->csum_type == BLK_INTEGRITY_CSUM_NONE)
+		} else if (bi->metadata_size > bi->pi_tuple_size)
 			gfp |= __GFP_ZERO;
 		break;
 	default: