From 4cef2fcda3adabcf6937170d9b869bf72a6d9dc6 Mon Sep 17 00:00:00 2001 From: Tamir Duberstein Date: Mon, 22 Dec 2025 13:26:19 +0100 Subject: [PATCH 001/162] rnull: replace `kernel::c_str!` with C-Strings C-String literals were added in Rust 1.77. Replace instances of `kernel::c_str!` with C-String literals where possible. Signed-off-by: Tamir Duberstein Reviewed-by: Daniel Almeida Signed-off-by: Jens Axboe --- drivers/block/rnull/configfs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs index 6713a6d92391..2f5a7da03af5 100644 --- a/drivers/block/rnull/configfs.rs +++ b/drivers/block/rnull/configfs.rs @@ -25,7 +25,7 @@ pub(crate) fn subsystem() -> impl PinInit, E ], }; - kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {})) + kernel::configfs::Subsystem::new(c"rnull", item_type, try_pin_init!(Config {})) } #[pin_data] From e1418af7660f67abc7f6f0cf6867f3989aa45e9a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 21 Dec 2025 00:59:23 +0100 Subject: [PATCH 002/162] brd: replace simple_strtol with kstrtoul in ramdisk_size Replace simple_strtol() with the recommended kstrtoul() for parsing the 'ramdisk_size=' boot parameter. Unlike simple_strtol(), which returns a long, kstrtoul() converts the string directly to an unsigned long and avoids implicit casting. Check the return value of kstrtoul() and reject invalid values. This adds error handling while preserving behavior for existing values, and removes use of the deprecated simple_strtol() helper. The current code silently sets 'rd_size = 0' if parsing fails, instead of leaving the default value (CONFIG_BLK_DEV_RAM_SIZE) unchanged. 
Signed-off-by: Thorsten Blum Signed-off-by: Jens Axboe --- drivers/block/brd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 9778259b30d4..a5104cf96609 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -247,8 +247,7 @@ MODULE_ALIAS("rd"); /* Legacy boot options - nonmodular */ static int __init ramdisk_size(char *str) { - rd_size = simple_strtol(str, NULL, 0); - return 1; + return kstrtoul(str, 0, &rd_size) == 0; } __setup("ramdisk_size=", ramdisk_size); #endif From 9e371032cbf0c8fdc757df5510b55e824668b938 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 29 Dec 2025 11:26:07 +0100 Subject: [PATCH 003/162] null_blk: Constify struct configfs_item_operations and configfs_group_operations 'struct configfs_item_operations' and 'configfs_group_operations' are not modified in this driver. Constifying these structures moves some data to a read-only section, so increases overall security, especially when the structure holds some function pointers. 
On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 100263 37808 2752 140823 22617 drivers/block/null_blk/main.o After: ===== text data bss dec hex filename 100423 37648 2752 140823 22617 drivers/block/null_blk/main.o Signed-off-by: Christophe JAILLET Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index c7c0fb79a6bf..29a371f48b57 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -642,7 +642,7 @@ static void nullb_device_release(struct config_item *item) null_free_dev(dev); } -static struct configfs_item_operations nullb_device_ops = { +static const struct configfs_item_operations nullb_device_ops = { .release = nullb_device_release, }; @@ -739,7 +739,7 @@ static struct configfs_attribute *nullb_group_attrs[] = { NULL, }; -static struct configfs_group_operations nullb_group_ops = { +static const struct configfs_group_operations nullb_group_ops = { .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; From 483cbec3422399aca449265205be3aa83df281f6 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Fri, 5 Dec 2025 13:47:28 +0100 Subject: [PATCH 004/162] block/rnbd-proto: Handle PREFLUSH flag properly for IOs In RNBD client, for a WRITE request of size 0, with only the REQ_PREFLUSH bit set, while converting from bio_opf to rnbd_opf, we do REQ_OP_WRITE to RNBD_OP_WRITE, and then check if the rq is flush through function op_is_flush. That function checks both REQ_PREFLUSH and REQ_FUA flag, and if any of them is set, the RNBD_F_FUA is set. On the RNBD server side, while converting the RNBD flags to req flags, if the RNBD_F_FUA flag is set, we just set the REQ_FUA flag. This means we have lost the PREFLUSH flag, and added the REQ_FUA flag in its place. This commits adds a new RNBD_F_PREFLUSH flag, and also adds separate handling for REQ_PREFLUSH flag. 
On the server side, if the RNBD_F_PREFLUSH is present, the REQ_PREFLUSH is added to the bio. Since it is a change in the wire protocol, bump the minor version of protocol. The change is backwards compatible, and does not change the functionality if either the client or the server is running older/newer versions. If the client side is running the older version, both REQ_PREFLUSH and REQ_FUA is converted to RNBD_F_FUA. The server running newer one would still add only the REQ_FUA flag which is what happens when both client and server is running the older version. If the client side is running the newer version, just like before a RNBD_F_FUA is added, but now a RNBD_F_PREFLUSH is also added to the rnbd_opf. In case the server is running the older version the RNBD_F_PREFLUSH is ignored, and only the RNBD_F_FUA is processed. Signed-off-by: Md Haris Iqbal Reviewed-by: Jack Wang Reviewed-by: Florian-Ewald Mueller Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-proto.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index 77360c2a6069..5e74ae86169b 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -18,7 +18,7 @@ #include #define RNBD_PROTO_VER_MAJOR 2 -#define RNBD_PROTO_VER_MINOR 0 +#define RNBD_PROTO_VER_MINOR 1 /* The default port number the RTRS server is listening on. 
*/ #define RTRS_PORT 1234 @@ -197,6 +197,7 @@ struct rnbd_msg_io { * * @RNBD_F_SYNC: request is sync (sync write or read) * @RNBD_F_FUA: forced unit access + * @RNBD_F_PREFLUSH: request for cache flush */ enum rnbd_io_flags { @@ -211,6 +212,7 @@ enum rnbd_io_flags { /* Flags */ RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), + RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2) }; static inline u32 rnbd_op(u32 flags) @@ -258,6 +260,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) if (rnbd_opf & RNBD_F_FUA) bio_opf |= REQ_FUA; + if (rnbd_opf & RNBD_F_PREFLUSH) + bio_opf |= REQ_PREFLUSH; + return bio_opf; } @@ -297,6 +302,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) if (op_is_flush(rq->cmd_flags)) rnbd_opf |= RNBD_F_FUA; + if (rq->cmd_flags & REQ_PREFLUSH) + rnbd_opf |= RNBD_F_PREFLUSH; + return rnbd_opf; } From 581cf833cac4461d90ef5da4c5ef4475f440e489 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 5 Dec 2025 13:47:29 +0100 Subject: [PATCH 005/162] block: rnbd: add .release to rnbd_dev_ktype Every ktype must provide a .release function that will be called after the last kobject_put. 
Signed-off-by: Zhu Yanjun Reviewed-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 8 ++++++++ drivers/block/rnbd/rnbd-clt.c | 18 ++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 6ea7c12e3a87..144aea1466a4 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -475,9 +475,17 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) } } +static void rnbd_dev_release(struct kobject *kobj) +{ + struct rnbd_clt_dev *dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + kfree(dev); +} + static const struct kobj_type rnbd_dev_ktype = { .sysfs_ops = &kobj_sysfs_ops, .default_groups = rnbd_dev_groups, + .release = rnbd_dev_release, }; static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d1c354636315..094ecc174f41 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -60,7 +60,9 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); mutex_destroy(&dev->lock); - kfree(dev); + + if (dev->kobj.state_initialized) + kobject_put(&dev->kobj); } static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) @@ -1517,7 +1519,7 @@ static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev) return found; } -static void delete_dev(struct rnbd_clt_dev *dev) +static void rnbd_delete_dev(struct rnbd_clt_dev *dev) { struct rnbd_clt_session *sess = dev->sess; @@ -1638,7 +1640,7 @@ put_iu: kfree(rsp); rnbd_put_iu(sess, iu); del_dev: - delete_dev(dev); + rnbd_delete_dev(dev); put_dev: rnbd_clt_put_dev(dev); put_sess: @@ -1647,13 +1649,13 @@ put_sess: return ERR_PTR(ret); } -static void destroy_gen_disk(struct rnbd_clt_dev *dev) +static void rnbd_destroy_gen_disk(struct rnbd_clt_dev *dev) { 
del_gendisk(dev->gd); put_disk(dev->gd); } -static void destroy_sysfs(struct rnbd_clt_dev *dev, +static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev, const struct attribute *sysfs_self) { rnbd_clt_remove_dev_symlink(dev); @@ -1691,9 +1693,9 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, dev->dev_state = DEV_STATE_UNMAPPED; mutex_unlock(&dev->lock); - delete_dev(dev); - destroy_sysfs(dev, sysfs_self); - destroy_gen_disk(dev); + rnbd_delete_dev(dev); + rnbd_destroy_sysfs(dev, sysfs_self); + rnbd_destroy_gen_disk(dev); if (was_mapped && sess->rtrs) send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); From ef63e9ef76c801ac3081811fc6226ffb4c02453a Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Fri, 5 Dec 2025 13:47:30 +0100 Subject: [PATCH 006/162] block/rnbd-proto: Check and retain the NOUNMAP flag for requests The NOUNMAP flag is in combination with WRITE_ZEROES flag to indicate that the upper layers wants the sectors zeroed, but does not want it to get freed. This instruction is especially important for storage stacks which involves a layer capable of thin provisioning. This commit makes RNBD block device transfer and retain this NOUNMAP flag for requests, so it can be passed onto the backend device on the server side. Since it is a change in the wire protocol, bump the minor version of protocol. Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-proto.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index 5e74ae86169b..64f1cfe9f8ef 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -18,7 +18,7 @@ #include #define RNBD_PROTO_VER_MAJOR 2 -#define RNBD_PROTO_VER_MINOR 1 +#define RNBD_PROTO_VER_MINOR 2 /* The default port number the RTRS server is listening on. 
*/ #define RTRS_PORT 1234 @@ -198,6 +198,7 @@ struct rnbd_msg_io { * @RNBD_F_SYNC: request is sync (sync write or read) * @RNBD_F_FUA: forced unit access * @RNBD_F_PREFLUSH: request for cache flush + * @RNBD_F_NOUNMAP: do not free blocks when zeroing */ enum rnbd_io_flags { @@ -212,7 +213,8 @@ enum rnbd_io_flags { /* Flags */ RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), - RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2) + RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2), + RNBD_F_NOUNMAP = 1<<(RNBD_OP_BITS + 3) }; static inline u32 rnbd_op(u32 flags) @@ -247,6 +249,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) break; case RNBD_OP_WRITE_ZEROES: bio_opf = REQ_OP_WRITE_ZEROES; + + if (rnbd_opf & RNBD_F_NOUNMAP) + bio_opf |= REQ_NOUNMAP; break; default: WARN(1, "Unknown RNBD type: %d (flags %d)\n", @@ -285,6 +290,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) break; case REQ_OP_WRITE_ZEROES: rnbd_opf = RNBD_OP_WRITE_ZEROES; + + if (rq->cmd_flags & REQ_NOUNMAP) + rnbd_opf |= RNBD_F_NOUNMAP; break; case REQ_OP_FLUSH: rnbd_opf = RNBD_OP_FLUSH; From e1384543e85b11b494051d11728d6d88a93161bc Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 5 Dec 2025 13:47:31 +0100 Subject: [PATCH 007/162] rnbd-srv: fix the trace format for flags The __print_flags helper meant for bitmask, while the rnbd_rw_flags is mixed with bitmask and enum, to avoid confusion, just print the data as it is. 
Signed-off-by: Jack Wang Reviewed-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-trace.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h index 89d0bcb17195..18ae2ed5537a 100644 --- a/drivers/block/rnbd/rnbd-srv-trace.h +++ b/drivers/block/rnbd/rnbd-srv-trace.h @@ -44,24 +44,6 @@ DEFINE_EVENT(rnbd_srv_link_class, name, \ DEFINE_LINK_EVENT(create_sess); DEFINE_LINK_EVENT(destroy_sess); -TRACE_DEFINE_ENUM(RNBD_OP_READ); -TRACE_DEFINE_ENUM(RNBD_OP_WRITE); -TRACE_DEFINE_ENUM(RNBD_OP_FLUSH); -TRACE_DEFINE_ENUM(RNBD_OP_DISCARD); -TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE); -TRACE_DEFINE_ENUM(RNBD_F_SYNC); -TRACE_DEFINE_ENUM(RNBD_F_FUA); - -#define show_rnbd_rw_flags(x) \ - __print_flags(x, "|", \ - { RNBD_OP_READ, "READ" }, \ - { RNBD_OP_WRITE, "WRITE" }, \ - { RNBD_OP_FLUSH, "FLUSH" }, \ - { RNBD_OP_DISCARD, "DISCARD" }, \ - { RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \ - { RNBD_F_SYNC, "SYNC" }, \ - { RNBD_F_FUA, "FUA" }) - TRACE_EVENT(process_rdma, TP_PROTO(struct rnbd_srv_session *srv, const struct rnbd_msg_io *msg, @@ -97,7 +79,7 @@ TRACE_EVENT(process_rdma, __entry->usrlen = usrlen; ), - TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu", + TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %u, ioprio: %d, datalen: %u, usrlen: %zu", __get_str(sessname), __print_symbolic(__entry->dir, { READ, "READ" }, @@ -106,7 +88,7 @@ TRACE_EVENT(process_rdma, __entry->device_id, __entry->sector, __entry->bi_size, - show_rnbd_rw_flags(__entry->flags), + __entry->flags, __entry->ioprio, __entry->datalen, __entry->usrlen From 4ac9690d4b9456ca1d5276d86547fa2e7cd47684 Mon Sep 17 00:00:00 2001 From: Florian-Ewald Mueller Date: Fri, 5 Dec 2025 13:47:32 +0100 Subject: [PATCH 008/162] rnbd-srv: 
Fix server side setting of bi_size for special IOs On rnbd-srv, the bi_size of the bio is set during the bio_add_page function, to which datalen is passed. But for special IOs like DISCARD and WRITE_ZEROES, datalen is 0, since there is no data to write. For these special IOs, use the bi_size of the rnbd_msg_io. Fixes: f6f84be089c9 ("block/rnbd-srv: Add sanity check and remove redundant assignment") Signed-off-by: Florian-Ewald Mueller Signed-off-by: Md Haris Iqbal Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 2df8941a6b14..9b3fdc202e15 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -145,18 +145,30 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, priv->sess_dev = sess_dev; priv->id = id; - bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1, + bio = bio_alloc(file_bdev(sess_dev->bdev_file), !!datalen, rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); - bio_add_virt_nofail(bio, data, datalen); - - bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); - if (bio_has_data(bio) && - bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { - rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", - bio->bi_iter.bi_size, msg->bi_size); - err = -EINVAL; - goto bio_put; + if (unlikely(!bio)) { + err = -ENOMEM; + goto put_sess_dev; } + + if (!datalen) { + /* + * For special requests like DISCARD and WRITE_ZEROES, the datalen is zero. 
+ */ + bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); + } else { + bio_add_virt_nofail(bio, data, datalen); + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); + if (bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { + rnbd_srv_err_rl(sess_dev, + "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", + bio->bi_iter.bi_size, msg->bi_size); + err = -EINVAL; + goto bio_put; + } + } + bio->bi_end_io = rnbd_dev_bi_end_io; bio->bi_private = priv; bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); @@ -170,6 +182,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio_put: bio_put(bio); +put_sess_dev: rnbd_put_sess_dev(sess_dev); err: kfree(priv); From 69d26698e4fd44935510553809007151b2fe4db5 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Fri, 5 Dec 2025 13:47:33 +0100 Subject: [PATCH 009/162] rnbd-srv: Zero the rsp buffer before using it Before using the data buffer to send back the response message, zero it completely. This prevents any stray bytes from being picked up by the client side when the message is exchanged between different protocol versions. 
Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Signed-off-by: Grzegorz Prajsner Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 9b3fdc202e15..7eeb321d6140 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -551,6 +551,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, { struct block_device *bdev = file_bdev(sess_dev->bdev_file); + memset(rsp, 0, sizeof(*rsp)); + rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); rsp->device_id = cpu_to_le32(sess_dev->device_id); rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev)); @@ -657,6 +659,7 @@ static void process_msg_sess_info(struct rnbd_srv_session *srv_sess, trace_process_msg_sess_info(srv_sess, sess_info_msg); + memset(rsp, 0, sizeof(*rsp)); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; } From 073b9bf9af463d32555c5ebaf7e28c3a44c715d0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Dec 2025 11:41:23 +0200 Subject: [PATCH 010/162] nvme-pci: Use size_t for length fields to handle larger sizes This patch changes the length variables from unsigned int to size_t. Using size_t ensures that we can handle larger sizes, as size_t is always equal to or larger than the previously used u32 type. Originally, u32 was used because blk-mq-dma code evolved from scatter-gather implementation, which uses unsigned int to describe length. This change will also allow us to reuse the existing struct phys_vec in places that don't need scatter-gather. 
Signed-off-by: Leon Romanovsky Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 8 ++++++-- drivers/nvme/host/pci.c | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index fb018fffffdc..a2bedc8f8666 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -8,7 +8,7 @@ struct phys_vec { phys_addr_t paddr; - u32 len; + size_t len; }; static bool __blk_map_iter_next(struct blk_map_iter *iter) @@ -112,8 +112,8 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, struct phys_vec *vec) { enum dma_data_direction dir = rq_dma_dir(req); - unsigned int mapped = 0; unsigned int attrs = 0; + size_t mapped = 0; int error; iter->addr = state->addr; @@ -297,6 +297,8 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, blk_rq_map_iter_init(rq, &iter); while (blk_map_iter_next(rq, &iter, &vec)) { *last_sg = blk_next_sg(last_sg, sglist); + + WARN_ON_ONCE(overflows_type(vec.len, unsigned int)); sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); nsegs++; @@ -417,6 +419,8 @@ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist) while (blk_map_iter_next(rq, &iter, &vec)) { sg = blk_next_sg(&sg, sglist); + + WARN_ON_ONCE(overflows_type(vec.len, unsigned int)); sg_set_page(sg, phys_to_page(vec.paddr), vec.len, offset_in_page(vec.paddr)); segments++; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0e4caeab739c..3b528369f545 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -290,14 +290,14 @@ struct nvme_iod { u8 flags; u8 nr_descriptors; - unsigned int total_len; + size_t total_len; struct dma_iova_state dma_state; void *descriptors[NVME_MAX_NR_DESCRIPTORS]; struct nvme_dma_vec *dma_vecs; unsigned int nr_dma_vecs; dma_addr_t meta_dma; - unsigned int meta_total_len; + size_t meta_total_len; struct dma_iova_state meta_dma_state; struct nvme_sgl_desc 
*meta_descriptor; }; From fcf463b92a08686d1aeb1e66674a72eb7a8bfb9b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Dec 2025 11:41:24 +0200 Subject: [PATCH 011/162] types: move phys_vec definition to common header Move the struct phys_vec definition from block/blk-mq-dma.c to include/linux/types.h to make it available for use across the kernel. The phys_vec structure represents a physical address range with a length, which is used by the new physical address-based DMA mapping API. This structure is already used by the block layer and will be needed for DMA phys API users. Moving this definition to types.h provides a centralized location for this common data structure and eliminates code duplication across subsystems that need to work with physical address ranges. Signed-off-by: Leon Romanovsky Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 5 ----- include/linux/types.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index a2bedc8f8666..752060d7261c 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -6,11 +6,6 @@ #include #include "blk.h" -struct phys_vec { - phys_addr_t paddr; - size_t len; -}; - static bool __blk_map_iter_next(struct blk_map_iter *iter) { if (iter->iter.bi_size) diff --git a/include/linux/types.h b/include/linux/types.h index d4437e9c452c..d673747eda8a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -171,6 +171,11 @@ typedef u64 phys_addr_t; typedef u32 phys_addr_t; #endif +struct phys_vec { + phys_addr_t paddr; + size_t len; +}; + typedef phys_addr_t resource_size_t; /* From ee623c892aa59003fca173de0041abc2ccc2c72d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2025 11:00:55 +0800 Subject: [PATCH 012/162] block: use bvec iterator helper for bio_may_need_split() bio_may_need_split() uses bi_vcnt to determine if a bio has a single segment, but bi_vcnt is unreliable for cloned bios. 
Cloned bios share the parent's bi_io_vec array but iterate over a subset via bi_iter, so bi_vcnt may not reflect the actual segment count being iterated. Replace the bi_vcnt check with bvec iterator access via __bvec_iter_bvec(), comparing bi_iter.bi_size against the current bvec's length. This correctly handles both cloned and non-cloned bios. Move bi_io_vec into the first cache line adjacent to bi_iter. This is a sensible layout since bi_io_vec and bi_iter are commonly accessed together throughout the block layer - every bvec iteration requires both fields. This displaces bi_end_io to the second cache line, which is acceptable since bi_end_io and bi_private are always fetched together in bio_endio() anyway. The struct layout change requires bio_reset() to preserve and restore bi_io_vec across the memset, since it now falls within BIO_RESET_BYTES. Nitesh verified that this patch doesn't regress NVMe 512-byte IO perf [1]. Link: https://lore.kernel.org/linux-block/20251220081607.tvnrltcngl3cc2fh@green245.gost/ [1] Signed-off-by: Ming Lei Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- block/bio.c | 3 +++ block/blk.h | 12 +++++++++--- include/linux/blk_types.h | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/block/bio.c b/block/bio.c index e726c0e280a8..0e936288034e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -301,9 +301,12 @@ EXPORT_SYMBOL(bio_init); */ void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { + struct bio_vec *bv = bio->bi_io_vec; + bio_uninit(bio); memset(bio, 0, BIO_RESET_BYTES); atomic_set(&bio->__bi_remaining, 1); + bio->bi_io_vec = bv; bio->bi_bdev = bdev; if (bio->bi_bdev) bio_associate_blkg(bio); diff --git a/block/blk.h b/block/blk.h index e4c433f62dfc..98f4dfd4ec75 100644 --- a/block/blk.h +++ b/block/blk.h @@ -371,12 +371,18 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { + const struct bio_vec 
*bv; + if (lim->chunk_sectors) return true; - if (bio->bi_vcnt != 1) + + if (!bio->bi_io_vec) return true; - return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > - lim->max_fast_segment_size; + + bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + if (bio->bi_iter.bi_size > bv->bv_len) + return true; + return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size; } /** diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5dc061d318a4..19a888a2f104 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -232,6 +232,8 @@ struct bio { atomic_t __bi_remaining; + /* The actual vec list, preserved by bio_reset() */ + struct bio_vec *bi_io_vec; struct bvec_iter bi_iter; union { @@ -275,8 +277,6 @@ struct bio { atomic_t __bi_cnt; /* pin count */ - struct bio_vec *bi_io_vec; /* the actual vec list */ - struct bio_set *bi_pool; }; From 641864314866dff382f64cd8b52fd6bf4c4d84f6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2025 11:00:56 +0800 Subject: [PATCH 013/162] block: don't initialize bi_vcnt for cloned bio in bio_iov_bvec_set() bio_iov_bvec_set() creates a cloned bio that borrows a bvec array from an iov_iter. For cloned bios, bi_vcnt is meaningless because iteration is controlled entirely by bi_iter (bi_idx, bi_size, bi_bvec_done), not by bi_vcnt. Remove the incorrect bi_vcnt assignment. Explicitly initialize bi_iter.bi_idx to 0 to ensure iteration starts at the first bvec. While bi_idx is typically already zero from bio initialization, making this explicit improves clarity and correctness. This change also avoids accessing iter->nr_segs, which is an iov_iter implementation detail that block code should not depend on. 
Signed-off-by: Ming Lei Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 0e936288034e..2359c0723b88 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1165,8 +1165,8 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); - bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_idx = 0; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = iov_iter_count(iter); bio_set_flag(bio, BIO_CLONED); From 15f506a77ad61ac3273ade9b7ef87af9bdba22ad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 31 Dec 2025 11:00:57 +0800 Subject: [PATCH 014/162] io_uring: remove nr_segs recalculation in io_import_kbuf() io_import_kbuf() recalculates iter->nr_segs to reflect only the bvecs needed for the requested byte range. This was added to provide an accurate segment count to bio_iov_bvec_set(), which copied nr_segs to bio->bi_vcnt for use as a bio split hint. The previous two patches eliminated this dependency: - bio_may_need_split() now uses bi_iter instead of bi_vcnt for split decisions - bio_iov_bvec_set() no longer copies nr_segs to bi_vcnt Since nr_segs is no longer used for bio split decisions, the recalculation loop is unnecessary. The iov_iter already has the correct bi_size to cap iteration, so an oversized nr_segs is harmless. 
Link: https://lkml.org/lkml/2025/4/16/351 Signed-off-by: Ming Lei Reviewed-by: Nitesh Shetty Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 41c89f5c616d..ee6283676ba7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1055,17 +1055,6 @@ static int io_import_kbuf(int ddir, struct iov_iter *iter, iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); iov_iter_advance(iter, offset); - - if (count < imu->len) { - const struct bio_vec *bvec = iter->bvec; - - len += iter->iov_offset; - while (len > bvec->bv_len) { - len -= bvec->bv_len; - bvec++; - } - iter->nr_segs = 1 + bvec - iter->bvec; - } return 0; } From a31bde687b10b1a3db9c61eba5abb662dda15277 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 10:22:12 -0700 Subject: [PATCH 015/162] block: use pi_tuple_size in bi_offload_capable() bi_offload_capable() returns whether a block device's metadata size matches its PI tuple size. Use pi_tuple_size instead of switching on csum_type. This makes the code considerably simpler and less branchy. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 9850c338548d..eb95e29c93bc 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -52,19 +52,7 @@ static bool bip_should_check(struct bio_integrity_payload *bip) static bool bi_offload_capable(struct blk_integrity *bi) { - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - return bi->metadata_size == sizeof(struct crc64_pi_tuple); - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - return bi->metadata_size == sizeof(struct t10_pi_tuple); - default: - pr_warn_once("%s: unknown integrity checksum type:%d\n", - __func__, bi->csum_type); - fallthrough; - case BLK_INTEGRITY_CSUM_NONE: - return false; - } + return bi->metadata_size == bi->pi_tuple_size; } /** From f7ba87dfa8e42642d43faf29a71cee338086218b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 10 Jan 2026 21:42:36 +0800 Subject: [PATCH 016/162] block: account for bi_bvec_done in bio_may_need_split() When checking if a bio fits in a single segment, bio_may_need_split() compares bi_size against the current bvec's bv_len. However, for partially consumed bvecs (bi_bvec_done > 0), such as in cloned or split bios, the remaining bytes in the current bvec is actually (bv_len - bi_bvec_done), not bv_len. This could cause bio_may_need_split() to incorrectly return false, leading to nr_phys_segments being set to 1 when the bio actually spans multiple segments. This triggers the WARN_ON in __blk_rq_map_sg() when the actual mapped segments exceed the expected count. Fix by subtracting bi_bvec_done from bv_len in the comparison. 
Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/linux-block/9687cf2b-1f32-44e1-b58d-2492dc6e7185@linux.ibm.com/ Reported-and-bisected-by: Christoph Hellwig Tested-by: Venkat Rao Bagalkote Tested-by: Christoph Hellwig Fixes: ee623c892aa5 ("block: use bvec iterator helper for bio_may_need_split()") Cc: Nitesh Shetty Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index 98f4dfd4ec75..980eef1f5690 100644 --- a/block/blk.h +++ b/block/blk.h @@ -380,7 +380,7 @@ static inline bool bio_may_need_split(struct bio *bio, return true; bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - if (bio->bi_iter.bi_size > bv->bv_len) + if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done) return true; return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size; } From c22756a9978e8f5917ff41cf17fc8db00d09e776 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:41 +0100 Subject: [PATCH 017/162] fscrypt: pass a real sector_t to fscrypt_zeroout_range_inline_crypt While the pblk argument to fscrypt_zeroout_range_inline_crypt is declared as a sector_t it actually is interpreted as a logical block size unit, which is highly unusual. Switch to passing the 512 byte units that sector_t is defined for. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- fs/crypto/bio.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 5f5599020e94..68b0424d879a 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -48,7 +48,7 @@ bool fscrypt_decrypt_bio(struct bio *bio) EXPORT_SYMBOL(fscrypt_decrypt_bio); static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, - pgoff_t lblk, sector_t pblk, + pgoff_t lblk, sector_t sector, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; @@ -67,8 +67,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, if (num_pages == 0) { fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio->bi_iter.bi_sector = - pblk << (blockbits - SECTOR_SHIFT); + bio->bi_iter.bi_sector = sector; } ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); if (WARN_ON_ONCE(ret != bytes_this_page)) { @@ -78,7 +77,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, num_pages++; len -= blocks_this_page; lblk += blocks_this_page; - pblk += blocks_this_page; + sector += (bytes_this_page >> SECTOR_SHIFT); if (num_pages == BIO_MAX_VECS || !len || !fscrypt_mergeable_bio(bio, inode, lblk)) { err = submit_bio_wait(bio); @@ -132,7 +131,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, return 0; if (fscrypt_inode_uses_inline_crypto(inode)) - return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk, + return fscrypt_zeroout_range_inline_crypt(inode, lblk, sector, len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); From bc26e2efa2c5bb9289fa894834446840dea0bc31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:42 +0100 Subject: [PATCH 018/162] fscrypt: keep multiple bios in flight in fscrypt_zeroout_range_inline_crypt This should slightly improve performance for large zeroing operations, but more importantly prepares for blk-crypto 
refactoring that requires all fscrypt users to call submit_bio directly. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- fs/crypto/bio.c | 86 +++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68b0424d879a..c2b3ca100f8d 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -47,49 +47,71 @@ bool fscrypt_decrypt_bio(struct bio *bio) } EXPORT_SYMBOL(fscrypt_decrypt_bio); +struct fscrypt_zero_done { + atomic_t pending; + blk_status_t status; + struct completion done; +}; + +static void fscrypt_zeroout_range_done(struct fscrypt_zero_done *done) +{ + if (atomic_dec_and_test(&done->pending)) + complete(&done->done); +} + +static void fscrypt_zeroout_range_end_io(struct bio *bio) +{ + struct fscrypt_zero_done *done = bio->bi_private; + + if (bio->bi_status) + cmpxchg(&done->status, 0, bio->bi_status); + fscrypt_zeroout_range_done(done); + bio_put(bio); +} + static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, pgoff_t lblk, sector_t sector, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); - struct bio *bio; - int ret, err = 0; - int num_pages = 0; - - /* This always succeeds since __GFP_DIRECT_RECLAIM is set. 
*/ - bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, - GFP_NOFS); + struct fscrypt_zero_done done = { + .pending = ATOMIC_INIT(1), + .done = COMPLETION_INITIALIZER_ONSTACK(done.done), + }; while (len) { - unsigned int blocks_this_page = min(len, blocks_per_page); - unsigned int bytes_this_page = blocks_this_page << blockbits; + struct bio *bio; + unsigned int n; - if (num_pages == 0) { - fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); - bio->bi_iter.bi_sector = sector; - } - ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); - if (WARN_ON_ONCE(ret != bytes_this_page)) { - err = -EIO; - goto out; - } - num_pages++; - len -= blocks_this_page; - lblk += blocks_this_page; - sector += (bytes_this_page >> SECTOR_SHIFT); - if (num_pages == BIO_MAX_VECS || !len || - !fscrypt_mergeable_bio(bio, inode, lblk)) { - err = submit_bio_wait(bio); - if (err) - goto out; - bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); - num_pages = 0; + bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE, + GFP_NOFS); + bio->bi_iter.bi_sector = sector; + bio->bi_private = &done; + bio->bi_end_io = fscrypt_zeroout_range_end_io; + fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + + for (n = 0; n < BIO_MAX_VECS; n++) { + unsigned int blocks_this_page = + min(len, blocks_per_page); + unsigned int bytes_this_page = blocks_this_page << blockbits; + + __bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); + len -= blocks_this_page; + lblk += blocks_this_page; + sector += (bytes_this_page >> SECTOR_SHIFT); + if (!len || !fscrypt_mergeable_bio(bio, inode, lblk)) + break; } + + atomic_inc(&done.pending); + submit_bio(bio); } -out: - bio_put(bio); - return err; + + fscrypt_zeroout_range_done(&done); + + wait_for_completion(&done.done); + return blk_status_to_errno(done.status); } /** From a3cc978e61f5c909ca94a38d2daeeddc051a18e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:43 +0100 Subject: [PATCH 019/162] blk-crypto: add a 
bio_crypt_ctx() helper This returns the bio_crypt_ctx if CONFIG_BLK_INLINE_ENCRYPTION is enabled and a crypto context is attached to the bio, else NULL. The use case is to allow safely dereferencing the context in common code without needed #ifdef CONFIG_BLK_INLINE_ENCRYPTION. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 58b0c5254a67..eb80df19be68 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -132,6 +132,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return bio->bi_crypt_context; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return bio->bi_crypt_context; +} + void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); @@ -169,6 +174,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio) return false; } +static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) +{ + return NULL; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); From aefc2a1fa2edc2a486aaf857e48b3fd13062b0eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:44 +0100 Subject: [PATCH 020/162] blk-crypto: submit the encrypted bio in blk_crypto_fallback_bio_prep Restructure blk_crypto_fallback_bio_prep so that it always submits the encrypted bio instead of passing it back to the caller, which allows to simplify the calling conventions for blk_crypto_fallback_bio_prep and blk_crypto_bio_prep so that they never have to return a bio, and can use a true return value to indicate that the caller should submit the bio, and false that the blk-crypto code consumed it. 
The submission is handled by the on-stack bio list in the current task_struct by the block layer and does not cause additional stack usage or major overhead. It also prepares for the following optimization and fixes for the blk-crypto fallback write path. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-crypto-fallback.c | 70 +++++++++++++++++-------------------- block/blk-crypto-internal.h | 19 ++++------ block/blk-crypto.c | 53 ++++++++++++++-------------- 4 files changed, 67 insertions(+), 77 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8387fe50ea15..f87e5f1a101f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -628,7 +628,7 @@ static void __submit_bio(struct bio *bio) /* If plug is not used, add new plug here to cache nsecs time. */ struct blk_plug plug; - if (unlikely(!blk_crypto_bio_prep(&bio))) + if (unlikely(!blk_crypto_bio_prep(bio))) return; blk_start_plug(&plug); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 86b27f96051a..cc9e90be23b7 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -250,14 +250,14 @@ static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], /* * The crypto API fallback's encryption routine. - * Allocate a bounce bio for encryption, encrypt the input bio using crypto API, - * and replace *bio_ptr with the bounce bio. May split input bio if it's too - * large. Returns true on success. Returns false and sets bio->bi_status on - * error. + * + * Allocate one or more bios for encryption, encrypt the input bio using the + * crypto API, and submit the encrypted bios. 
Sets bio->bi_status and + * completes the source bio on error */ -static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) +static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) { - struct bio *src_bio, *enc_bio; + struct bio *enc_bio; struct bio_crypt_ctx *bc; struct blk_crypto_keyslot *slot; int data_unit_size; @@ -267,14 +267,12 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) struct scatterlist src, dst; union blk_crypto_iv iv; unsigned int i, j; - bool ret = false; blk_status_t blk_st; /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) - return false; + if (!blk_crypto_fallback_split_bio_if_needed(&src_bio)) + goto out_endio; - src_bio = *bio_ptr; bc = src_bio->bi_crypt_context; data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; @@ -282,7 +280,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; - return false; + goto out_endio; } /* @@ -345,25 +343,23 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) enc_bio->bi_private = src_bio; enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; - *bio_ptr = enc_bio; - ret = true; - - enc_bio = NULL; - goto out_free_ciph_req; + skcipher_request_free(ciph_req); + blk_crypto_put_keyslot(slot); + submit_bio(enc_bio); + return; out_free_bounce_pages: while (i > 0) mempool_free(enc_bio->bi_io_vec[--i].bv_page, blk_crypto_bounce_page_pool); -out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: blk_crypto_put_keyslot(slot); out_put_enc_bio: - if (enc_bio) - bio_uninit(enc_bio); + bio_uninit(enc_bio); kfree(enc_bio); - return ret; +out_endio: + bio_endio(src_bio); } /* @@ -466,44 +462,44 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) /** * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption + * @bio: bio to prepare * - * 
@bio_ptr: pointer to the bio to prepare + * If bio is doing a WRITE operation, allocate one or more bios to contain the + * encrypted payload and submit them. * - * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a - * bounce bio for the first part, encrypts it, and updates bio_ptr to point to - * the bounce bio. - * - * For a READ operation, we mark the bio for decryption by using bi_private and + * For a READ operation, mark the bio for decryption by using bi_private and * bi_end_io. * - * In either case, this function will make the bio look like a regular bio (i.e. - * as if no encryption context was ever specified) for the purposes of the rest - * of the stack except for blk-integrity (blk-integrity and blk-crypto are not - * currently supported together). + * In either case, this function will make the submitted bio(s) look like + * regular bios (i.e. as if no encryption context was ever specified) for the + * purposes of the rest of the stack except for blk-integrity (blk-integrity and + * blk-crypto are not currently supported together). * - * Return: true on success. Sets bio->bi_status and returns false on error. + * Return: true if @bio should be submitted to the driver by the caller, else + * false. Sets bio->bi_status, calls bio_endio and returns false on error. 
*/ -bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +bool blk_crypto_fallback_bio_prep(struct bio *bio) { - struct bio *bio = *bio_ptr; struct bio_crypt_ctx *bc = bio->bi_crypt_context; struct bio_fallback_crypt_ctx *f_ctx; if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) { /* User didn't call blk_crypto_start_using_key() first */ - bio->bi_status = BLK_STS_IOERR; + bio_io_error(bio); return false; } if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile, &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); return false; } - if (bio_data_dir(bio) == WRITE) - return blk_crypto_fallback_encrypt_bio(bio_ptr); + if (bio_data_dir(bio) == WRITE) { + blk_crypto_fallback_encrypt_bio(bio); + return false; + } /* * bio READ case: Set up a f_ctx in the bio's bi_private and set the diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index ccf6dff6ff6b..d65023120341 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -165,11 +165,11 @@ static inline void bio_crypt_do_front_merge(struct request *rq, #endif } -bool __blk_crypto_bio_prep(struct bio **bio_ptr); -static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) +bool __blk_crypto_bio_prep(struct bio *bio); +static inline bool blk_crypto_bio_prep(struct bio *bio) { - if (bio_has_crypt_ctx(*bio_ptr)) - return __blk_crypto_bio_prep(bio_ptr); + if (bio_has_crypt_ctx(bio)) + return __blk_crypto_bio_prep(bio); return true; } @@ -215,12 +215,12 @@ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, return 0; } +bool blk_crypto_fallback_bio_prep(struct bio *bio); + #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); -bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); - int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ @@ -232,13 +232,6 @@ 
blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) return -ENOPKG; } -static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) -{ - pr_warn_once("crypto API fallback disabled; failing request.\n"); - (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; - return false; -} - static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 3e7bf1974cbd..69e869d1c9bd 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -260,54 +260,55 @@ void __blk_crypto_free_request(struct request *rq) /** * __blk_crypto_bio_prep - Prepare bio for inline encryption - * - * @bio_ptr: pointer to original bio pointer + * @bio: bio to prepare * * If the bio crypt context provided for the bio is supported by the underlying * device's inline encryption hardware, do nothing. * * Otherwise, try to perform en/decryption for this bio by falling back to the - * kernel crypto API. When the crypto API fallback is used for encryption, - * blk-crypto may choose to split the bio into 2 - the first one that will - * continue to be processed and the second one that will be resubmitted via - * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents - * of the aforementioned "first one", and *bio_ptr will be updated to this - * bounce bio. + * kernel crypto API. For encryption this means submitting newly allocated + * bios for the encrypted payload while keeping back the source bio until they + * complete, while for reads the decryption happens in-place by a hooked in + * completion handler. * * Caller must ensure bio has bio_crypt_ctx. * - * Return: true on success; false on error (and bio->bi_status will be set - * appropriately, and bio_endio() will have been called so bio - * submission should abort). + * Return: true if @bio should be submitted to the driver by the caller, else + * false. Sets bio->bi_status, calls bio_endio and returns false on error. 
*/ -bool __blk_crypto_bio_prep(struct bio **bio_ptr) +bool __blk_crypto_bio_prep(struct bio *bio) { - struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + struct block_device *bdev = bio->bi_bdev; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { - bio->bi_status = BLK_STS_IOERR; - goto fail; + bio_io_error(bio); + return false; } if (!bio_crypt_check_alignment(bio)) { bio->bi_status = BLK_STS_INVAL; - goto fail; + bio_endio(bio); + return false; } /* - * Success if device supports the encryption context, or if we succeeded - * in falling back to the crypto API. + * If the device does not natively support the encryption context, try to use + * the fallback if available. */ - if (blk_crypto_config_supported_natively(bio->bi_bdev, - &bc_key->crypto_cfg)) - return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) - return true; -fail: - bio_endio(*bio_ptr); - return false; + if (!blk_crypto_config_supported_natively(bdev, &bc_key->crypto_cfg)) { + if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) { + pr_warn_once("%pg: crypto API fallback disabled; failing request.\n", + bdev); + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + return false; + } + return blk_crypto_fallback_bio_prep(bio); + } + + return true; } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, From b37fbce460ad60b0c4449c1c7566cf24f3016713 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:45 +0100 Subject: [PATCH 021/162] blk-crypto: optimize bio splitting in blk_crypto_fallback_encrypt_bio The current code in blk_crypto_fallback_encrypt_bio is inefficient and prone to deadlocks under memory pressure: It first walks the passed in plaintext bio to see how much of it can fit into a single encrypted bio using up to BIO_MAX_VEC PAGE_SIZE segments, and then allocates a plaintext clone that fits the size, only to allocate another bio for the ciphertext later. 
While the plaintext clone uses a bioset to avoid deadlocks when allocations could fail, the ciphertext one uses bio_kmalloc which is a no-go in the file system I/O path. Switch blk_crypto_fallback_encrypt_bio to walk the source plaintext bio while consuming bi_iter without cloning it, and instead allocate a ciphertext bio at the beginning and whenever we fill up the previous one. The existing bio_set for the plaintext clones is reused for the ciphertext bios to remove the deadlock risk. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 181 +++++++++++++++--------------------- 1 file changed, 76 insertions(+), 105 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index cc9e90be23b7..4ec7da342280 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -81,7 +81,7 @@ static struct blk_crypto_fallback_keyslot { static struct blk_crypto_profile *blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; -static struct bio_set crypto_bio_split; +static struct bio_set enc_bio_set; /* * This is the key we set when evicting a keyslot. 
This *should* be the all 0's @@ -150,37 +150,29 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) mempool_free(enc_bio->bi_io_vec[i].bv_page, blk_crypto_bounce_page_pool); - src_bio->bi_status = enc_bio->bi_status; + if (enc_bio->bi_status) + cmpxchg(&src_bio->bi_status, 0, enc_bio->bi_status); - bio_uninit(enc_bio); - kfree(enc_bio); + bio_put(enc_bio); bio_endio(src_bio); } -static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src, + unsigned int nr_segs) { - unsigned int nr_segs = bio_segments(bio_src); - struct bvec_iter iter; - struct bio_vec bv; struct bio *bio; - bio = bio_kmalloc(nr_segs, GFP_NOIO); - if (!bio) - return NULL; - bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf); + bio = bio_alloc_bioset(bio_src->bi_bdev, nr_segs, bio_src->bi_opf, + GFP_NOIO, &enc_bio_set); if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); + bio->bi_private = bio_src; + bio->bi_end_io = blk_crypto_fallback_encrypt_endio; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; - bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - bio_clone_blkg_association(bio, bio_src); - return bio; } @@ -208,32 +200,6 @@ blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, return true; } -static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) -{ - struct bio *bio = *bio_ptr; - unsigned int i = 0; - unsigned int num_sectors = 0; - struct bio_vec bv; - struct bvec_iter iter; - - bio_for_each_segment(bv, bio, iter) { - num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_VECS) - break; - } - - if (num_sectors < bio_sectors(bio)) { - bio = bio_submit_split_bioset(bio, num_sectors, - &crypto_bio_split); - if 
(!bio) - return false; - - *bio_ptr = bio; - } - - return true; -} - union blk_crypto_iv { __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; u8 bytes[BLK_CRYPTO_MAX_IV_SIZE]; @@ -257,46 +223,35 @@ static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], */ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) { - struct bio *enc_bio; - struct bio_crypt_ctx *bc; - struct blk_crypto_keyslot *slot; - int data_unit_size; + struct bio_crypt_ctx *bc = src_bio->bi_crypt_context; + int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; struct skcipher_request *ciph_req = NULL; + struct blk_crypto_keyslot *slot; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; struct scatterlist src, dst; union blk_crypto_iv iv; - unsigned int i, j; - blk_status_t blk_st; - - /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_fallback_split_bio_if_needed(&src_bio)) - goto out_endio; - - bc = src_bio->bi_crypt_context; - data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; - - /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_fallback_clone_bio(src_bio); - if (!enc_bio) { - src_bio->bi_status = BLK_STS_RESOURCE; - goto out_endio; - } + unsigned int nr_enc_pages, enc_idx; + struct bio *enc_bio; + blk_status_t status; + unsigned int i; /* * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for * this bio's algorithm and key. 
*/ - blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, bc->bc_key, &slot); - if (blk_st != BLK_STS_OK) { - src_bio->bi_status = blk_st; - goto out_put_enc_bio; + if (status != BLK_STS_OK) { + src_bio->bi_status = status; + bio_endio(src_bio); + return; } /* and then allocate an skcipher_request for it */ if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; + bio_endio(src_bio); goto out_release_keyslot; } @@ -307,59 +262,75 @@ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size, iv.bytes); - /* Encrypt each page in the bounce bio */ - for (i = 0; i < enc_bio->bi_vcnt; i++) { - struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i]; - struct page *plaintext_page = enc_bvec->bv_page; - struct page *ciphertext_page = - mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO); + /* + * Encrypt each page in the source bio. Because the source bio could + * have bio_vecs that span more than a single page, but the encrypted + * bios are limited to a single page per bio_vec, this can generate + * more than a single encrypted bio per source bio. 
+ */ +new_bio: + nr_enc_pages = min(bio_segments(src_bio), BIO_MAX_VECS); + enc_bio = blk_crypto_alloc_enc_bio(src_bio, nr_enc_pages); + enc_idx = 0; + for (;;) { + struct bio_vec src_bv = + bio_iter_iovec(src_bio, src_bio->bi_iter); + struct page *enc_page; - enc_bvec->bv_page = ciphertext_page; + enc_page = mempool_alloc(blk_crypto_bounce_page_pool, + GFP_NOIO); + __bio_add_page(enc_bio, enc_page, src_bv.bv_len, + src_bv.bv_offset); - if (!ciphertext_page) { - src_bio->bi_status = BLK_STS_RESOURCE; - goto out_free_bounce_pages; - } + sg_set_page(&src, src_bv.bv_page, data_unit_size, + src_bv.bv_offset); + sg_set_page(&dst, enc_page, data_unit_size, src_bv.bv_offset); - sg_set_page(&src, plaintext_page, data_unit_size, - enc_bvec->bv_offset); - sg_set_page(&dst, ciphertext_page, data_unit_size, - enc_bvec->bv_offset); + /* + * Increment the index now that the encrypted page is added to + * the bio. This is important for the error unwind path. + */ + enc_idx++; - /* Encrypt each data unit in this page */ - for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) { + /* + * Encrypt each data unit in this page. + */ + for (i = 0; i < src_bv.bv_len; i += data_unit_size) { blk_crypto_dun_to_iv(curr_dun, &iv); if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), &wait)) { - i++; - src_bio->bi_status = BLK_STS_IOERR; - goto out_free_bounce_pages; + bio_io_error(enc_bio); + goto out_free_request; } bio_crypt_dun_increment(curr_dun, 1); src.offset += data_unit_size; dst.offset += data_unit_size; } + + bio_advance_iter_single(src_bio, &src_bio->bi_iter, + src_bv.bv_len); + if (!src_bio->bi_iter.bi_size) + break; + + if (enc_idx == nr_enc_pages) { + /* + * For each additional encrypted bio submitted, + * increment the source bio's remaining count. Each + * encrypted bio's completion handler calls bio_endio on + * the source bio, so this keeps the source bio from + * completing until the last encrypted bio does. 
+ */ + bio_inc_remaining(src_bio); + submit_bio(enc_bio); + goto new_bio; + } } - enc_bio->bi_private = src_bio; - enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; - skcipher_request_free(ciph_req); - blk_crypto_put_keyslot(slot); submit_bio(enc_bio); - return; - -out_free_bounce_pages: - while (i > 0) - mempool_free(enc_bio->bi_io_vec[--i].bv_page, - blk_crypto_bounce_page_pool); +out_free_request: skcipher_request_free(ciph_req); out_release_keyslot: blk_crypto_put_keyslot(slot); -out_put_enc_bio: - bio_uninit(enc_bio); - kfree(enc_bio); -out_endio: - bio_endio(src_bio); } /* @@ -533,7 +504,7 @@ static int blk_crypto_fallback_init(void) get_random_bytes(blank_key, sizeof(blank_key)); - err = bioset_init(&crypto_bio_split, 64, 0, 0); + err = bioset_init(&enc_bio_set, 64, 0, BIOSET_NEED_BVECS); if (err) goto out; @@ -603,7 +574,7 @@ fail_destroy_profile: fail_free_profile: kfree(blk_crypto_fallback_profile); fail_free_bioset: - bioset_exit(&crypto_bio_split); + bioset_exit(&enc_bio_set); out: return err; } From 2f655dcb2d925b55deb8c1ec8f42b522c6bc5698 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:46 +0100 Subject: [PATCH 022/162] blk-crypto: use on-stack skcipher requests for fallback en/decryption Allocating a skcipher request dynamically can deadlock or cause unexpected I/O failures when called from writeback context. Avoid the allocation entirely by using on-stack skciphers, similar to what the non-blk-crypto fscrypt path already does. This drops the incomplete support for asynchronous algorithms, which previously could be used, but only synchronously. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 209 ++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 115 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 4ec7da342280..4a682230c278 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -75,7 +75,7 @@ static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; - struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; + struct crypto_sync_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; static struct blk_crypto_profile *blk_crypto_fallback_profile; @@ -98,7 +98,7 @@ static void blk_crypto_fallback_evict_keyslot(unsigned int slot) WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID); /* Clear the key in the skcipher */ - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, + err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, blk_crypto_modes[crypto_mode].keysize); WARN_ON(err); slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; @@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; - err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, + err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes, key->size); if (err) { blk_crypto_fallback_evict_keyslot(slot); @@ -176,28 +176,13 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src, return bio; } -static bool -blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static struct crypto_sync_skcipher * +blk_crypto_fallback_tfm(struct blk_crypto_keyslot *slot) { - struct skcipher_request *ciph_req; - const struct blk_crypto_fallback_keyslot *slotp; - int 
keyslot_idx = blk_crypto_keyslot_index(slot); + const struct blk_crypto_fallback_keyslot *slotp = + &blk_crypto_keyslots[blk_crypto_keyslot_index(slot)]; - slotp = &blk_crypto_keyslots[keyslot_idx]; - ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], - GFP_NOIO); - if (!ciph_req) - return false; - - skcipher_request_set_callback(ciph_req, - CRYPTO_TFM_REQ_MAY_BACKLOG | - CRYPTO_TFM_REQ_MAY_SLEEP, - crypto_req_done, wait); - *ciph_req_ret = ciph_req; - - return true; + return slotp->tfms[slotp->crypto_mode]; } union blk_crypto_iv { @@ -214,46 +199,22 @@ static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], iv->dun[i] = cpu_to_le64(dun[i]); } -/* - * The crypto API fallback's encryption routine. - * - * Allocate one or more bios for encryption, encrypt the input bio using the - * crypto API, and submit the encrypted bios. Sets bio->bi_status and - * completes the source bio on error - */ -static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) +static void __blk_crypto_fallback_encrypt_bio(struct bio *src_bio, + struct crypto_sync_skcipher *tfm) { struct bio_crypt_ctx *bc = src_bio->bi_crypt_context; int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; - struct skcipher_request *ciph_req = NULL; - struct blk_crypto_keyslot *slot; - DECLARE_CRYPTO_WAIT(wait); + SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; struct scatterlist src, dst; union blk_crypto_iv iv; unsigned int nr_enc_pages, enc_idx; struct bio *enc_bio; - blk_status_t status; unsigned int i; - /* - * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for - * this bio's algorithm and key. 
- */ - status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, - bc->bc_key, &slot); - if (status != BLK_STS_OK) { - src_bio->bi_status = status; - bio_endio(src_bio); - return; - } - - /* and then allocate an skcipher_request for it */ - if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { - src_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(src_bio); - goto out_release_keyslot; - } + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); sg_init_table(&src, 1); @@ -297,10 +258,9 @@ new_bio: */ for (i = 0; i < src_bv.bv_len; i += data_unit_size) { blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), - &wait)) { + if (crypto_skcipher_encrypt(ciph_req)) { bio_io_error(enc_bio); - goto out_free_request; + return; } bio_crypt_dun_increment(curr_dun, 1); src.offset += data_unit_size; @@ -327,14 +287,76 @@ new_bio: } submit_bio(enc_bio); -out_free_request: - skcipher_request_free(ciph_req); -out_release_keyslot: +} + +/* + * The crypto API fallback's encryption routine. + * + * Allocate one or more bios for encryption, encrypt the input bio using the + * crypto API, and submit the encrypted bios. 
Sets bio->bi_status and + * completes the source bio on error + */ +static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio) +{ + struct bio_crypt_ctx *bc = src_bio->bi_crypt_context; + struct blk_crypto_keyslot *slot; + blk_status_t status; + + status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + bc->bc_key, &slot); + if (status != BLK_STS_OK) { + src_bio->bi_status = status; + bio_endio(src_bio); + return; + } + __blk_crypto_fallback_encrypt_bio(src_bio, + blk_crypto_fallback_tfm(slot)); blk_crypto_put_keyslot(slot); } +static blk_status_t __blk_crypto_fallback_decrypt_bio(struct bio *bio, + struct bio_crypt_ctx *bc, struct bvec_iter iter, + struct crypto_sync_skcipher *tfm) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + union blk_crypto_iv iv; + struct scatterlist sg; + struct bio_vec bv; + const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + unsigned int i; + + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + NULL, NULL); + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&sg, 1); + skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, + iv.bytes); + + /* Decrypt each segment in the bio */ + __bio_for_each_segment(bv, bio, iter, iter) { + struct page *page = bv.bv_page; + + sg_set_page(&sg, page, data_unit_size, bv.bv_offset); + + /* Decrypt each data unit in the segment */ + for (i = 0; i < bv.bv_len; i += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_skcipher_decrypt(ciph_req)) + return BLK_STS_IOERR; + bio_crypt_dun_increment(curr_dun, 1); + sg.offset += data_unit_size; + } + } + + return BLK_STS_OK; +} + /* * The crypto API fallback's main decryption routine. + * * Decrypts input bio in place, and calls bio_endio on the bio. 
*/ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) @@ -344,63 +366,19 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; struct blk_crypto_keyslot *slot; - struct skcipher_request *ciph_req = NULL; - DECLARE_CRYPTO_WAIT(wait); - u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; - union blk_crypto_iv iv; - struct scatterlist sg; - struct bio_vec bv; - struct bvec_iter iter; - const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; - unsigned int i; - blk_status_t blk_st; + blk_status_t status; - /* - * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for - * this bio's algorithm and key. - */ - blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + status = blk_crypto_get_keyslot(blk_crypto_fallback_profile, bc->bc_key, &slot); - if (blk_st != BLK_STS_OK) { - bio->bi_status = blk_st; - goto out_no_keyslot; + if (status == BLK_STS_OK) { + status = __blk_crypto_fallback_decrypt_bio(bio, bc, + f_ctx->crypt_iter, + blk_crypto_fallback_tfm(slot)); + blk_crypto_put_keyslot(slot); } - - /* and then allocate an skcipher_request for it */ - if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { - bio->bi_status = BLK_STS_RESOURCE; - goto out; - } - - memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); - sg_init_table(&sg, 1); - skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, - iv.bytes); - - /* Decrypt each segment in the bio */ - __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) { - struct page *page = bv.bv_page; - - sg_set_page(&sg, page, data_unit_size, bv.bv_offset); - - /* Decrypt each data unit in the segment */ - for (i = 0; i < bv.bv_len; i += data_unit_size) { - blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req), - &wait)) { - bio->bi_status = BLK_STS_IOERR; - goto out; - } - bio_crypt_dun_increment(curr_dun, 1); - sg.offset += data_unit_size; - } - 
} - -out: - skcipher_request_free(ciph_req); - blk_crypto_put_keyslot(slot); -out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + + bio->bi_status = status; bio_endio(bio); } @@ -608,7 +586,8 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) for (i = 0; i < blk_crypto_num_keyslots; i++) { slotp = &blk_crypto_keyslots[i]; - slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0); + slotp->tfms[mode_num] = crypto_alloc_sync_skcipher(cipher_str, + 0, 0); if (IS_ERR(slotp->tfms[mode_num])) { err = PTR_ERR(slotp->tfms[mode_num]); if (err == -ENOENT) { @@ -620,7 +599,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) goto out_free_tfms; } - crypto_skcipher_set_flags(slotp->tfms[mode_num], + crypto_sync_skcipher_set_flags(slotp->tfms[mode_num], CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); } @@ -634,7 +613,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) out_free_tfms: for (i = 0; i < blk_crypto_num_keyslots; i++) { slotp = &blk_crypto_keyslots[i]; - crypto_free_skcipher(slotp->tfms[mode_num]); + crypto_free_sync_skcipher(slotp->tfms[mode_num]); slotp->tfms[mode_num] = NULL; } out: From 3d939695e68218d420be2b5dbb2fa39ccb7e97ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:47 +0100 Subject: [PATCH 023/162] blk-crypto: use mempool_alloc_bulk for encrypted bio page allocation Calling mempool_alloc in a loop is not safe unless the maximum allocation size times the maximum number of threads using it is less than the minimum pool size. Use the new mempool_alloc_bulk helper to allocate all missing elements in one pass to remove this deadlock risk. This also means that non-pool allocations now use alloc_pages_bulk which can be significantly faster than a loop over individual page allocations. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 76 ++++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 4a682230c278..6be971859542 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -22,7 +22,7 @@ #include "blk-cgroup.h" #include "blk-crypto-internal.h" -static unsigned int num_prealloc_bounce_pg = 32; +static unsigned int num_prealloc_bounce_pg = BIO_MAX_VECS; module_param(num_prealloc_bounce_pg, uint, 0); MODULE_PARM_DESC(num_prealloc_bounce_pg, "Number of preallocated bounce pages for the blk-crypto crypto API fallback"); @@ -144,11 +144,21 @@ static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) { struct bio *src_bio = enc_bio->bi_private; - int i; + struct page **pages = (struct page **)enc_bio->bi_io_vec; + struct bio_vec *bv; + unsigned int i; - for (i = 0; i < enc_bio->bi_vcnt; i++) - mempool_free(enc_bio->bi_io_vec[i].bv_page, - blk_crypto_bounce_page_pool); + /* + * Use the same trick as the alloc side to avoid the need for an extra + * pages array. 
+ */ + bio_for_each_bvec_all(bv, enc_bio, i) + pages[i] = bv->bv_page; + + i = mempool_free_bulk(blk_crypto_bounce_page_pool, (void **)pages, + enc_bio->bi_vcnt); + if (i < enc_bio->bi_vcnt) + release_pages(pages + i, enc_bio->bi_vcnt - i); if (enc_bio->bi_status) cmpxchg(&src_bio->bi_status, 0, enc_bio->bi_status); @@ -157,9 +167,14 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) bio_endio(src_bio); } +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src, - unsigned int nr_segs) + unsigned int nr_segs, struct page ***pages_ret) { + unsigned int memflags = memalloc_noio_save(); + unsigned int nr_allocated; + struct page **pages; struct bio *bio; bio = bio_alloc_bioset(bio_src->bi_bdev, nr_segs, bio_src->bi_opf, @@ -173,6 +188,30 @@ static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src, bio->bi_write_stream = bio_src->bi_write_stream; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio_clone_blkg_association(bio, bio_src); + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + static_assert(PAGE_PTRS_PER_BVEC > 1); + pages = (struct page **)bio->bi_io_vec; + pages += nr_segs * (PAGE_PTRS_PER_BVEC - 1); + + /* + * Try a bulk allocation first. This could leave random pages in the + * array unallocated, but we'll fix that up later in mempool_alloc_bulk. + * + * Note: alloc_pages_bulk needs the array to be zeroed, as it assumes + * any non-zero slot already contains a valid allocation. 
+ */ + memset(pages, 0, sizeof(struct page *) * nr_segs); + nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_segs, pages); + if (nr_allocated < nr_segs) + mempool_alloc_bulk(blk_crypto_bounce_page_pool, (void **)pages, + nr_segs, nr_allocated); + memalloc_noio_restore(memflags); + *pages_ret = pages; return bio; } @@ -209,6 +248,7 @@ static void __blk_crypto_fallback_encrypt_bio(struct bio *src_bio, struct scatterlist src, dst; union blk_crypto_iv iv; unsigned int nr_enc_pages, enc_idx; + struct page **enc_pages; struct bio *enc_bio; unsigned int i; @@ -231,15 +271,13 @@ static void __blk_crypto_fallback_encrypt_bio(struct bio *src_bio, */ new_bio: nr_enc_pages = min(bio_segments(src_bio), BIO_MAX_VECS); - enc_bio = blk_crypto_alloc_enc_bio(src_bio, nr_enc_pages); + enc_bio = blk_crypto_alloc_enc_bio(src_bio, nr_enc_pages, &enc_pages); enc_idx = 0; for (;;) { struct bio_vec src_bv = bio_iter_iovec(src_bio, src_bio->bi_iter); - struct page *enc_page; + struct page *enc_page = enc_pages[enc_idx]; - enc_page = mempool_alloc(blk_crypto_bounce_page_pool, - GFP_NOIO); __bio_add_page(enc_bio, enc_page, src_bv.bv_len, src_bv.bv_offset); @@ -258,10 +296,8 @@ new_bio: */ for (i = 0; i < src_bv.bv_len; i += data_unit_size) { blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_skcipher_encrypt(ciph_req)) { - bio_io_error(enc_bio); - return; - } + if (crypto_skcipher_encrypt(ciph_req)) + goto out_free_enc_bio; bio_crypt_dun_increment(curr_dun, 1); src.offset += data_unit_size; dst.offset += data_unit_size; @@ -287,6 +323,18 @@ new_bio: } submit_bio(enc_bio); + return; + +out_free_enc_bio: + /* + * Add the remaining pages to the bio so that the normal completion path + * in blk_crypto_fallback_encrypt_endio frees them. The exact data + * layout does not matter for that, so don't bother iterating the source + * bio. 
+ */ + for (; enc_idx < nr_enc_pages; enc_idx++) + __bio_add_page(enc_bio, enc_pages[enc_idx], PAGE_SIZE, 0); + bio_io_error(enc_bio); } /* From 66e5a11d2ed6d58006d5cd8276de28751daaa230 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:48 +0100 Subject: [PATCH 024/162] blk-crypto: optimize data unit alignment checking Avoid the relatively high overhead of constructing and walking per-page segment bio_vecs for data unit alignment checking by merging the checks into existing loops. For hardware support crypto, perform the check in bio_split_io_at, which already contains a similar alignment check applied for all I/O. This means bio-based drivers that do not call bio_split_to_limits, should they ever grow blk-crypto support, need to implement the check themselves, just like all other queue limits checks. For blk-crypto-fallback do it in the encryption/decryption loops. This means alignment errors for decryption will only be detected after I/O has completed, but that seems like a worthwhile trade off. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- block/blk-crypto-fallback.c | 15 +++++++++++++-- block/blk-crypto.c | 22 ---------------------- block/blk-merge.c | 9 ++++++++- 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 6be971859542..a331b061dbf4 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -278,6 +278,12 @@ new_bio: bio_iter_iovec(src_bio, src_bio->bi_iter); struct page *enc_page = enc_pages[enc_idx]; + if (!IS_ALIGNED(src_bv.bv_len | src_bv.bv_offset, + data_unit_size)) { + enc_bio->bi_status = BLK_STS_INVAL; + goto out_free_enc_bio; + } + __bio_add_page(enc_bio, enc_page, src_bv.bv_len, src_bv.bv_offset); @@ -296,8 +302,10 @@ new_bio: */ for (i = 0; i < src_bv.bv_len; i += data_unit_size) { blk_crypto_dun_to_iv(curr_dun, &iv); - if (crypto_skcipher_encrypt(ciph_req)) + if (crypto_skcipher_encrypt(ciph_req)) { + enc_bio->bi_status = BLK_STS_IOERR; goto out_free_enc_bio; + } bio_crypt_dun_increment(curr_dun, 1); src.offset += data_unit_size; dst.offset += data_unit_size; @@ -334,7 +342,7 @@ out_free_enc_bio: */ for (; enc_idx < nr_enc_pages; enc_idx++) __bio_add_page(enc_bio, enc_pages[enc_idx], PAGE_SIZE, 0); - bio_io_error(enc_bio); + bio_endio(enc_bio); } /* @@ -387,6 +395,9 @@ static blk_status_t __blk_crypto_fallback_decrypt_bio(struct bio *bio, __bio_for_each_segment(bv, bio, iter, iter) { struct page *page = bv.bv_page; + if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) + return BLK_STS_INVAL; + sg_set_page(&sg, page, data_unit_size, bv.bv_offset); /* Decrypt each data unit in the segment */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 69e869d1c9bd..0b2535d8dbcc 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -219,22 +219,6 @@ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, 
bc2->bc_dun); } -/* Check that all I/O segments are data unit aligned. */ -static bool bio_crypt_check_alignment(struct bio *bio) -{ - const unsigned int data_unit_size = - bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; - struct bvec_iter iter; - struct bio_vec bv; - - bio_for_each_segment(bv, bio, iter) { - if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) - return false; - } - - return true; -} - blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { return blk_crypto_get_keyslot(rq->q->crypto_profile, @@ -287,12 +271,6 @@ bool __blk_crypto_bio_prep(struct bio *bio) return false; } - if (!bio_crypt_check_alignment(bio)) { - bio->bi_status = BLK_STS_INVAL; - bio_endio(bio); - return false; - } - /* * If the device does not natively support the encryption context, try to use * the fallback if available. diff --git a/block/blk-merge.c b/block/blk-merge.c index d3115d7469df..b82c6d304658 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -324,12 +324,19 @@ static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv, int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align_mask) { + struct bio_crypt_ctx *bc = bio_crypt_ctx(bio); struct bio_vec bv, bvprv, *bvprvp = NULL; unsigned nsegs = 0, bytes = 0, gaps = 0; struct bvec_iter iter; + unsigned start_align_mask = lim->dma_alignment; + + if (bc) { + start_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); + len_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1); + } bio_for_each_bvec(bv, bio, iter) { - if (bv.bv_offset & lim->dma_alignment || + if (bv.bv_offset & start_align_mask || bv.bv_len & len_align_mask) return -EINVAL; From bb8e2019ad613dd023a59bf91d1768018d17e09b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jan 2026 07:07:49 +0100 Subject: [PATCH 025/162] blk-crypto: handle the fallback above the block layer Add a blk_crypto_submit_bio helper that either submits the bio when it 
is not encrypted or inline encryption is provided, but otherwise handles the encryption before going down into the low-level driver. This reduces the risk from bio reordering and keeps memory allocation as high up in the stack as possible. Note that if the submitter knows that inline encryption is known to be supported by the underlying driver, it can still use plain submit_bio. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 6 ++++++ block/blk-core.c | 10 +++++++--- block/blk-crypto-internal.h | 19 +++++++++++-------- block/blk-crypto.c | 23 ++++++----------------- fs/buffer.c | 3 ++- fs/crypto/bio.c | 2 +- fs/ext4/page-io.c | 3 ++- fs/ext4/readpage.c | 9 +++++---- fs/f2fs/data.c | 4 ++-- fs/f2fs/file.c | 3 ++- fs/iomap/direct-io.c | 3 ++- include/linux/blk-crypto.h | 22 ++++++++++++++++++++++ 12 files changed, 68 insertions(+), 39 deletions(-) diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 6380e6ab492b..7e0703a12dfb 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -206,6 +206,12 @@ it to a bio, given the blk_crypto_key and the data unit number that will be used for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. +To submit a bio that uses inline encryption, users must call +``blk_crypto_submit_bio()`` instead of the usual ``submit_bio()``. This will +submit the bio to the underlying driver if it supports inline crypto, or else +call the blk-crypto fallback routines before submitting normal bios to the +underlying drivers. + Finally, when done using inline encryption with a blk_crypto_key on a block_device, users must call ``blk_crypto_evict_key()``. 
This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from diff --git a/block/blk-core.c b/block/blk-core.c index f87e5f1a101f..a0bf5174e9e9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -628,9 +628,6 @@ static void __submit_bio(struct bio *bio) /* If plug is not used, add new plug here to cache nsecs time. */ struct blk_plug plug; - if (unlikely(!blk_crypto_bio_prep(bio))) - return; - blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { @@ -794,6 +791,13 @@ void submit_bio_noacct(struct bio *bio) if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; + if (bio_has_crypt_ctx(bio)) { + if (WARN_ON_ONCE(!bio_has_data(bio))) + goto end_io; + if (!blk_crypto_supported(bio)) + goto not_supported; + } + if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index d65023120341..742694213529 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -86,6 +86,12 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, void __user *argp); +static inline bool blk_crypto_supported(struct bio *bio) +{ + return blk_crypto_config_supported_natively(bio->bi_bdev, + &bio->bi_crypt_context->bc_key->crypto_cfg); +} + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) @@ -139,6 +145,11 @@ static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd, return -ENOTTY; } +static inline bool blk_crypto_supported(struct bio *bio) +{ + return false; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -165,14 +176,6 @@ static inline void bio_crypt_do_front_merge(struct request *rq, #endif } -bool __blk_crypto_bio_prep(struct bio *bio); -static inline bool blk_crypto_bio_prep(struct bio *bio) -{ - 
if (bio_has_crypt_ctx(bio)) - return __blk_crypto_bio_prep(bio); - return true; -} - blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 0b2535d8dbcc..856d3c5b1fa0 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -242,25 +242,13 @@ void __blk_crypto_free_request(struct request *rq) rq->crypt_ctx = NULL; } -/** - * __blk_crypto_bio_prep - Prepare bio for inline encryption - * @bio: bio to prepare +/* + * Process a bio with a crypto context. Returns true if the caller should + * submit the passed in bio, false if the bio is consumed. * - * If the bio crypt context provided for the bio is supported by the underlying - * device's inline encryption hardware, do nothing. - * - * Otherwise, try to perform en/decryption for this bio by falling back to the - * kernel crypto API. For encryption this means submitting newly allocated - * bios for the encrypted payload while keeping back the source bio until they - * complete, while for reads the decryption happens in-place by a hooked in - * completion handler. - * - * Caller must ensure bio has bio_crypt_ctx. - * - * Return: true if @bio should be submitted to the driver by the caller, else - * false. Sets bio->bi_status, calls bio_endio and returns false on error. + * See the kerneldoc comment for blk_crypto_submit_bio for further details. 
*/ -bool __blk_crypto_bio_prep(struct bio *bio) +bool __blk_crypto_submit_bio(struct bio *bio) { const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; struct block_device *bdev = bio->bi_bdev; @@ -288,6 +276,7 @@ bool __blk_crypto_bio_prep(struct bio *bio) return true; } +EXPORT_SYMBOL_GPL(__blk_crypto_submit_bio); int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) diff --git a/fs/buffer.c b/fs/buffer.c index 838c0c571022..da18053f66e8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -2821,7 +2822,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } - submit_bio(bio); + blk_crypto_submit_bio(bio); } void submit_bh(blk_opf_t opf, struct buffer_head *bh) diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index c2b3ca100f8d..6da683ea69dc 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -105,7 +105,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, } atomic_inc(&done.pending); - submit_bio(bio); + blk_crypto_submit_bio(bio); } fscrypt_zeroout_range_done(&done); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 39abfeec5f36..a8c95eee91b7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -7,6 +7,7 @@ * Written by Theodore Ts'o, 2010. 
*/ +#include #include #include #include @@ -401,7 +402,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { if (io->io_wbc->sync_mode == WB_SYNC_ALL) io->io_bio->bi_opf |= REQ_SYNC; - submit_bio(io->io_bio); + blk_crypto_submit_bio(io->io_bio); } io->io_bio = NULL; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index e7f2350c725b..49a6d36a8dba 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -345,7 +346,7 @@ int ext4_mpage_readpages(struct inode *inode, if (bio && (last_block_in_bio != first_block - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (bio == NULL) { @@ -371,14 +372,14 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_folio)) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } else last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { - submit_bio(bio); + blk_crypto_submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) @@ -389,7 +390,7 @@ next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) - submit_bio(bio); + blk_crypto_submit_bio(bio); return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c30e69392a62..c3dd8a5c8589 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -513,7 +513,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, trace_f2fs_submit_read_bio(sbi->sb, type, bio); iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, @@ -522,7 +522,7 @@ static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio, WARN_ON_ONCE(is_read_io(bio_op(bio))); trace_f2fs_submit_write_bio(sbi->sb, type, bio); 
iostat_update_submit_ctx(bio, type); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7047ca6b98d..914790f37915 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ +#include #include #include #include @@ -5046,7 +5047,7 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); - submit_bio(bio); + blk_crypto_submit_bio(bio); } static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 8e273408453a..4000c8596d9b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -74,7 +75,7 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, dio->dops->submit_io(iter, bio, pos); } else { WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); - submit_bio(bio); + blk_crypto_submit_bio(bio); } } diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index eb80df19be68..f7c3cb4a342f 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -181,6 +181,28 @@ static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio) #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ +bool __blk_crypto_submit_bio(struct bio *bio); + +/** + * blk_crypto_submit_bio - Submit a bio that may have a crypto context + * @bio: bio to submit + * + * If @bio has no crypto context, or the crypt context attached to @bio is + * supported by the underlying device's inline encryption hardware, just submit + * @bio. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. 
For encryption this means submitting newly allocated + * bios for the encrypted payload while keeping back the source bio until they + * complete, while for reads the decryption happens in-place by a hooked in + * completion handler. + */ +static inline void blk_crypto_submit_bio(struct bio *bio) +{ + if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio)) + submit_bio(bio); +} + int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context From 835042fb1971b1cc6acb46d53b8862643fd7d0a8 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:29 -0700 Subject: [PATCH 026/162] blk-integrity: take const pointer in blk_integrity_rq() blk_integrity_rq() doesn't modify the struct request passed in, so allow a const pointer to be passed. Use a matching signature for the !CONFIG_BLK_DEV_INTEGRITY version. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index a6b84206eb94..c15b1ac62765 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -91,7 +91,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, return bio_integrity_intervals(bi, sectors) * bi->metadata_size; } -static inline bool blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } @@ -168,9 +168,9 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, { return 0; } -static inline int blk_integrity_rq(struct request *rq) +static inline bool blk_integrity_rq(const struct request *rq) { - return 0; + return false; } static inline struct bio_vec rq_integrity_vec(struct request *rq) From e859e7c26a5c4689083f161a52d039b9b454e403 Mon Sep 17 
00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:30 -0700 Subject: [PATCH 027/162] ublk: move ublk flag check functions earlier ublk_dev_support_user_copy() will be used in ublk_validate_params(). Move these functions next to ublk_{dev,queue}_is_zoned() to avoid needing to forward-declare them. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f6e5a0766721..53df4bfa2c92 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -261,6 +261,36 @@ ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) return &ubq->io_cmd_buf[tag]; } +static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; +} + +static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; +} + +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + +static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_USER_COPY; +} + static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) { return ub->dev_info.flags & UBLK_F_ZONED; @@ -659,36 +689,6 @@ static void ublk_apply_params(struct ublk_device *ub) ublk_dev_param_zoned_apply(ub); } -static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_dev_support_zero_copy(const struct 
ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; -} - -static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG; -} - -static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) -{ - return ubq->flags & UBLK_F_USER_COPY; -} - -static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub) -{ - return ub->dev_info.flags & UBLK_F_USER_COPY; -} - static inline bool ublk_need_map_io(const struct ublk_queue *ubq) { return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) && From 98bf2256855eb682433a33e6a7c4bce35191ca99 Mon Sep 17 00:00:00 2001 From: Stanley Zhang Date: Thu, 8 Jan 2026 02:19:31 -0700 Subject: [PATCH 028/162] ublk: support UBLK_PARAM_TYPE_INTEGRITY in device creation Add a feature flag UBLK_F_INTEGRITY for a ublk server to request integrity/metadata support when creating a ublk device. The ublk server can also check for the feature flag on the created device or the result of UBLK_U_CMD_GET_FEATURES to tell if the ublk driver supports it. UBLK_F_INTEGRITY requires UBLK_F_USER_COPY, as user copy is the only data copy mode initially supported for integrity data. Add UBLK_PARAM_TYPE_INTEGRITY and struct ublk_param_integrity to struct ublk_params to specify the integrity params of a ublk device. UBLK_PARAM_TYPE_INTEGRITY requires UBLK_F_INTEGRITY and a nonzero metadata_size. The LBMD_PI_CAP_* and LBMD_PI_CSUM_* values from the linux/fs.h UAPI header are used for the flags and csum_type fields. If the UBLK_PARAM_TYPE_INTEGRITY flag is set, validate the integrity parameters and apply them to the blk_integrity limits. The struct ublk_param_integrity validations are based on the checks in blk_validate_integrity_limits(). 
Any invalid parameters should be rejected before being applied to struct blk_integrity. [csander: drop redundant pi_tuple_size field, use block metadata UAPI constants, add param validation] Signed-off-by: Stanley Zhang Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 101 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 19 +++++++ 2 files changed, 119 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 53df4bfa2c92..a4d62e8e4f6b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,8 @@ #include #include #include +#include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -83,7 +85,8 @@ #define UBLK_PARAM_TYPE_ALL \ (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \ UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED | \ - UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT) + UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ + UBLK_PARAM_TYPE_INTEGRITY) struct ublk_uring_cmd_pdu { /* @@ -301,6 +304,11 @@ static inline bool ublk_queue_is_zoned(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_ZONED; } +static inline bool ublk_dev_support_integrity(const struct ublk_device *ub) +{ + return ub->dev_info.flags & UBLK_F_INTEGRITY; +} + #ifdef CONFIG_BLK_DEV_ZONED struct ublk_zoned_report_desc { @@ -616,6 +624,53 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) set_capacity(ub->ub_disk, p->dev_sectors); } +static int ublk_integrity_flags(u32 flags) +{ + int ret_flags = 0; + + if (flags & LBMD_PI_CAP_INTEGRITY) { + flags &= ~LBMD_PI_CAP_INTEGRITY; + ret_flags |= BLK_INTEGRITY_DEVICE_CAPABLE; + } + if (flags & LBMD_PI_CAP_REFTAG) { + flags &= ~LBMD_PI_CAP_REFTAG; + ret_flags |= BLK_INTEGRITY_REF_TAG; + } + return flags ? 
-EINVAL : ret_flags; +} + +static int ublk_integrity_pi_tuple_size(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return 0; + case LBMD_PI_CSUM_IP: + case LBMD_PI_CSUM_CRC16_T10DIF: + return 8; + case LBMD_PI_CSUM_CRC64_NVME: + return 16; + default: + return -EINVAL; + } +} + +static enum blk_integrity_checksum ublk_integrity_csum_type(u8 csum_type) +{ + switch (csum_type) { + case LBMD_PI_CSUM_NONE: + return BLK_INTEGRITY_CSUM_NONE; + case LBMD_PI_CSUM_IP: + return BLK_INTEGRITY_CSUM_IP; + case LBMD_PI_CSUM_CRC16_T10DIF: + return BLK_INTEGRITY_CSUM_CRC; + case LBMD_PI_CSUM_CRC64_NVME: + return BLK_INTEGRITY_CSUM_CRC64; + default: + WARN_ON_ONCE(1); + return BLK_INTEGRITY_CSUM_NONE; + } +} + static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -678,6 +733,29 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + int flags = ublk_integrity_flags(p->flags); + + if (!ublk_dev_support_integrity(ub)) + return -EINVAL; + if (flags < 0) + return flags; + if (pi_tuple_size < 0) + return pi_tuple_size; + if (!p->metadata_size) + return -EINVAL; + if (p->csum_type == LBMD_PI_CSUM_NONE && + p->flags & LBMD_PI_CAP_REFTAG) + return -EINVAL; + if (p->pi_offset + pi_tuple_size > p->metadata_size) + return -EINVAL; + if (p->interval_exp < SECTOR_SHIFT || + p->interval_exp > ub->params.basic.logical_bs_shift) + return -EINVAL; + } + return 0; } @@ -2950,6 +3028,23 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, lim.max_segments = ub->params.seg.max_segments; } + if (ub->params.types & UBLK_PARAM_TYPE_INTEGRITY) { + const struct ublk_param_integrity *p = &ub->params.integrity; + int pi_tuple_size = ublk_integrity_pi_tuple_size(p->csum_type); + + lim.max_integrity_segments = + 
p->max_integrity_segments ?: USHRT_MAX; + lim.integrity = (struct blk_integrity) { + .flags = ublk_integrity_flags(p->flags), + .csum_type = ublk_integrity_csum_type(p->csum_type), + .metadata_size = p->metadata_size, + .pi_offset = p->pi_offset, + .interval_exp = p->interval_exp, + .tag_size = p->tag_size, + .pi_tuple_size = pi_tuple_size, + }; + } + if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -3140,6 +3235,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) return -EINVAL; } + /* User copy is required to access integrity buffer */ + if (info.flags & UBLK_F_INTEGRITY && !(info.flags & UBLK_F_USER_COPY)) + return -EINVAL; + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index ec77dabba45b..4c141d7e4710 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -311,6 +311,12 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * ublk device supports requests with integrity/metadata buffer. + * Requires UBLK_F_USER_COPY. 
+ */ +#define UBLK_F_INTEGRITY (1ULL << 16) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -600,6 +606,17 @@ struct ublk_param_segment { __u8 pad[2]; }; +struct ublk_param_integrity { + __u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */ + __u16 max_integrity_segments; /* 0 means no limit */ + __u8 interval_exp; + __u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */ + __u8 pi_offset; + __u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */ + __u8 tag_size; + __u8 pad[5]; +}; + struct ublk_params { /* * Total length of parameters, userspace has to set 'len' for both @@ -614,6 +631,7 @@ struct ublk_params { #define UBLK_PARAM_TYPE_ZONED (1 << 3) #define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4) #define UBLK_PARAM_TYPE_SEGMENT (1 << 5) +#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */ __u32 types; /* types of parameter included */ struct ublk_param_basic basic; @@ -622,6 +640,7 @@ struct ublk_params { struct ublk_param_zoned zoned; struct ublk_param_dma_align dma; struct ublk_param_segment seg; + struct ublk_param_integrity integrity; }; #endif From f82f0a16a8270b17211254beeb123d11a0f279cd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:32 -0700 Subject: [PATCH 029/162] ublk: set UBLK_IO_F_INTEGRITY in ublksrv_io_desc Indicate to the ublk server when an incoming request has integrity data by setting UBLK_IO_F_INTEGRITY in the ublksrv_io_desc's op_flags field. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ include/uapi/linux/ublk_cmd.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a4d62e8e4f6b..fc7de2985a20 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1113,6 +1113,9 @@ static inline unsigned int ublk_req_build_flags(struct request *req) if (req->cmd_flags & REQ_SWAP) flags |= UBLK_IO_F_SWAP; + if (blk_integrity_rq(req)) + flags |= UBLK_IO_F_INTEGRITY; + return flags; } diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 4c141d7e4710..dfde4aee39eb 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -414,6 +414,8 @@ struct ublksrv_ctrl_dev_info { * passed in. */ #define UBLK_IO_F_NEED_REG_BUF (1U << 17) +/* Request has an integrity data buffer */ +#define UBLK_IO_F_INTEGRITY (1UL << 18) /* * io cmd is described by this structure, and stored in share memory, indexed From fc652d415cd8b45e9a534d1c019da175cca4c95a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:33 -0700 Subject: [PATCH 030/162] ublk: split out ublk_copy_user_bvec() helper Factor a helper function ublk_copy_user_bvec() out of ublk_copy_user_pages(). It will be used for copying integrity data too. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 52 +++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index fc7de2985a20..8f0d005a64e2 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -993,6 +993,35 @@ static const struct block_device_operations ub_fops = { .report_zones = ublk_report_zones, }; +static bool ublk_copy_user_bvec(const struct bio_vec *bv, unsigned *offset, + struct iov_iter *uiter, int dir, size_t *done) +{ + unsigned len; + void *bv_buf; + size_t copied; + + if (*offset >= bv->bv_len) { + *offset -= bv->bv_len; + return true; + } + + len = bv->bv_len - *offset; + bv_buf = kmap_local_page(bv->bv_page) + bv->bv_offset + *offset; + if (dir == ITER_DEST) + copied = copy_to_iter(bv_buf, len, uiter); + else + copied = copy_from_iter(bv_buf, len, uiter); + + kunmap_local(bv_buf); + + *done += copied; + if (copied < len) + return false; + + *offset = 0; + return true; +} + /* * Copy data between request pages and io_iter, and 'offset' * is the start point of linear offset of request. 
@@ -1005,29 +1034,8 @@ static size_t ublk_copy_user_pages(const struct request *req, size_t done = 0; rq_for_each_segment(bv, req, iter) { - unsigned len; - void *bv_buf; - size_t copied; - - if (offset >= bv.bv_len) { - offset -= bv.bv_len; - continue; - } - - len = bv.bv_len - offset; - bv_buf = kmap_local_page(bv.bv_page) + bv.bv_offset + offset; - if (dir == ITER_DEST) - copied = copy_to_iter(bv_buf, len, uiter); - else - copied = copy_from_iter(bv_buf, len, uiter); - - kunmap_local(bv_buf); - - done += copied; - if (copied < len) + if (!ublk_copy_user_bvec(&bv, &offset, uiter, dir, &done)) break; - - offset = 0; } return done; } From 5bfbbc9938f5dee7f252ef05f47b9a26f05f281a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:34 -0700 Subject: [PATCH 031/162] ublk: split out ublk_user_copy() helper ublk_ch_read_iter() and ublk_ch_write_iter() are nearly identical except for the iter direction. Split out a helper function ublk_user_copy() to reduce the code duplication as these functions are about to get larger. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8f0d005a64e2..06d69251df85 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2709,38 +2709,32 @@ fail: return ERR_PTR(-EACCES); } -static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t +ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) { struct request *req; struct ublk_io *io; size_t buf_off; size_t ret; - req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io); + req = ublk_check_and_get_req(iocb, iter, &buf_off, dir, &io); if (IS_ERR(req)) return PTR_ERR(req); - ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); + ret = ublk_copy_user_pages(req, buf_off, iter, dir); ublk_put_req_ref(io, req); return ret; } +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + return ublk_user_copy(iocb, to, ITER_DEST); +} + static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct request *req; - struct ublk_io *io; - size_t buf_off; - size_t ret; - - req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io); - if (IS_ERR(req)) - return PTR_ERR(req); - - ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); - ublk_put_req_ref(io, req); - - return ret; + return ublk_user_copy(iocb, from, ITER_SOURCE); } static const struct file_operations ublk_ch_fops = { From ca80afd8708fa22f6d3a1e0306ae12a64e5291b5 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:35 -0700 Subject: [PATCH 032/162] ublk: inline ublk_check_and_get_req() into ublk_user_copy() ublk_check_and_get_req() has a single callsite in ublk_user_copy(). 
It takes a ton of arguments in order to pass local variables from ublk_user_copy() to ublk_check_and_get_req() and vice versa. And more are about to be added. Combine the functions to reduce the argument passing noise. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 51 ++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 06d69251df85..57a1af3d5261 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2664,66 +2664,51 @@ static inline bool ublk_check_ubuf_dir(const struct request *req, return false; } -static struct request *ublk_check_and_get_req(struct kiocb *iocb, - struct iov_iter *iter, size_t *off, int dir, - struct ublk_io **io) +static ssize_t +ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) { struct ublk_device *ub = iocb->ki_filp->private_data; struct ublk_queue *ubq; struct request *req; + struct ublk_io *io; size_t buf_off; u16 tag, q_id; + ssize_t ret; if (!user_backed_iter(iter)) - return ERR_PTR(-EACCES); + return -EACCES; if (ub->dev_info.state == UBLK_S_DEV_DEAD) - return ERR_PTR(-EACCES); + return -EACCES; tag = ublk_pos_to_tag(iocb->ki_pos); q_id = ublk_pos_to_hwq(iocb->ki_pos); buf_off = ublk_pos_to_buf_off(iocb->ki_pos); if (q_id >= ub->dev_info.nr_hw_queues) - return ERR_PTR(-EINVAL); + return -EINVAL; ubq = ublk_get_queue(ub, q_id); if (!ublk_dev_support_user_copy(ub)) - return ERR_PTR(-EACCES); + return -EACCES; if (tag >= ub->dev_info.queue_depth) - return ERR_PTR(-EINVAL); + return -EINVAL; - *io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off); + io = &ubq->ios[tag]; + req = __ublk_check_and_get_req(ub, q_id, tag, io, buf_off); if (!req) - return ERR_PTR(-EINVAL); + return -EINVAL; - if (!ublk_check_ubuf_dir(req, dir)) - goto fail; - - *off = buf_off; - return req; -fail: - ublk_put_req_ref(*io, 
req); - return ERR_PTR(-EACCES); -} - -static ssize_t -ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) -{ - struct request *req; - struct ublk_io *io; - size_t buf_off; - size_t ret; - - req = ublk_check_and_get_req(iocb, iter, &buf_off, dir, &io); - if (IS_ERR(req)) - return PTR_ERR(req); + if (!ublk_check_ubuf_dir(req, dir)) { + ret = -EACCES; + goto out; + } ret = ublk_copy_user_pages(req, buf_off, iter, dir); - ublk_put_req_ref(io, req); +out: + ublk_put_req_ref(io, req); return ret; } From fd5a005fa6a261762292a2d89ef8d0174b66f541 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:36 -0700 Subject: [PATCH 033/162] ublk: move offset check out of __ublk_check_and_get_req() __ublk_check_and_get_req() checks that the passed in offset is within the data length of the specified ublk request. However, only user copy (ublk_check_and_get_req()) supports accessing ublk request data at a nonzero offset. Zero-copy buffer registration (ublk_register_io_buf()) always passes 0 for the offset, so the check is unnecessary. Move the check from __ublk_check_and_get_req() to ublk_check_and_get_req(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 57a1af3d5261..d428a25121db 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -255,7 +255,7 @@ static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - u16 q_id, u16 tag, struct ublk_io *io, size_t offset); + u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc * @@ -2297,7 +2297,7 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd, if (!ublk_dev_support_zero_copy(ub)) return -EINVAL; - req = __ublk_check_and_get_req(ub, q_id, tag, io, 0); + req = __ublk_check_and_get_req(ub, q_id, tag, io); if (!req) return -EINVAL; @@ -2591,7 +2591,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, } static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, - u16 q_id, u16 tag, struct ublk_io *io, size_t offset) + u16 q_id, u16 tag, struct ublk_io *io) { struct request *req; @@ -2612,9 +2612,6 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, if (!ublk_rq_has_data(req)) goto fail_put; - if (offset > blk_rq_bytes(req)) - goto fail_put; - return req; fail_put: ublk_put_req_ref(io, req); @@ -2696,10 +2693,15 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) return -EINVAL; io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, q_id, tag, io, buf_off); + req = __ublk_check_and_get_req(ub, q_id, tag, io); if (!req) return -EINVAL; + if (buf_off > blk_rq_bytes(req)) { + ret = -EINVAL; + goto out; + } + if 
(!ublk_check_ubuf_dir(req, dir)) { ret = -EACCES; goto out; From be82a89066d595da334f6e153ababcedc3f92ad6 Mon Sep 17 00:00:00 2001 From: Stanley Zhang Date: Thu, 8 Jan 2026 02:19:37 -0700 Subject: [PATCH 034/162] ublk: implement integrity user copy Add a function ublk_copy_user_integrity() to copy integrity information between a request and a user iov_iter. This mirrors the existing ublk_copy_user_pages() but operates on request integrity data instead of regular data. Check UBLKSRV_IO_INTEGRITY_FLAG in iocb->ki_pos in ublk_user_copy() to choose between copying data or integrity data. [csander: change offset units from data bytes to integrity data bytes, fix CONFIG_BLK_DEV_INTEGRITY=n build, rebase on user copy refactor] Signed-off-by: Stanley Zhang Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 53 +++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 4 +++ 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d428a25121db..5c441f507c43 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1040,6 +1040,33 @@ static size_t ublk_copy_user_pages(const struct request *req, return done; } +#ifdef CONFIG_BLK_DEV_INTEGRITY +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + size_t done = 0; + struct bio *bio = req->bio; + struct bvec_iter iter; + struct bio_vec iv; + + if (!blk_integrity_rq(req)) + return 0; + + bio_for_each_integrity_vec(iv, bio, iter) { + if (!ublk_copy_user_bvec(&iv, &offset, uiter, dir, &done)) + break; + } + + return done; +} +#else /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ +static size_t ublk_copy_user_integrity(const struct request *req, + unsigned offset, struct iov_iter *uiter, int dir) +{ + return 0; +} +#endif /* #ifdef CONFIG_BLK_DEV_INTEGRITY */ + static inline bool ublk_need_map_req(const struct request *req) { 
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; @@ -2668,6 +2695,8 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) struct ublk_queue *ubq; struct request *req; struct ublk_io *io; + unsigned data_len; + bool is_integrity; size_t buf_off; u16 tag, q_id; ssize_t ret; @@ -2681,6 +2710,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) tag = ublk_pos_to_tag(iocb->ki_pos); q_id = ublk_pos_to_hwq(iocb->ki_pos); buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + is_integrity = !!(iocb->ki_pos & UBLKSRV_IO_INTEGRITY_FLAG); + + if (unlikely(!ublk_dev_support_integrity(ub) && is_integrity)) + return -EINVAL; if (q_id >= ub->dev_info.nr_hw_queues) return -EINVAL; @@ -2697,7 +2730,14 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) if (!req) return -EINVAL; - if (buf_off > blk_rq_bytes(req)) { + if (is_integrity) { + struct blk_integrity *bi = &req->q->limits.integrity; + + data_len = bio_integrity_bytes(bi, blk_rq_sectors(req)); + } else { + data_len = blk_rq_bytes(req); + } + if (buf_off > data_len) { ret = -EINVAL; goto out; } @@ -2707,7 +2747,10 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) goto out; } - ret = ublk_copy_user_pages(req, buf_off, iter, dir); + if (is_integrity) + ret = ublk_copy_user_integrity(req, buf_off, iter, dir); + else + ret = ublk_copy_user_pages(req, buf_off, iter, dir); out: ublk_put_req_ref(io, req); @@ -3948,6 +3991,12 @@ static int __init ublk_init(void) BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + /* + * Ensure UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE + * doesn't overflow into UBLKSRV_IO_INTEGRITY_FLAG + */ + BUILD_BUG_ON(UBLKSRV_IO_BUF_OFFSET + UBLKSRV_IO_BUF_TOTAL_SIZE >= + UBLKSRV_IO_INTEGRITY_FLAG); BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8); init_waitqueue_head(&ublk_idr_wq); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 
dfde4aee39eb..61ac5d8e1078 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -134,6 +134,10 @@ #define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) +/* Copy to/from request integrity buffer instead of data buffer */ +#define UBLK_INTEGRITY_FLAG_OFF 62 +#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF) + /* * ublk server can register data buffers for incoming I/O requests with a sparse * io_uring buffer table. The request buffer can then be used as the data buffer From b2503e936b598b993cb09005194dc77d2fa3f082 Mon Sep 17 00:00:00 2001 From: Stanley Zhang Date: Thu, 8 Jan 2026 02:19:38 -0700 Subject: [PATCH 035/162] ublk: support UBLK_F_INTEGRITY Now that all the components of the ublk integrity feature have been implemented, add UBLK_F_INTEGRITY to UBLK_F_ALL, conditional on block layer integrity support (CONFIG_BLK_DEV_INTEGRITY). This allows ublk servers to create ublk devices with UBLK_F_INTEGRITY set and UBLK_U_CMD_GET_FEATURES to report the feature as supported. Signed-off-by: Stanley Zhang [csander: make feature conditional on CONFIG_BLK_DEV_INTEGRITY] Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5c441f507c43..eaff32c77898 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -75,7 +75,8 @@ | UBLK_F_AUTO_BUF_REG \ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ - | UBLK_F_BUF_REG_OFF_DAEMON) + | UBLK_F_BUF_REG_OFF_DAEMON \ + | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? 
UBLK_F_INTEGRITY : 0)) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ From bfe1255712a3b1c1f7418c5504a1bf53735d3848 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:39 -0700 Subject: [PATCH 036/162] ublk: optimize ublk_user_copy() on daemon task ublk user copy syscalls may be issued from any task, so they take a reference count on the struct ublk_io to check whether it is owned by the ublk server and prevent a concurrent UBLK_IO_COMMIT_AND_FETCH_REQ from completing the request. However, if the user copy syscall is issued on the io's daemon task, a concurrent UBLK_IO_COMMIT_AND_FETCH_REQ isn't possible, so the atomic reference count dance is unnecessary. Check for UBLK_IO_FLAG_OWNED_BY_SRV to ensure the request is dispatched to the server and obtain the request from ublk_io's req field instead of looking it up on the tagset. Skip the reference count increment and decrement. Commit 8a8fe42d765b ("ublk: optimize UBLK_IO_REGISTER_IO_BUF on daemon task") made an analogous optimization for ublk zero copy buffer registration. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index eaff32c77898..ec96d5afad7a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -183,7 +183,7 @@ struct ublk_io { * if user copy or zero copy are enabled: * - UBLK_REFCOUNT_INIT from dispatch to the server * until UBLK_IO_COMMIT_AND_FETCH_REQ - * - 1 for each inflight ublk_ch_{read,write}_iter() call + * - 1 for each inflight ublk_ch_{read,write}_iter() call not on task * - 1 for each io_uring registered buffer not registered on task * The I/O can only be completed once all references are dropped. 
* User copy and buffer registration operations are only permitted @@ -2698,6 +2698,7 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) struct ublk_io *io; unsigned data_len; bool is_integrity; + bool on_daemon; size_t buf_off; u16 tag, q_id; ssize_t ret; @@ -2727,9 +2728,20 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) return -EINVAL; io = &ubq->ios[tag]; - req = __ublk_check_and_get_req(ub, q_id, tag, io); - if (!req) - return -EINVAL; + on_daemon = current == READ_ONCE(io->task); + if (on_daemon) { + /* On daemon, io can't be completed concurrently, so skip ref */ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EINVAL; + + req = io->req; + if (!ublk_rq_has_data(req)) + return -EINVAL; + } else { + req = __ublk_check_and_get_req(ub, q_id, tag, io); + if (!req) + return -EINVAL; + } if (is_integrity) { struct blk_integrity *bi = &req->q->limits.integrity; @@ -2754,7 +2766,8 @@ ublk_user_copy(struct kiocb *iocb, struct iov_iter *iter, int dir) ret = ublk_copy_user_pages(req, buf_off, iter, dir); out: - ublk_put_req_ref(io, req); + if (!on_daemon) + ublk_put_req_ref(io, req); return ret; } From c1d7c0f9cdf6690eff4518f1c17a37d5ee647cd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:40 -0700 Subject: [PATCH 037/162] selftests: ublk: display UBLK_F_INTEGRITY support Add support for printing the UBLK_F_INTEGRITY feature flag in the human-readable kublk features output. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 185ba553686a..261095f19c93 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1454,6 +1454,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_QUIESCE), FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), + FEAT_NAME(UBLK_F_INTEGRITY), }; struct ublk_dev *dev; __u64 features = 0; From 261b67f4e34716e793b0b95d2722b2fe780ed5f4 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:41 -0700 Subject: [PATCH 038/162] selftests: ublk: add utility to get block device metadata size Some block device integrity parameters are available in sysfs, but others are only accessible using the FS_IOC_GETLBMD_CAP ioctl. Add a metadata_size utility program to print out the logical block metadata size, PI offset, and PI size within the metadata. 
Example output: $ metadata_size /dev/ublkb0 metadata_size: 64 pi_offset: 56 pi_tuple_size: 8 Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 5 +-- tools/testing/selftests/ublk/metadata_size.c | 36 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ublk/metadata_size.c diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 06ba6fde098d..351ac6438561 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -49,12 +49,13 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh -TEST_GEN_PROGS_EXTENDED = kublk +TEST_GEN_PROGS_EXTENDED = kublk metadata_size +STANDALONE_UTILS := metadata_size.c LOCAL_HDRS += $(wildcard *.h) include ../lib.mk -$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c) +$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/metadata_size.c b/tools/testing/selftests/ublk/metadata_size.c new file mode 100644 index 000000000000..76ecddf04d25 --- /dev/null +++ b/tools/testing/selftests/ublk/metadata_size.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + struct logical_block_metadata_cap cap = {}; + const char *filename; + int fd; + int result; + + if (argc != 2) { + fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]); + return 1; + } + + filename = argv[1]; + fd = open(filename, O_RDONLY); + if (fd < 0) { + perror(filename); + return 1; + } + + result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap); + if (result < 0) { + perror("ioctl"); + return 1; + } + + printf("metadata_size: %u\n", cap.lbmd_size); + printf("pi_offset: %u\n", cap.lbmd_pi_offset); + printf("pi_tuple_size: %u\n", cap.lbmd_pi_size); + return 0; +} From 
6ed6476c4aefa9ee3ba90f39bcc002dd034f6e03 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:42 -0700 Subject: [PATCH 039/162] selftests: ublk: add kublk support for integrity params Add integrity param command line arguments to kublk. Plumb these to struct ublk_params for the null and fault_inject targets, as they don't need to actually read or write the integrity data. Forbid the integrity params for loop or stripe until the integrity data copy is implemented. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/fault_inject.c | 1 + tools/testing/selftests/ublk/file_backed.c | 4 ++ tools/testing/selftests/ublk/kublk.c | 47 +++++++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 21 +++++++++ tools/testing/selftests/ublk/null.c | 1 + tools/testing/selftests/ublk/stripe.c | 4 ++ 6 files changed, 78 insertions(+) diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index b227bd78b252..3b897f69c014 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, .dev_sectors = dev_size >> 9, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); return 0; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 269d5f124e06..c14ce6608696 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -158,6 +158,10 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } ret = backing_file_tgt_init(dev); if (ret) diff --git 
a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 261095f19c93..48e1865b4875 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -3,6 +3,7 @@ * Description: uring_cmd based ublk */ +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -1550,6 +1551,8 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n"); printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n"); printf("\t[--nthreads threads] [--per_io_tasks]\n"); + printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " + "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1613,6 +1616,12 @@ int main(int argc, char *argv[]) { "nthreads", 1, NULL, 0 }, { "per_io_tasks", 0, NULL, 0 }, { "no_ublk_fixed_fd", 0, NULL, 0 }, + { "integrity_capable", 0, NULL, 0 }, + { "integrity_reftag", 0, NULL, 0 }, + { "metadata_size", 1, NULL, 0 }, + { "pi_offset", 1, NULL, 0 }, + { "csum_type", 1, NULL, 0 }, + { "tag_size", 1, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1623,6 +1632,7 @@ int main(int argc, char *argv[]) .nr_hw_queues = 2, .dev_id = -1, .tgt_type = "unknown", + .csum_type = LBMD_PI_CSUM_NONE, }; int ret = -EINVAL, i; int tgt_argc = 1; @@ -1697,6 +1707,28 @@ int main(int argc, char *argv[]) ctx.per_io_tasks = 1; if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd")) ctx.no_ublk_fixed_fd = 1; + if (!strcmp(longopts[option_idx].name, "integrity_capable")) + ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY; + if (!strcmp(longopts[option_idx].name, "integrity_reftag")) + ctx.integrity_flags |= LBMD_PI_CAP_REFTAG; + if (!strcmp(longopts[option_idx].name, 
"metadata_size")) + ctx.metadata_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "pi_offset")) + ctx.pi_offset = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "csum_type")) { + if (!strcmp(optarg, "ip")) { + ctx.csum_type = LBMD_PI_CSUM_IP; + } else if (!strcmp(optarg, "t10dif")) { + ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF; + } else if (!strcmp(optarg, "nvme")) { + ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME; + } else { + ublk_err("invalid csum_type: %s\n", optarg); + return -EINVAL; + } + } + if (!strcmp(longopts[option_idx].name, "tag_size")) + ctx.tag_size = strtoul(optarg, NULL, 0); break; case '?': /* @@ -1739,6 +1771,21 @@ int main(int argc, char *argv[]) return -EINVAL; } + if (ctx.metadata_size) { + if (!(ctx.flags & UBLK_F_USER_COPY)) { + ublk_err("integrity requires user_copy\n"); + return -EINVAL; + } + + ctx.flags |= UBLK_F_INTEGRITY; + } else if (ctx.integrity_flags || + ctx.pi_offset || + ctx.csum_type != LBMD_PI_CSUM_NONE || + ctx.tag_size) { + ublk_err("integrity parameters require metadata_size\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8a83b90ec603..d00f2b465cdf 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,6 +78,11 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + __u32 integrity_flags; + __u8 metadata_size; + __u8 pi_offset; + __u8 csum_type; + __u8 tag_size; int _evtfd; int _shmid; @@ -202,6 +207,22 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io); +static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, + struct ublk_params *params) +{ + if (!ctx->metadata_size) + return; + + params->types |= UBLK_PARAM_TYPE_INTEGRITY; + params->integrity = 
(struct ublk_param_integrity) { + .flags = ctx->integrity_flags, + .interval_exp = params->basic.logical_bs_shift, + .metadata_size = ctx->metadata_size, + .pi_offset = ctx->pi_offset, + .csum_type = ctx->csum_type, + .tag_size = ctx->tag_size, + }; +} static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 280043f6b689..3aa162f08476 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) .max_segments = 32, }, }; + ublk_set_integrity_params(ctx, &dev->tgt.params); if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index fd412e1f01c0..d4aaf3351d71 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -298,6 +298,10 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } + if (ctx->metadata_size) { + ublk_err("%s: integrity not supported\n", __func__); + return -EINVAL; + } if ((chunk_size & (chunk_size - 1)) || !chunk_size) { ublk_err("invalid chunk size %u\n", chunk_size); From 24f8a44b797f03dfadb455138930523599d3c22a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:43 -0700 Subject: [PATCH 040/162] selftests: ublk: implement integrity user copy in kublk If integrity data is enabled for kublk, allocate an integrity buffer for each I/O. Extend ublk_user_copy() to copy the integrity data between the ublk request and the integrity buffer if the ublksrv_io_desc indicates that the request has integrity data. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 41 ++++++++++++++++++++++++---- tools/testing/selftests/ublk/kublk.h | 14 ++++++++++ 2 files changed, 50 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 48e1865b4875..d95937dd6167 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -416,8 +416,10 @@ static void ublk_queue_deinit(struct ublk_queue *q) if (q->io_cmd_buf) munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); - for (i = 0; i < nr_ios; i++) + for (i = 0; i < nr_ios; i++) { free(q->ios[i].buf_addr); + free(q->ios[i].integrity_buf); + } } static void ublk_thread_deinit(struct ublk_thread *t) @@ -433,12 +435,13 @@ static void ublk_thread_deinit(struct ublk_thread *t) } } -static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) +static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, + __u8 metadata_size) { struct ublk_dev *dev = q->dev; int depth = dev->dev_info.queue_depth; int i; - int cmd_buf_size, io_buf_size; + int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; q->tgt_ops = dev->tgt.ops; @@ -446,6 +449,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) q->q_depth = depth; q->flags = dev->dev_info.flags; q->flags |= extra_flags; + q->metadata_size = metadata_size; /* Cache fd in queue for fast path access */ q->ublk_fd = dev->fds[0]; @@ -461,11 +465,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags) } io_buf_size = dev->dev_info.max_io_buf_bytes; + integrity_size = ublk_integrity_len(q, io_buf_size); for (i = 0; i < q->q_depth; i++) { q->ios[i].buf_addr = NULL; q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE; q->ios[i].tag = i; + if (integrity_size) { + q->ios[i].integrity_buf = malloc(integrity_size); + if 
(!q->ios[i].integrity_buf) { + ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n", + dev->dev_info.dev_id, q->q_id, i, + integrity_size); + goto fail; + } + } + + if (ublk_queue_no_buf(q)) continue; @@ -608,13 +624,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) __u8 ublk_op = ublksrv_get_op(iod); __u32 len = iod->nr_sectors << 9; void *addr = io->buf_addr; + ssize_t copied; if (ublk_op != match_ublk_op) return; while (len) { __u32 copy_len = min(len, UBLK_USER_COPY_LEN); - ssize_t copied; if (ublk_op == UBLK_IO_OP_WRITE) copied = pread(q->ublk_fd, addr, copy_len, off); @@ -627,6 +643,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op) off += copy_len; len -= copy_len; } + + if (!(iod->op_flags & UBLK_IO_F_INTEGRITY)) + return; + + len = ublk_integrity_len(q, iod->nr_sectors << 9); + off = ublk_user_copy_offset(q->q_id, io->tag); + off |= UBLKSRV_IO_INTEGRITY_FLAG; + if (ublk_op == UBLK_IO_OP_WRITE) + copied = pread(q->ublk_fd, io->integrity_buf, len, off); + else if (ublk_op == UBLK_IO_OP_READ) + copied = pwrite(q->ublk_fd, io->integrity_buf, len, off); + else + assert(0); + assert(copied == (ssize_t)len); } int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) @@ -1013,7 +1043,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) dev->q[i].dev = dev; dev->q[i].q_id = i; - ret = ublk_queue_init(&dev->q[i], extra_flags); + ret = ublk_queue_init(&dev->q[i], extra_flags, + ctx->metadata_size); if (ret) { ublk_err("ublk dev %d queue %d init queue failed\n", dinfo->dev_id, i); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index d00f2b465cdf..830b49a7716a 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -112,6 +112,7 @@ struct ublk_ctrl_cmd_data { struct ublk_io { char *buf_addr; + void *integrity_buf; #define UBLKS_IO_NEED_FETCH_RQ (1UL << 0) #define 
UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1) @@ -175,6 +176,7 @@ struct ublk_queue { #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) __u64 flags; int ublk_fd; /* cached ublk char device fd */ + __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; @@ -224,6 +226,18 @@ static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, }; } +static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len) +{ + /* All targets currently use interval_exp = logical_bs_shift = 9 */ + return (len >> 9) * q->metadata_size; +} + +static inline size_t +ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len) +{ + return (integrity_len / q->metadata_size) << 9; +} + static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod) { return !!(iod->op_flags & UBLK_IO_F_NEED_REG_BUF); From a1805442674b85ff9d626965f828e4fd71a82b28 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:44 -0700 Subject: [PATCH 041/162] selftests: ublk: support non-O_DIRECT backing files A subsequent commit will add support for using a backing file to store integrity data. Since integrity data is accessed in intervals of metadata_size, which may be much smaller than a logical block on the backing device, direct I/O cannot be used. Add an argument to backing_file_tgt_init() to specify the number of files to open for direct I/O. The remaining files will use buffered I/O. For now, continue to request direct I/O for all the files. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 4 ++-- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index 01580a6f8519..d9873d4d50d0 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -12,7 +12,7 @@ void backing_file_tgt_deinit(struct ublk_dev *dev) } } -int backing_file_tgt_init(struct ublk_dev *dev) +int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; @@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev) ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); - fd = open(file, O_RDWR | O_DIRECT); + fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0)); if (fd < 0) { ublk_err("%s: backing file %s can't be opened: %s\n", __func__, file, strerror(errno)); diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c14ce6608696..db4c176a4f28 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -163,7 +163,7 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) return -EINVAL; } - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, 1); if (ret) return ret; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 830b49a7716a..96c66b337bc0 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -462,6 +462,6 @@ extern const struct ublk_tgt_ops stripe_tgt_ops; extern const struct ublk_tgt_ops fault_inject_tgt_ops; void backing_file_tgt_deinit(struct ublk_dev *dev); -int backing_file_tgt_init(struct ublk_dev *dev); +int 
backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct); #endif diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index d4aaf3351d71..2be1c36438e7 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -315,7 +315,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) chunk_shift = ilog2(chunk_size); - ret = backing_file_tgt_init(dev); + ret = backing_file_tgt_init(dev, dev->tgt.nr_backing_files); if (ret) return ret; From f48250dc5ba8368ccb587093eb20d1c7baecaacf Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:45 -0700 Subject: [PATCH 042/162] selftests: ublk: add integrity data support to loop target To perform an end-to-end test of integrity information through a ublk device, we need to actually store it somewhere and retrieve it. Add this support to kublk's loop target. It uses a second backing file for the integrity data corresponding to the data stored in the first file. The integrity file is initialized with byte 0xFF, which ensures the app and reference tags are set to the "escape" pattern to disable the bio-integrity-auto guard and reftag checks until the blocks are written. The integrity file is opened without O_DIRECT since it will be accessed at sub-block granularity. Each incoming read/write results in a pair of reads/writes, one to the data file, and one to the integrity file. If either backing I/O fails, the error is propagated to the ublk request. If both backing I/Os read/write some bytes, the ublk request is completed with the smaller of the number of blocks accessed by each I/O. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 92 +++++++++++++++++----- 1 file changed, 74 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index db4c176a4f28..c3ce5ff72422 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -35,9 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, unsigned auto_zc = ublk_queue_use_auto_zc(q); enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc); struct ublk_io *io = ublk_get_io(q, tag); + __u64 offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { + ublk_io_alloc_sqes(t, sqe, 1); + /* Use second backing file for integrity data */ + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2), + io->integrity_buf, + ublk_integrity_len(q, len), + ublk_integrity_len(q, offset)); + sqe[0]->flags = IOSQE_FIXED_FILE; + /* tgt_data = 1 indicates integrity I/O */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1); + } + if (!zc || auto_zc) { ublk_io_alloc_sqes(t, sqe, 1); if (!sqe[0]) @@ -45,14 +59,14 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/, addr, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); if (auto_zc) sqe[0]->buf_index = tag; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - return 1; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1; } ublk_io_alloc_sqes(t, sqe, 3); @@ -63,8 +77,8 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); 
io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, - iod->nr_sectors << 9, - iod->start_sector << 9); + len, + offset); sqe[1]->buf_index = tag; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -72,7 +86,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); - return 2; + return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; } static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag) @@ -119,12 +133,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, unsigned op = user_data_to_op(cqe->user_data); struct ublk_io *io = ublk_get_io(q, tag); - if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { - if (!io->result) - io->result = cqe->res; - if (cqe->res < 0) - ublk_err("%s: io failed op %x user_data %lx\n", - __func__, op, cqe->user_data); + if (cqe->res < 0) { + io->result = cqe->res; + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + __s32 data_len = user_data_to_tgt_data(cqe->user_data) + ? 
ublk_integrity_data_len(q, cqe->res) + : cqe->res; + + if (!io->result || data_len < io->result) + io->result = data_len; } /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ @@ -135,9 +154,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q, ublk_complete_io(t, q, tag, io->result); } +static int ublk_loop_memset_file(int fd, __u8 byte, size_t len) +{ + off_t offset = 0; + __u8 buf[4096]; + + memset(buf, byte, sizeof(buf)); + while (len) { + int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset); + + if (ret < 0) + return -errno; + if (!ret) + return -EIO; + + len -= ret; + offset += ret; + } + return 0; +} + static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { unsigned long long bytes; + unsigned long blocks; int ret; struct ublk_params p = { .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, @@ -154,23 +194,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) }, }; + ublk_set_integrity_params(ctx, &p); if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); return -EINVAL; } - if (ctx->metadata_size) { - ublk_err("%s: integrity not supported\n", __func__); - return -EINVAL; - } + /* Use O_DIRECT only for data file */ ret = backing_file_tgt_init(dev, 1); if (ret) return ret; - if (dev->tgt.nr_backing_files != 1) + /* Expect a second file for integrity data */ + if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size) return -EINVAL; - bytes = dev->tgt.backing_file_size[0]; + blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift; + if (ctx->metadata_size) { + unsigned long metadata_blocks = + dev->tgt.backing_file_size[1] / ctx->metadata_size; + unsigned long integrity_len; + + /* Ensure both data and integrity data fit in backing files */ + blocks = min(blocks, metadata_blocks); + integrity_len = blocks * ctx->metadata_size; + /* + * Initialize PI app tag and ref tag to 0xFF + * to disable bio-integrity-auto checks + */ + 
ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len); + if (ret) + return ret; + } + bytes = blocks << p.basic.logical_bs_shift; dev->tgt.dev_size = bytes; p.basic.dev_sectors = bytes >> 9; dev->tgt.params = p; From 9e9f635525b12f055558a7cfe2e54d109839d030 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:46 -0700 Subject: [PATCH 043/162] selftests: ublk: add integrity params test Add test case null_04 to exercise all the different integrity params. It creates 4 different ublk devices with different combinations of integrity arguments and verifies their integrity limits via sysfs and the metadata_size utility. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++ tools/testing/selftests/ublk/test_null_04.sh | 166 +++++++++++++++++++ 3 files changed, 177 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_null_04.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 351ac6438561..239ad1c741ef 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -27,6 +27,7 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh +TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index ea9a5f3eb70a..7ff6ce79d62c 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -384,6 +384,16 @@ _ublk_test_top_dir() cd "$(dirname "$0")" && pwd } +METADATA_SIZE_PROG="$(_ublk_test_top_dir)/metadata_size" + +_get_metadata_size() +{ + local dev_id=$1 + local field=$2 + + "$METADATA_SIZE_PROG" "/dev/ublkb$dev_id" | grep "$field" | grep -o 
"[0-9]*" +} + UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh new file mode 100755 index 000000000000..0b0719ea33a3 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID=null_04 + +_prep_test "null" "integrity params" + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) +_check_add_dev $TID $? +metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 0 ]; then + echo "pi_tuple_size $pi_tuple_size != 0" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != nop ]; then + echo "format $format != nop" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) +_check_add_dev $TID $? 
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 64 ]; then + echo "metadata_size $metadata_size != 64" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 56 ]; then + echo "pi_offset $pi_offset != 56" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 1 ]; then + echo "device_is_integrity_capable $capable != 1" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE3-IP ]; then + echo "format $format != T10-DIF-TYPE3-IP" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) +_check_add_dev $TID $? 
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 8 ]; then + echo "metadata_size $metadata_size != 8" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 8 ]; then + echo "pi_tuple_size $pi_tuple_size != 8" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != T10-DIF-TYPE1-CRC ]; then + echo "format $format != T10-DIF-TYPE1-CRC" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 0 ]; then + echo "tag_size $tag_size != 0" + _show_result $TID 255 +fi +_cleanup_test + +dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) +_check_add_dev $TID $? 
+metadata_size=$(_get_metadata_size "$dev_id" metadata_size) +if [ "$metadata_size" != 16 ]; then + echo "metadata_size $metadata_size != 16" + _show_result $TID 255 +fi +pi_offset=$(_get_metadata_size "$dev_id" pi_offset) +if [ "$pi_offset" != 0 ]; then + echo "pi_offset $pi_offset != 0" + _show_result $TID 255 +fi +pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) +if [ "$pi_tuple_size" != 16 ]; then + echo "pi_tuple_size $pi_tuple_size != 16" + _show_result $TID 255 +fi +capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") +if [ "$capable" != 0 ]; then + echo "device_is_integrity_capable $capable != 0" + _show_result $TID 255 +fi +format=$(cat "/sys/block/ublkb$dev_id/integrity/format") +if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then + echo "format $format != EXT-DIF-TYPE3-CRC64" + _show_result $TID 255 +fi +protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") +if [ "$protection_interval_bytes" != 512 ]; then + echo "protection_interval_bytes $protection_interval_bytes != 512" + _show_result $TID 255 +fi +tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") +if [ "$tag_size" != 8 ]; then + echo "tag_size $tag_size != 8" + _show_result $TID 255 +fi +_cleanup_test + +_show_result $TID 0 From 78796b6bae8684b753b658f431b5b1ee24300d64 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 8 Jan 2026 02:19:47 -0700 Subject: [PATCH 044/162] selftests: ublk: add end-to-end integrity test Add test case loop_08 to verify the ublk integrity data flow. It uses the kublk loop target to create a ublk device with integrity on top of backing data and integrity files. It then writes to the whole device with fio configured to generate integrity data. Then it reads back the whole device with fio configured to verify the integrity data. It also verifies that injected guard, reftag, and apptag corruptions are correctly detected. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_loop_08.sh | 111 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_loop_08.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 239ad1c741ef..036a9f01b464 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -35,6 +35,7 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh +TEST_PROGS += test_loop_08.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh new file mode 100755 index 000000000000..ca289cfb2ad4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +if ! _have_program fio; then + exit $UBLK_SKIP_CODE +fi + +fio_version=$(fio --version) +if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then + echo "Requires development fio version with https://github.com/axboe/fio/pull/1992" + exit $UBLK_SKIP_CODE +fi + +TID=loop_08 + +_prep_test "loop" "end-to-end integrity" + +_create_backfile 0 256M +_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) +integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" +dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") +_check_add_dev $TID $? 
+ +# 1M * (64 integrity bytes / 512 data bytes) = 128K +fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" +fio --name fill --rw randwrite $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio fill failed" + _show_result $TID $err +fi + +fio --name verify --rw randread $fio_args > /dev/null +err=$? +if [ $err != 0 ]; then + echo "fio verify failed" + _show_result $TID $err +fi + +fio_err=$(mktemp fio_err_XXXXX) + +# Overwrite 4-byte reftag at offset 56 + 4 = 60 +dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" +dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi +# Reset to 0 +dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args +err=$? +if [ $err != 0 ]; then + echo "dd restore corrupted_reftag failed" + rm -f "$fio_err" + _show_result $TID $err +fi + +dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" +dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args +err=$? +if [ $err != 0 ]; then + echo "dd corrupted_data failed" + rm -f "$fio_err" + _show_result $TID $err +fi +if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" +if ! 
grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + rm -f "$fio_err" + _show_result $TID 255 +fi +expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" +if ! grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + rm -f "$fio_err" + _show_result $TID 255 +fi + +rm -f "$fio_err" + +_cleanup_test +_show_result $TID 0 From 9e386f49fa269298490b303c423c6af4645f184e Mon Sep 17 00:00:00 2001 From: Yoav Cohen Date: Tue, 13 Jan 2026 00:05:00 +0200 Subject: [PATCH 045/162] ublk: make ublk_ctrl_stop_dev return void This function always returns 0, so there is no need to return a value. Signed-off-by: Yoav Cohen Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ec96d5afad7a..73490890242b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3459,10 +3459,9 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct ublk_device *ub) +static void ublk_ctrl_stop_dev(struct ublk_device *ub) { ublk_stop_dev(ub); - return 0; } static int ublk_ctrl_get_dev_info(struct ublk_device *ub, @@ -3935,7 +3934,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_start_dev(ub, header); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(ub); + ublk_ctrl_stop_dev(ub); + ret = 0; break; case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: From 93ada1b3da398b492c45429cef1a1c9651d5c7ba Mon Sep 17 00:00:00 2001 From: Yoav Cohen Date: Tue, 13 Jan 2026 00:05:01 +0200 Subject: [PATCH 046/162] ublk: 
add UBLK_CMD_TRY_STOP_DEV command Add a best-effort stop command, UBLK_CMD_TRY_STOP_DEV, which only stops a ublk device when it has no active openers. Unlike UBLK_CMD_STOP_DEV, this command does not disrupt existing users. New opens are blocked only after disk_openers has reached zero; if the device is busy, the command returns -EBUSY and leaves it running. The ub->block_open flag is used only to close a race with an in-progress open and does not otherwise change open behavior. Advertise support via the UBLK_F_SAFE_STOP_DEV feature flag. Signed-off-by: Yoav Cohen Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 44 +++++++++++++++++++++++++++++++++-- include/uapi/linux/ublk_cmd.h | 9 ++++++- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 73490890242b..aaf94d2fb789 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -56,6 +56,7 @@ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) +#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -76,7 +77,8 @@ | UBLK_F_QUIESCE \ | UBLK_F_PER_IO_DAEMON \ | UBLK_F_BUF_REG_OFF_DAEMON \ - | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0)) + | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? 
UBLK_F_INTEGRITY : 0) \ + | UBLK_F_SAFE_STOP_DEV) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -243,6 +245,8 @@ struct ublk_device { struct delayed_work exit_work; struct work_struct partition_scan_work; + bool block_open; /* protected by open_mutex */ + struct ublk_queue *queues[]; }; @@ -984,6 +988,9 @@ static int ublk_open(struct gendisk *disk, blk_mode_t mode) return -EPERM; } + if (ub->block_open) + return -ENXIO; + return 0; } @@ -3343,7 +3350,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK | UBLK_F_PER_IO_DAEMON | - UBLK_F_BUF_REG_OFF_DAEMON; + UBLK_F_BUF_REG_OFF_DAEMON | + UBLK_F_SAFE_STOP_DEV; /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | @@ -3464,6 +3472,34 @@ static void ublk_ctrl_stop_dev(struct ublk_device *ub) ublk_stop_dev(ub); } +static int ublk_ctrl_try_stop_dev(struct ublk_device *ub) +{ + struct gendisk *disk; + int ret = 0; + + disk = ublk_get_disk(ub); + if (!disk) + return -ENODEV; + + mutex_lock(&disk->open_mutex); + if (disk_openers(disk) > 0) { + ret = -EBUSY; + goto unlock; + } + ub->block_open = true; + /* release open_mutex as del_gendisk() will reacquire it */ + mutex_unlock(&disk->open_mutex); + + ublk_ctrl_stop_dev(ub); + goto out; + +unlock: + mutex_unlock(&disk->open_mutex); +out: + ublk_put_disk(disk); + return ret; +} + static int ublk_ctrl_get_dev_info(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header) { @@ -3859,6 +3895,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_END_USER_RECOVERY: case UBLK_CMD_UPDATE_SIZE: case UBLK_CMD_QUIESCE_DEV: + case UBLK_CMD_TRY_STOP_DEV: mask = MAY_READ | MAY_WRITE; break; default: @@ -3972,6 +4009,9 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_QUIESCE_DEV: ret = 
ublk_ctrl_quiesce_dev(ub, header); break; + case UBLK_CMD_TRY_STOP_DEV: + ret = ublk_ctrl_try_stop_dev(ub); + break; default: ret = -EOPNOTSUPP; break; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 61ac5d8e1078..90f47da4f435 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -55,7 +55,8 @@ _IOWR('u', 0x15, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_QUIESCE_DEV \ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) - +#define UBLK_U_CMD_TRY_STOP_DEV \ + _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags @@ -321,6 +322,12 @@ */ #define UBLK_F_INTEGRITY (1ULL << 16) +/* + * The device supports the UBLK_CMD_TRY_STOP_DEV command, which + * allows stopping the device only if there are no openers. + */ +#define UBLK_F_SAFE_STOP_DEV (1ULL << 17) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 From 65955a0993a0a9536263fea2eaae8aed496dcc9c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 13 Jan 2026 00:05:02 +0200 Subject: [PATCH 047/162] selftests: ublk: add stop command with --safe option Add 'stop' subcommand to kublk utility that uses the new UBLK_CMD_TRY_STOP_DEV command when --safe option is specified. This allows stopping a device only if it has no active openers, returning -EBUSY otherwise. Also add test_generic_16.sh to test the new functionality. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/kublk.c | 53 +++++++++++++++++ tools/testing/selftests/ublk/kublk.h | 1 + .../testing/selftests/ublk/test_generic_16.sh | 57 +++++++++++++++++++ 4 files changed, 112 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_generic_16.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 036a9f01b464..3a2498089b15 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -23,6 +23,7 @@ TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh +TEST_PROGS += test_generic_16.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index d95937dd6167..3472ce7426ba 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -108,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev) return __ublk_ctrl_cmd(dev, &data); } +static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_TRY_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + static int ublk_ctrl_start_dev(struct ublk_dev *dev, int daemon_pid) { @@ -1424,6 +1433,42 @@ static int cmd_dev_del(struct dev_ctx *ctx) return 0; } +static int cmd_dev_stop(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + if (number < 0) { + ublk_err("%s: device id is required\n", __func__); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + if (ctx->safe_stop) { + ret = ublk_ctrl_try_stop_dev(dev); + if (ret < 0) + ublk_err("%s: try_stop dev %d failed ret %d\n", + __func__, number, ret); + } else { + 
ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", + __func__, number, ret); + } + +fail: + ublk_ctrl_deinit(dev); + + return ret; +} + static int __cmd_dev_list(struct dev_ctx *ctx) { struct ublk_dev *dev = ublk_ctrl_init(); @@ -1487,6 +1532,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), + FEAT_NAME(UBLK_F_SAFE_STOP_DEV) }; struct ublk_dev *dev; __u64 features = 0; @@ -1616,6 +1662,8 @@ static int cmd_dev_help(char *exe) printf("%s del [-n dev_id] -a \n", exe); printf("\t -a delete all devices -n delete specified device\n\n"); + printf("%s stop -n dev_id [--safe]\n", exe); + printf("\t --safe only stop if device has no active openers\n\n"); printf("%s list [-n dev_id] -a \n", exe); printf("\t -a list all devices, -n list specified device, default -a \n\n"); printf("%s features\n", exe); @@ -1653,6 +1701,7 @@ int main(int argc, char *argv[]) { "pi_offset", 1, NULL, 0 }, { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, + { "safe", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1760,6 +1809,8 @@ int main(int argc, char *argv[]) } if (!strcmp(longopts[option_idx].name, "tag_size")) ctx.tag_size = strtoul(optarg, NULL, 0); + if (!strcmp(longopts[option_idx].name, "safe")) + ctx.safe_stop = 1; break; case '?': /* @@ -1842,6 +1893,8 @@ int main(int argc, char *argv[]) } } else if (!strcmp(cmd, "del")) ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "stop")) + ret = cmd_dev_stop(&ctx); else if (!strcmp(cmd, "list")) { ctx.all = 1; ret = cmd_dev_list(&ctx); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 96c66b337bc0..cb757fd9bf9d 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -83,6 +83,7 @@ struct dev_ctx { __u8 pi_offset; __u8 csum_type; __u8 tag_size; + unsigned int safe_stop:1; int _evtfd; int _shmid; 
diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh new file mode 100755 index 000000000000..e08af7b685c9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_16" +ERR_CODE=0 + +_prep_test "null" "stop --safe command" + +# Check if SAFE_STOP_DEV feature is supported +if ! _have_feature "SAFE_STOP_DEV"; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# Test 1: stop --safe on idle device should succeed +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Device is idle (no openers), stop --safe should succeed +if ! ${UBLK_PROG} stop -n "${dev_id}" --safe; then + echo "stop --safe on idle device failed unexpectedly!" + ERR_CODE=255 +fi + +# Clean up device +${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +udevadm settle + +# Test 2: stop --safe on device with active opener should fail +dev_id=$(_add_ublk_dev -t null -q 2 -d 32) +_check_add_dev $TID $? + +# Open device in background (dd reads indefinitely) +dd if=/dev/ublkb${dev_id} of=/dev/null bs=4k iflag=direct > /dev/null 2>&1 & +dd_pid=$! + +# Give dd time to start +sleep 0.2 + +# Device has active opener, stop --safe should fail with -EBUSY +if ${UBLK_PROG} stop -n "${dev_id}" --safe 2>/dev/null; then + echo "stop --safe on busy device succeeded unexpectedly!" 
+ ERR_CODE=255 +fi + +# Kill dd and clean up +kill $dd_pid 2>/dev/null +wait $dd_pid 2>/dev/null + +# Now device should be idle, regular delete should work +${UBLK_PROG} del -n "${dev_id}" +udevadm settle + +_cleanup_test "null" +_show_result $TID $ERR_CODE From 91e1c1bcf0f2376f40ac859cf17d0a64a605e662 Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Mon, 12 Jan 2026 20:08:08 +0530 Subject: [PATCH 048/162] block, nvme: remove unused dma_iova_state function parameter DMA IOVA state is not used inside blk_rq_dma_map_iter_next, get rid of the argument. Signed-off-by: Nitesh Shetty Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-dma.c | 3 +-- drivers/nvme/host/pci.c | 5 ++--- include/linux/blk-mq-dma.h | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c index fb018fffffdc..4afeda45df15 100644 --- a/block/blk-mq-dma.c +++ b/block/blk-mq-dma.c @@ -238,7 +238,6 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * blk_rq_dma_map_iter_next - map the next DMA segment for a request * @req: request to map * @dma_dev: device to map to - * @state: DMA IOVA state * @iter: block layer DMA iterator * * Iterate to the next mapping after a previous call to @@ -253,7 +252,7 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); * returned in @iter.status. 
*/ bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter) + struct blk_dma_iter *iter) { struct phys_vec vec; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0e4caeab739c..9fc4a60280a0 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -823,7 +823,7 @@ static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev, if (iter->len) return true; - if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter)) + if (!blk_rq_dma_map_iter_next(req, dma_dev, iter)) return false; if (!dma_use_iova(&iod->dma_state) && dma_need_unmap(dma_dev)) { iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr; @@ -1010,8 +1010,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, } nvme_pci_sgl_set_data(&sg_list[mapped++], iter); iod->total_len += iter->len; - } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state, - iter)); + } while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, iter)); nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); if (unlikely(iter->status)) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index cb88fc791fbd..214c181ff2c9 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -28,7 +28,7 @@ struct blk_dma_iter { bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, struct dma_iova_state *state, struct blk_dma_iter *iter); bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, struct blk_dma_iter *iter); + struct blk_dma_iter *iter); /** * blk_rq_dma_map_coalesce - were all segments coalesced? 
From 41ee77b75308354054f4fe03a05b8016a0d41573 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 6 Jan 2026 16:00:56 +0900 Subject: [PATCH 049/162] block: fix blk_zone_cond_str() comment Fix the comment for blk_zone_cond_str() by replacing the meaningless BLK_ZONE_ZONE_XXX comment with the correct BLK_ZONE_COND_name, thus also replacing the XXX with what that actually means. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 10 +++++----- include/linux/blkdev.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1c54678fae6b..ef3872c53244 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -112,12 +112,12 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) /** - * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. - * @zone_cond: BLK_ZONE_COND_XXX. + * blk_zone_cond_str - Return a zone condition name string + * @zone_cond: a zone condition BLK_ZONE_COND_name * - * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX - * into string format. Useful in the debugging and tracing zone conditions. For - * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". + * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful + * for debugging and tracing zone conditions. For an invalid zone + * condition, the string "UNKNOWN" is returned. 
*/ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 72e34acd439c..63affe898059 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_queue; /* this is never NULL */ } -/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ +/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int bio_zone_no(struct bio *bio) From 5e35a24c96185e1be4c24a713e53a49e92ab925b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 6 Jan 2026 16:00:57 +0900 Subject: [PATCH 050/162] block: improve blk_op_str() comment Replace XXX with what it actually means. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++----- include/linux/blkdev.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a0bf5174e9e9..d6732dc69dd9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -114,12 +114,12 @@ static const char *const blk_op_name[] = { #undef REQ_OP_NAME /** - * blk_op_str - Return string XXX in the REQ_OP_XXX. - * @op: REQ_OP_XXX. + * blk_op_str - Return the string "name" for an operation REQ_OP_name. + * @op: a request operation. * - * Description: Centralize block layer function to convert REQ_OP_XXX into - * string format. Useful in the debugging and tracing bio or request. For - * invalid REQ_OP_XXX it returns string "UNKNOWN". + * Convert a request operation REQ_OP_name into the string "name". Useful for + * debugging and tracing BIOs and requests. For an invalid request operation + * code, the string "UNKNOWN" is returned. 
*/ inline const char *blk_op_str(enum req_op op) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 63affe898059..438c4946b6e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); -/* Helper to convert REQ_OP_XXX to its string format XXX */ +/* Convert a request operation REQ_OP_name into the string "name" */ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); From 5e2fde1a9433efc484a5feec36f748aa3ea58c85 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:37 +0800 Subject: [PATCH 051/162] block: pass io_comp_batch to rq_end_io_fn callback Add a third parameter 'const struct io_comp_batch *' to the rq_end_io_fn callback signature. This allows end_io handlers to access the completion batch context when requests are completed via blk_mq_end_request_batch(). The io_comp_batch is passed from blk_mq_end_request_batch(), while NULL is passed from __blk_mq_end_request() and blk_mq_put_rq_ref() which don't have batch context. This infrastructure change enables drivers to detect whether they're being called from a batched completion path (like iopoll) and access additional context stored in the io_comp_batch. 
Update all rq_end_io_fn implementations: - block/blk-mq.c: blk_end_sync_rq - block/blk-flush.c: flush_end_io, mq_flush_data_end_io - drivers/nvme/host/ioctl.c: nvme_uring_cmd_end_io - drivers/nvme/host/core.c: nvme_keep_alive_end_io - drivers/nvme/host/pci.c: abort_endio, nvme_del_queue_end, nvme_del_cq_end - drivers/nvme/target/passthru.c: nvmet_passthru_req_done - drivers/scsi/scsi_error.c: eh_lock_door_done - drivers/scsi/sg.c: sg_rq_end_io - drivers/scsi/st.c: st_scsi_execute_end - drivers/target/target_core_pscsi.c: pscsi_req_done - drivers/md/dm-rq.c: end_clone_request Signed-off-by: Ming Lei Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- block/blk-flush.c | 6 ++++-- block/blk-mq.c | 9 +++++---- drivers/md/dm-rq.c | 3 ++- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/ioctl.c | 3 ++- drivers/nvme/host/pci.c | 11 +++++++---- drivers/nvme/target/passthru.c | 3 ++- drivers/scsi/scsi_error.c | 3 ++- drivers/scsi/sg.c | 6 ++++-- drivers/scsi/st.c | 3 ++- drivers/target/target_core_pscsi.c | 6 ++++-- include/linux/blk-mq.h | 4 +++- 12 files changed, 39 insertions(+), 21 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 43d6152897a4..403a46c86411 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq, } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = flush_rq->q; struct list_head *running; @@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.c b/block/blk-mq.c index a29d8ac9d3e3..cf1daedbb39f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ 
-1156,7 +1156,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (rq->end_io) { rq_qos_done(rq->q, rq); - if (rq->end_io(rq, error) == RQ_END_IO_FREE) + if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); @@ -1211,7 +1211,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) * If end_io handler returns NONE, then it still has * ownership of the request. */ - if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); @@ -1458,7 +1458,8 @@ struct blk_rq_wait { blk_status_t ret; }; -static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret, + const struct io_comp_batch *iob) { struct blk_rq_wait *wait = rq->end_io_data; @@ -1688,7 +1689,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { - if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index a6ca92049c10..e9a7563b4b2f 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error) } static enum rq_end_io_ret end_clone_request(struct request *clone, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct dm_rq_target_io *tio = clone->end_io_data; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7bf228df6001..19b67cf5d550 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) } static enum rq_end_io_ret 
nvme_keep_alive_end_io(struct request *rq, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index a9c097dacad6..e45ac0ca174e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) } static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, - blk_status_t err) + blk_status_t err, + const struct io_comp_batch *iob) { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 065555576d2f..d87c56c62861 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1615,7 +1615,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error) +static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; @@ -2858,7 +2859,8 @@ out_unlock: } static enum rq_end_io_ret nvme_del_queue_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; @@ -2868,14 +2870,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req, } static enum rq_end_io_ret nvme_del_cq_end(struct request *req, - blk_status_t error) + blk_status_t error, + const struct io_comp_batch *iob) { struct nvme_queue *nvmeq = req->end_io_data; if (error) set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags); - return nvme_del_queue_end(req, error); + return nvme_del_queue_end(req, error, iob); } static int nvme_delete_queue(struct 
nvme_queue *nvmeq, u8 opcode) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 96648ec2fadb..0823c87637d3 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) } static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq, - blk_status_t blk_status) + blk_status_t blk_status, + const struct io_comp_batch *iob) { struct nvmet_req *req = rq->end_io_data; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f869108fd969..1e93390c5a82 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -2085,7 +2085,8 @@ maybe_retry: } static enum rq_end_io_ret eh_lock_door_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { blk_mq_free_request(req); return RQ_END_IO_NONE; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 57fba34832ad..1a521f9d821a 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */ } Sg_device; /* tasklet or soft irq callback */ -static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status); +static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob); static int sg_start_req(Sg_request *srp, unsigned char *cmd); static int sg_finish_rem_req(Sg_request * srp); static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size); @@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work) * level when a command is completed (or has failed). 
*/ static enum rq_end_io_ret -sg_rq_end_io(struct request *rq, blk_status_t status) +sg_rq_end_io(struct request *rq, blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); struct sg_request *srp = rq->end_io_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 168f25e4aaa3..8aeaa3b68c25 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req) } static enum rq_end_io_ret st_scsi_execute_end(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct st_request *SRpnt = req->end_io_data; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index db4e09042469..823b2665f95b 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev) } static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd); -static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t); +static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t, + const struct io_comp_batch *); /* pscsi_attach_hba(): * @@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) } static enum rq_end_io_ret pscsi_req_done(struct request *req, - blk_status_t status) + blk_status_t status, + const struct io_comp_batch *iob) { struct se_cmd *cmd = req->end_io_data; struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cae9e857aea4..18a2388ba581 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -13,6 +13,7 @@ struct blk_mq_tags; struct blk_flush_queue; +struct io_comp_batch; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 @@ -22,7 +23,8 @@ enum rq_end_io_ret { RQ_END_IO_FREE, }; -typedef enum rq_end_io_ret 
(rq_end_io_fn)(struct request *, blk_status_t); +typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t, + const struct io_comp_batch *); /* * request flags */ From f7bc22ca0d55bdcb59e3a4a028fb811d23e53959 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 15:46:38 +0800 Subject: [PATCH 052/162] nvme/io_uring: optimize IOPOLL completions for local ring context When multiple io_uring rings poll on the same NVMe queue, one ring can find completions belonging to another ring. The current code always uses task_work to handle this, but this adds overhead for the common single-ring case. This patch passes the polling io_ring_ctx through io_comp_batch's new poll_ctx field. In io_do_iopoll(), the polling ring's context is stored in iob.poll_ctx before calling the iopoll callbacks. In nvme_uring_cmd_end_io(), we now compare iob->poll_ctx with the request's owning io_ring_ctx (via io_uring_cmd_ctx_handle()). If they match (local context), we complete inline with io_uring_cmd_done32(). If they differ (remote context) or iob is NULL (non-iopoll path), we use task_work as before. This optimization eliminates task_work scheduling overhead for the common case where a ring polls and finds its own completions. 
~10% IOPS improvement is observed in the following benchmark: fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -O0 -P1 -u1 -n1 /dev/ng0n1 Signed-off-by: Ming Lei Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 20 +++++++++++++------- include/linux/blkdev.h | 1 + io_uring/rw.c | 6 ++++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e45ac0ca174e..fb62633ccbb0 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -426,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * IOPOLL could potentially complete this request directly, but - * if multiple rings are polling on the same queue, then it's possible - * for one ring to find completions for another ring. Punting the - * completion via task_work will always direct it to the right - * location, rather than potentially complete requests for ringA - * under iopoll invocations from ringB. + * For IOPOLL, check if this completion is happening in the context + * of the same io_ring that owns the request (local context). If so, + * we can complete inline without task_work overhead. Otherwise, we + * must punt to task_work to ensure completion happens in the correct + * ring's context. 
*/ - io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + if (blk_rq_is_poll(req) && iob && + iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0); + } else { + io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 438c4946b6e5..251e0f538c4c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1822,6 +1822,7 @@ struct io_comp_batch { struct rq_list req_list; bool need_ts; void (*complete)(struct io_comp_batch *); + void *poll_ctx; }; static inline bool blk_atomic_write_start_sect_aligned(sector_t sector, diff --git a/io_uring/rw.c b/io_uring/rw.c index 70ca88cc1f54..ff3192f603f3 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1320,6 +1320,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) DEFINE_IO_COMP_BATCH(iob); int nr_events = 0; + /* + * Store the polling io_ring_ctx so drivers can detect if they're + * completing a request in the same ring context that's polling. + */ + iob.poll_ctx = ctx; + /* * Only spin for completions if we don't have multiple devices hanging * off our complete list. From d7a4693a250ee2f185ce5c878e74252e533ac4b9 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Tue, 20 Jan 2026 16:38:18 +0800 Subject: [PATCH 053/162] rust: block: mq: use pin_init::zeroed() for queue_limits Replace unsafe core::mem::zeroed() with pin_init::zeroed() for queue_limits initialization. 
Signed-off-by: Ke Sun Acked-by: Andreas Hindborg Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20260120083824.477339-3-sunke@kylinos.cn Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 1ce815c8cdab..c8b0ecb17082 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -107,8 +107,7 @@ impl GenDiskBuilder { drop(unsafe { T::QueueData::from_foreign(data) }); }); - // SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed. - let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() }; + let mut lim: bindings::queue_limits = pin_init::zeroed(); lim.logical_block_size = self.logical_block_size; lim.physical_block_size = self.physical_block_size; From 880528eaa67fc6446a0b5c16757f0d6a2639ccda Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Tue, 20 Jan 2026 16:38:19 +0800 Subject: [PATCH 054/162] rust: block: mq: use pin_init::zeroed() for tag_set Replace unsafe core::mem::zeroed() with pin_init::zeroed() for blk_mq_tag_set initialization. Signed-off-by: Ke Sun Acked-by: Andreas Hindborg Reviewed-by: Gary Guo Link: https://lore.kernel.org/r/20260120083824.477339-4-sunke@kylinos.cn Signed-off-by: Jens Axboe --- rust/kernel/block/mq/tag_set.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs index c3cf56d52bee..dae9df408a86 100644 --- a/rust/kernel/block/mq/tag_set.rs +++ b/rust/kernel/block/mq/tag_set.rs @@ -38,9 +38,7 @@ impl TagSet { num_tags: u32, num_maps: u32, ) -> impl PinInit { - // SAFETY: `blk_mq_tag_set` only contains integers and pointers, which - // all are allowed to be 0. 
- let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() }; + let tag_set: bindings::blk_mq_tag_set = pin_init::zeroed(); let tag_set: Result<_> = core::mem::size_of::() .try_into() .map(|cmd_size| { From fb027d569422efcf1b367441f782fb425a4b5569 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:34 +0800 Subject: [PATCH 055/162] ublk: define ublk_ch_batch_io_fops for the coming feature F_BATCH_IO Introduces the basic structure for a batched I/O feature in the ublk driver. It adds placeholder functions and a new file operations structure, ublk_ch_batch_io_fops, which will be used for fetching and committing I/O commands in batches. Currently, the feature is disabled. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index aaf94d2fb789..f6a4b222c71a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -263,6 +263,11 @@ static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); +static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) +{ + return false; +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -2679,6 +2684,12 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} + static inline bool ublk_check_ubuf_dir(const struct request *req, int ubuf_dir) { @@ -2798,6 +2809,16 @@ static const struct file_operations ublk_ch_fops = { .mmap = ublk_ch_mmap, }; +static const struct file_operations ublk_ch_batch_io_fops = { 
+ .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, + .uring_cmd = ublk_ch_batch_io_uring_cmd, + .mmap = ublk_ch_mmap, +}; + static void ublk_deinit_queue(struct ublk_device *ub, int q_id) { struct ublk_queue *ubq = ub->queues[q_id]; @@ -2958,7 +2979,10 @@ static int ublk_add_chdev(struct ublk_device *ub) if (ret) goto fail; - cdev_init(&ub->cdev, &ublk_ch_fops); + if (ublk_dev_support_batch_io(ub)) + cdev_init(&ub->cdev, &ublk_ch_batch_io_fops); + else + cdev_init(&ub->cdev, &ublk_ch_fops); ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; From 7ba62f5969defbab7df47c0016ffd4dedf30950a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:35 +0800 Subject: [PATCH 056/162] ublk: prepare for not tracking task context for command batch batch io is designed to be independent of task context, and we will not track task context for batch io feature. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f6a4b222c71a..0f9fcd16258b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2423,7 +2423,10 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, ublk_fill_io_cmd(io, cmd); - WRITE_ONCE(io->task, get_task_struct(current)); + if (ublk_dev_support_batch_io(ub)) + WRITE_ONCE(io->task, NULL); + else + WRITE_ONCE(io->task, get_task_struct(current)); ublk_mark_io_ready(ub); return 0; From e86f89ab24f5ec595879a01eebb5df84f5ed6d2b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:36 +0800 Subject: [PATCH 057/162] ublk: add new batch command UBLK_U_IO_PREP_IO_CMDS & UBLK_U_IO_COMMIT_IO_CMDS Add new command UBLK_U_IO_PREP_IO_CMDS, which is the batch version of UBLK_IO_FETCH_REQ. 
Add new command UBLK_U_IO_COMMIT_IO_CMDS, which is for committing io command result only, still the batch version. The new command header type is `struct ublk_batch_io`. This patch doesn't actually implement these commands yet, just validates the SQE fields. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 87 ++++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 49 ++++++++++++++++++++ 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0f9fcd16258b..22c7296d90f3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -91,6 +91,11 @@ UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT | \ UBLK_PARAM_TYPE_INTEGRITY) +#define UBLK_BATCH_F_ALL \ + (UBLK_BATCH_F_HAS_ZONE_LBA | \ + UBLK_BATCH_F_HAS_BUF_ADDR | \ + UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -114,6 +119,13 @@ struct ublk_uring_cmd_pdu { u16 tag; }; +struct ublk_batch_io_data { + struct ublk_device *ub; + struct io_uring_cmd *cmd; + struct ublk_batch_io header; + unsigned int issue_flags; +}; + /* * io command is active: sqe cmd is received, and its cqe isn't done * @@ -2687,10 +2699,83 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) +{ + unsigned elem_bytes = sizeof(struct ublk_elem_header); + + if (uc->flags & ~UBLK_BATCH_F_ALL) + return -EINVAL; + + /* UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK requires buffer index */ + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)) + return -EINVAL; + + elem_bytes += (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA ? sizeof(u64) : 0) + + (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR ? 
sizeof(u64) : 0); + if (uc->elem_bytes != elem_bytes) + return -EINVAL; + return 0; +} + +static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) +{ + + const struct ublk_batch_io *uc = &data->header; + + if (uc->nr_elem > data->ub->dev_info.queue_depth) + return -E2BIG; + + if ((uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) && + !ublk_dev_is_zoned(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) && + !ublk_dev_need_map_io(data->ub)) + return -EINVAL; + + if ((uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) && + !ublk_dev_support_auto_buf_reg(data->ub)) + return -EINVAL; + + return ublk_check_batch_cmd_flags(uc); +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - return -EOPNOTSUPP; + const struct ublk_batch_io *uc = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + struct ublk_batch_io_data data = { + .ub = ub, + .cmd = cmd, + .header = (struct ublk_batch_io) { + .q_id = READ_ONCE(uc->q_id), + .flags = READ_ONCE(uc->flags), + .nr_elem = READ_ONCE(uc->nr_elem), + .elem_bytes = READ_ONCE(uc->elem_bytes), + }, + .issue_flags = issue_flags, + }; + u32 cmd_op = cmd->cmd_op; + int ret = -EINVAL; + + if (data.header.q_id >= ub->dev_info.nr_hw_queues) + goto out; + + switch (cmd_op) { + case UBLK_U_IO_PREP_IO_CMDS: + case UBLK_U_IO_COMMIT_IO_CMDS: + ret = ublk_check_batch_cmd(&data); + if (ret) + goto out; + ret = -EOPNOTSUPP; + break; + default: + ret = -EOPNOTSUPP; + } +out: + return ret; } static inline bool ublk_check_ubuf_dir(const struct request *req, diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 90f47da4f435..0cc58e19d401 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -103,6 +103,10 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) +#define UBLK_U_IO_PREP_IO_CMDS \ + _IOWR('u', 0x25, struct ublk_batch_io) 
+#define UBLK_U_IO_COMMIT_IO_CMDS \ + _IOWR('u', 0x26, struct ublk_batch_io) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 @@ -544,6 +548,51 @@ struct ublksrv_io_cmd { }; }; +struct ublk_elem_header { + __u16 tag; /* IO tag */ + + /* + * Buffer index for incoming io command, only valid iff + * UBLK_F_AUTO_BUF_REG is set + */ + __u16 buf_index; + __s32 result; /* I/O completion result (commit only) */ +}; + +/* + * uring_cmd buffer structure for batch commands + * + * buffer includes multiple elements, which number is specified by + * `nr_elem`. Each element buffer is organized in the following order: + * + * struct ublk_elem_buffer { + * // Mandatory fields (8 bytes) + * struct ublk_elem_header header; + * + * // Optional fields (8 bytes each, included based on flags) + * + * // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data + * // between ublk request and ublk server buffer + * __u64 buf_addr; + * + * // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA) + * __u64 zone_lba; + * } + * + * Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS` + */ +struct ublk_batch_io { + __u16 q_id; +#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0) +#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1) +#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2) + __u16 flags; + __u16 nr_elem; + __u8 elem_bytes; + __u8 reserved; + __u64 reserved2; +}; + struct ublk_param_basic { #define UBLK_ATTR_READ_ONLY (1 << 0) #define UBLK_ATTR_ROTATIONAL (1 << 1) From b256795b3606e9a67c725dde8eaae91dd9d21de4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:37 +0800 Subject: [PATCH 058/162] ublk: handle UBLK_U_IO_PREP_IO_CMDS This commit implements the handling of the UBLK_U_IO_PREP_IO_CMDS command, which allows userspace to prepare a batch of I/O requests. The core of this change is the `ublk_walk_cmd_buf` function, which iterates over the elements in the uring_cmd fixed buffer. 
For each element, it parses the I/O details, finds the corresponding `ublk_io` structure, and prepares it for future dispatch. Add per-io lock for protecting concurrent delivery and committing. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 191 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 5 + 2 files changed, 195 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 22c7296d90f3..a3840b3f1081 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -208,6 +208,7 @@ struct ublk_io { unsigned task_registered_buffers; void *buf_ctx_handle; + spinlock_t lock; } ____cacheline_aligned_in_smp; struct ublk_queue { @@ -280,6 +281,16 @@ static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) return false; } +static inline void ublk_io_lock(struct ublk_io *io) +{ + spin_lock(&io->lock); +} + +static inline void ublk_io_unlock(struct ublk_io *io) +{ + spin_unlock(&io->lock); +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -2699,6 +2710,171 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return ublk_ch_uring_cmd_local(cmd, issue_flags); } +static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR) + return *(const __u64 *)(buf + sizeof(*elem)); + return 0; +} + +static struct ublk_auto_buf_reg +ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + struct ublk_auto_buf_reg reg = { + .index = elem->buf_index, + .flags = (uc->flags & UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) ? 
+ UBLK_AUTO_BUF_REG_FALLBACK : 0, + }; + + return reg; +} + +/* + * 48 can hold any type of buffer element(8, 16 and 24 bytes) because + * it is the least common multiple(LCM) of 8, 16 and 24 + */ +#define UBLK_CMD_BATCH_TMP_BUF_SZ (48 * 10) +struct ublk_batch_io_iter { + void __user *uaddr; + unsigned done, total; + unsigned char elem_bytes; + /* copy to this buffer from user space */ + unsigned char buf[UBLK_CMD_BATCH_TMP_BUF_SZ]; +}; + +static inline int +__ublk_walk_cmd_buf(struct ublk_queue *ubq, + struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + unsigned bytes, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < bytes; i += iter->elem_bytes) { + const struct ublk_elem_header *elem = + (const struct ublk_elem_header *)&iter->buf[i]; + + if (unlikely(elem->tag >= data->ub->dev_info.queue_depth)) { + ret = -EINVAL; + break; + } + + ret = cb(ubq, data, elem); + if (unlikely(ret)) + break; + } + + iter->done += i; + return ret; +} + +static int ublk_walk_cmd_buf(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data, + int (*cb)(struct ublk_queue *q, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem)) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + int ret = 0; + + while (iter->done < iter->total) { + unsigned int len = min(sizeof(iter->buf), iter->total - iter->done); + + if (copy_from_user(iter->buf, iter->uaddr + iter->done, len)) { + pr_warn("ublk%d: read batch cmd buffer failed\n", + data->ub->dev_info.dev_id); + return -EFAULT; + } + + ret = __ublk_walk_cmd_buf(ubq, iter, data, len, cb); + if (ret) + return ret; + } + return 0; +} + +static int ublk_batch_unprep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + + 
data->ub->nr_io_ready--; + ublk_io_lock(io); + io->flags = 0; + ublk_io_unlock(io); + return 0; +} + +static void ublk_batch_revert_prep_cmd(struct ublk_batch_io_iter *iter, + const struct ublk_batch_io_data *data) +{ + int ret; + + /* Re-process only what we've already processed, starting from beginning */ + iter->total = iter->done; + iter->done = 0; + + ret = ublk_walk_cmd_buf(iter, data, ublk_batch_unprep_io); + WARN_ON_ONCE(ret); +} + +static int ublk_batch_prep_io(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + union ublk_io_buf buf = { 0 }; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + else if (ublk_dev_need_map_io(data->ub)) { + buf.addr = ublk_batch_buf_addr(uc, elem); + + ret = ublk_check_fetch_buf(data->ub, buf.addr); + if (ret) + return ret; + } + + ublk_io_lock(io); + ret = __ublk_fetch(data->cmd, data->ub, io); + if (!ret) + io->buf = buf; + ublk_io_unlock(io); + + return ret; +} + +static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + mutex_lock(&data->ub->mutex); + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_prep_io); + + if (ret && iter.done) + ublk_batch_revert_prep_cmd(&iter, data); + mutex_unlock(&data->ub->mutex); + return ret; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { unsigned elem_bytes = sizeof(struct ublk_elem_header); @@ -2765,6 +2941,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: + ret = ublk_check_batch_cmd(&data); 
+ if (ret) + goto out; + ret = ublk_handle_batch_prep_cmd(&data); + break; case UBLK_U_IO_COMMIT_IO_CMDS: ret = ublk_check_batch_cmd(&data); if (ret) @@ -2952,7 +3133,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq; struct page *page; int numa_node; - int size; + int size, i; /* Determine NUMA node based on queue's CPU affinity */ numa_node = ublk_get_queue_numa_node(ub, q_id); @@ -2977,6 +3158,9 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) } ubq->io_cmd_buf = page_address(page); + for (i = 0; i < ubq->q_depth; i++) + spin_lock_init(&ubq->ios[i].lock); + ub->queues[q_id] = ubq; ubq->dev = ub; return 0; @@ -3220,6 +3404,11 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, return -EINVAL; mutex_lock(&ub->mutex); + /* device may become not ready in case of F_BATCH */ + if (!ublk_dev_ready(ub)) { + ret = -EINVAL; + goto out_unlock; + } if (ub->dev_info.state == UBLK_S_DEV_LIVE || test_bit(UB_STATE_USED, &ub->state)) { ret = -EEXIST; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 0cc58e19d401..1a3d4d33c1d1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -103,6 +103,11 @@ _IOWR('u', 0x23, struct ublksrv_io_cmd) #define UBLK_U_IO_UNREGISTER_IO_BUF \ _IOWR('u', 0x24, struct ublksrv_io_cmd) + +/* + * return 0 if the command is run successfully, otherwise failure code + * is returned + */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) #define UBLK_U_IO_COMMIT_IO_CMDS \ From 1e500e106d5a82280db59dba06f0108085beba65 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:38 +0800 Subject: [PATCH 059/162] ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Handle UBLK_U_IO_COMMIT_IO_CMDS by walking the uring_cmd fixed buffer: - read each element into one temp buffer in batch style - parse and apply each element for committing io result Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- 
drivers/block/ublk_drv.c | 103 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 8 +++ 2 files changed, 109 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a3840b3f1081..162b46c74f16 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2267,7 +2267,7 @@ static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd return 0; } -static int ublk_handle_auto_buf_reg(struct ublk_io *io, +static void ublk_clear_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd, u16 *buf_idx) { @@ -2287,7 +2287,13 @@ static int ublk_handle_auto_buf_reg(struct ublk_io *io, if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd)) *buf_idx = io->buf.auto_reg.index; } +} +static int ublk_handle_auto_buf_reg(struct ublk_io *io, + struct io_uring_cmd *cmd, + u16 *buf_idx) +{ + ublk_clear_auto_buf_reg(io, cmd, buf_idx); return ublk_set_auto_buf_reg(io, cmd); } @@ -2720,6 +2726,17 @@ static inline __u64 ublk_batch_buf_addr(const struct ublk_batch_io *uc, return 0; } +static inline __u64 ublk_batch_zone_lba(const struct ublk_batch_io *uc, + const struct ublk_elem_header *elem) +{ + const void *buf = elem; + + if (uc->flags & UBLK_BATCH_F_HAS_ZONE_LBA) + return *(const __u64 *)(buf + sizeof(*elem) + + 8 * !!(uc->flags & UBLK_BATCH_F_HAS_BUF_ADDR)); + return -1; +} + static struct ublk_auto_buf_reg ublk_batch_auto_buf_reg(const struct ublk_batch_io *uc, const struct ublk_elem_header *elem) @@ -2875,6 +2892,84 @@ static int ublk_handle_batch_prep_cmd(const struct ublk_batch_io_data *data) return ret; } +static int ublk_batch_commit_io_check(const struct ublk_queue *ubq, + struct ublk_io *io, + union ublk_io_buf *buf) +{ + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + return -EBUSY; + + /* BATCH_IO doesn't support UBLK_F_NEED_GET_DATA */ + if (ublk_need_map_io(ubq) && !buf->addr) + return -EINVAL; + return 0; +} + +static int ublk_batch_commit_io(struct ublk_queue *ubq, + const struct 
ublk_batch_io_data *data, + const struct ublk_elem_header *elem) +{ + struct ublk_io *io = &ubq->ios[elem->tag]; + const struct ublk_batch_io *uc = &data->header; + u16 buf_idx = UBLK_INVALID_BUF_IDX; + union ublk_io_buf buf = { 0 }; + struct request *req = NULL; + bool auto_reg = false; + bool compl = false; + int ret; + + if (ublk_dev_support_auto_buf_reg(data->ub)) { + buf.auto_reg = ublk_batch_auto_buf_reg(uc, elem); + auto_reg = true; + } else if (ublk_dev_need_map_io(data->ub)) + buf.addr = ublk_batch_buf_addr(uc, elem); + + ublk_io_lock(io); + ret = ublk_batch_commit_io_check(ubq, io, &buf); + if (!ret) { + io->res = elem->result; + io->buf = buf; + req = ublk_fill_io_cmd(io, data->cmd); + + if (auto_reg) + ublk_clear_auto_buf_reg(io, data->cmd, &buf_idx); + compl = ublk_need_complete_req(data->ub, io); + } + ublk_io_unlock(io); + + if (unlikely(ret)) { + pr_warn_ratelimited("%s: dev %u queue %u io %u: commit failure %d\n", + __func__, data->ub->dev_info.dev_id, ubq->q_id, + elem->tag, ret); + return ret; + } + + /* can't touch 'ublk_io' any more */ + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); + if (req_op(req) == REQ_OP_ZONE_APPEND) + req->__sector = ublk_batch_zone_lba(uc, elem); + if (compl) + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub)); + return 0; +} + +static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + struct io_uring_cmd *cmd = data->cmd; + struct ublk_batch_io_iter iter = { + .uaddr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)), + .total = uc->nr_elem * uc->elem_bytes, + .elem_bytes = uc->elem_bytes, + }; + int ret; + + ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + + return iter.done == 0 ? 
ret : iter.done; +} + static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) { unsigned elem_bytes = sizeof(struct ublk_elem_header); @@ -2950,7 +3045,7 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_check_batch_cmd(&data); if (ret) goto out; - ret = -EOPNOTSUPP; + ret = ublk_handle_batch_commit_cmd(&data); break; default: ret = -EOPNOTSUPP; @@ -3659,6 +3754,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) UBLK_F_AUTO_BUF_REG)) ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* UBLK_F_BATCH_IO doesn't support GET_DATA */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* * Zoned storage support requires reuse `ublksrv_io_cmd->addr` for * returning write_append_lba, which is only allowed in case of diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 1a3d4d33c1d1..3894d676dd02 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -110,6 +110,14 @@ */ #define UBLK_U_IO_PREP_IO_CMDS \ _IOWR('u', 0x25, struct ublk_batch_io) +/* + * If failure code is returned, nothing in the command buffer is handled. + * Otherwise, the returned value means how many bytes in command buffer + * are handled actually, then number of handled IOs can be calculated with + * `elem_bytes` for each IO. IOs in the remaining bytes are not committed, + * userspace has to check return value for dealing with partial committing + * correctly. + */ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) From f1f99ddf607a4b54dc19d92fb07f2a30c131ee56 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:39 +0800 Subject: [PATCH 060/162] ublk: add io events fifo structure Add ublk io events fifo structure and prepare for supporting command batch, which will use io_uring multishot uring_cmd for fetching one batch of io commands each time. One nice feature of kfifo is to allow multiple producers vs single consumer. 
We just need to lock the producer side, meanwhile the single consumer can be lockless. The producer is actually from ublk_queue_rq() or ublk_queue_rqs(), so lock contention can be eased by setting proper blk-mq nr_queues. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 69 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 162b46c74f16..c88c4bc15b83 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -223,6 +224,24 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; + + /* + * For supporting UBLK_F_BATCH_IO only. + * + * Inflight ublk request tag is saved in this fifo + * + * There are multiple writers from ublk_queue_rq() or ublk_queue_rqs(), + * so lock is required for storing request tag to fifo + * + * Make sure just one reader for fetching request from task work + * function to ublk server, so no need to grab the lock in reader + * side. 
+ */ + struct { + DECLARE_KFIFO_PTR(evts_fifo, unsigned short); + spinlock_t evts_lock; + }____cacheline_aligned_in_smp; + struct ublk_io ios[] __counted_by(q_depth); }; @@ -291,6 +310,26 @@ static inline void ublk_io_unlock(struct ublk_io *io) spin_unlock(&io->lock); } +/* Initialize the event queue */ +static inline int ublk_io_evts_init(struct ublk_queue *q, unsigned int size, + int numa_node) +{ + spin_lock_init(&q->evts_lock); + return kfifo_alloc_node(&q->evts_fifo, size, GFP_KERNEL, numa_node); +} + +/* Check if event queue is empty */ +static inline bool ublk_io_evts_empty(const struct ublk_queue *q) +{ + return kfifo_is_empty(&q->evts_fifo); +} + +static inline void ublk_io_evts_deinit(struct ublk_queue *q) +{ + WARN_ON_ONCE(!kfifo_is_empty(&q->evts_fifo)); + kfifo_free(&q->evts_fifo); +} + static inline struct ublksrv_io_desc * ublk_get_iod(const struct ublk_queue *ubq, unsigned tag) { @@ -3183,14 +3222,10 @@ static const struct file_operations ublk_ch_batch_io_fops = { .mmap = ublk_ch_mmap, }; -static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +static void __ublk_deinit_queue(struct ublk_device *ub, struct ublk_queue *ubq) { - struct ublk_queue *ubq = ub->queues[q_id]; int size, i; - if (!ubq) - return; - size = ublk_queue_cmd_buf_size(ub); for (i = 0; i < ubq->q_depth; i++) { @@ -3204,7 +3239,20 @@ static void ublk_deinit_queue(struct ublk_device *ub, int q_id) if (ubq->io_cmd_buf) free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); + if (ublk_dev_support_batch_io(ub)) + ublk_io_evts_deinit(ubq); + kvfree(ubq); +} + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ub->queues[q_id]; + + if (!ubq) + return; + + __ublk_deinit_queue(ub, ubq); ub->queues[q_id] = NULL; } @@ -3228,7 +3276,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) struct ublk_queue *ubq; struct page *page; int numa_node; - int size, i; + int size, i, ret; /* Determine NUMA node based on queue's CPU 
affinity */ numa_node = ublk_get_queue_numa_node(ub, q_id); @@ -3256,9 +3304,18 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) for (i = 0; i < ubq->q_depth; i++) spin_lock_init(&ubq->ios[i].lock); + if (ublk_dev_support_batch_io(ub)) { + ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); + if (ret) + goto fail; + } ub->queues[q_id] = ubq; ubq->dev = ub; + return 0; +fail: + __ublk_deinit_queue(ub, ubq); + return ret; } static void ublk_deinit_queues(struct ublk_device *ub) From 7a1bb41947cee3aa50fa9b276e9aeb6caa87b543 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:40 +0800 Subject: [PATCH 061/162] ublk: add batch I/O dispatch infrastructure Add infrastructure for delivering I/O commands to ublk server in batches, preparing for the upcoming UBLK_U_IO_FETCH_IO_CMDS feature. Key components: - struct ublk_batch_fetch_cmd: Represents a batch fetch uring_cmd that will receive multiple I/O tags in a single operation, using io_uring's multishot command for efficient ublk IO delivery. - ublk_batch_dispatch(): Batch version of ublk_dispatch_req() that: * Pulls multiple request tags from the events FIFO (lock-free reader) * Prepares each I/O for delivery (including auto buffer registration) * Delivers tags to userspace via single uring_cmd notification * Handles partial failures by restoring undelivered tags to FIFO The batch approach significantly reduces notification overhead by aggregating multiple I/O completions into single uring_cmd, while maintaining the same I/O processing semantics as individual operations. Error handling ensures system consistency: if buffer selection or CQE posting fails, undelivered tags are restored to the FIFO for retry, meantime IO state has to be restored. This runs in task work context, scheduled via io_uring_cmd_complete_in_task() or called directly from ->uring_cmd(), enabling efficient batch processing without blocking the I/O submission path. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 195 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c88c4bc15b83..1b5721c7a536 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -97,6 +97,12 @@ UBLK_BATCH_F_HAS_BUF_ADDR | \ UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK) +/* ublk batch fetch uring_cmd */ +struct ublk_batch_fetch_cmd { + struct io_uring_cmd *cmd; + unsigned short buf_group; +}; + struct ublk_uring_cmd_pdu { /* * Store requests in same batch temporarily for queuing them to @@ -174,6 +180,9 @@ struct ublk_batch_io_data { */ #define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2) +/* used for UBLK_F_BATCH_IO only */ +#define UBLK_BATCH_IO_UNUSED_TAG ((unsigned short)-1) + union ublk_io_buf { __u64 addr; struct ublk_auto_buf_reg auto_reg; @@ -656,6 +665,32 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd, + int res) +{ + io_uring_cmd_done(fcmd->cmd, res, data->issue_flags); + fcmd->cmd = NULL; +} + +static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd, + struct io_br_sel *sel, + unsigned int issue_flags) +{ + if (io_uring_mshot_cmd_post_cqe(fcmd->cmd, sel, issue_flags)) + return -ENOBUFS; + return 0; +} + +static ssize_t ublk_batch_copy_io_tags(struct ublk_batch_fetch_cmd *fcmd, + void __user *buf, const u16 *tag_buf, + unsigned int len) +{ + if (copy_to_user(buf, tag_buf, len)) + return -EFAULT; + return len; +} + #define UBLK_MAX_UBLKS UBLK_MINORS /* @@ -1522,6 +1557,166 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) } } +static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short tag) +{ + struct ublk_device 
*ub = data->ub; + struct ublk_io *io = &ubq->ios[tag]; + struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + enum auto_buf_reg_res res = AUTO_BUF_REG_FALLBACK; + struct io_uring_cmd *cmd = data->cmd; + + if (!ublk_start_io(ubq, req, io)) + return false; + + if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { + res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + data->issue_flags); + + if (res == AUTO_BUF_REG_FAIL) + return false; + } + + ublk_io_lock(io); + ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_io_unlock(io); + + return true; +} + +static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, + unsigned int len) +{ + bool has_unused = false; + unsigned int i; + + for (i = 0; i < len; i++) { + unsigned short tag = tag_buf[i]; + + if (!__ublk_batch_prep_dispatch(ubq, data, tag)) { + tag_buf[i] = UBLK_BATCH_IO_UNUSED_TAG; + has_unused = true; + } + } + + return has_unused; +} + +/* + * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. + * Returns the new length after filtering. 
+ */ +static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, + unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; i < len; i++) { + if (tag_buf[i] != UBLK_BATCH_IO_UNUSED_TAG) { + if (i != j) + tag_buf[j] = tag_buf[i]; + j++; + } + } + + return j; +} + +#define MAX_NR_TAG 128 +static int __ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + const unsigned int tag_sz = sizeof(unsigned short); + unsigned short tag_buf[MAX_NR_TAG]; + struct io_br_sel sel; + size_t len = 0; + bool needs_filter; + int ret; + + sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len, + data->issue_flags); + if (sel.val < 0) + return sel.val; + if (!sel.addr) + return -ENOBUFS; + + /* single reader needn't lock and sizeof(kfifo element) is 2 bytes */ + len = min(len, sizeof(tag_buf)) / tag_sz; + len = kfifo_out(&ubq->evts_fifo, tag_buf, len); + + needs_filter = ublk_batch_prep_dispatch(ubq, data, tag_buf, len); + /* Filter out unused tags before posting to userspace */ + if (unlikely(needs_filter)) { + int new_len = ublk_filter_unused_tags(tag_buf, len); + + /* return actual length if all are failed or requeued */ + if (!new_len) { + /* release the selected buffer */ + sel.val = 0; + WARN_ON_ONCE(!io_uring_mshot_cmd_post_cqe(fcmd->cmd, + &sel, data->issue_flags)); + return len; + } + len = new_len; + } + + sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); + ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); + if (unlikely(ret < 0)) { + int i, res; + + /* + * Undo prep state for all IOs since userspace never received them. + * This restores IOs to pre-prepared state so they can be cleanly + * re-prepared when tags are pulled from FIFO again. 
+ */ + for (i = 0; i < len; i++) { + struct ublk_io *io = &ubq->ios[tag_buf[i]]; + int index = -1; + + ublk_io_lock(io); + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) + index = io->buf.auto_reg.index; + io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_io_unlock(io); + + if (index != -1) + io_buffer_unregister_bvec(data->cmd, index, + data->issue_flags); + } + + res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " + "tags(%d %zu) ret %d\n", __func__, res, len, + ret); + } + return ret; +} + +static __maybe_unused void +ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + int ret = 0; + + while (!ublk_io_evts_empty(ubq)) { + ret = __ublk_batch_dispatch(ubq, data, fcmd); + if (ret <= 0) + break; + } + + if (ret < 0) + ublk_batch_deinit_fetch_buf(data, fcmd, ret); +} + static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) { struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); From a4d88375539920b7401ead59d2f944ac23c668ea Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:41 +0800 Subject: [PATCH 062/162] ublk: add UBLK_U_IO_FETCH_IO_CMDS for batch I/O processing Add UBLK_U_IO_FETCH_IO_CMDS command to enable efficient batch processing of I/O requests. This multishot uring_cmd allows the ublk server to fetch multiple I/O commands in a single operation, significantly reducing submission overhead compared to individual FETCH_REQ* commands. Key Design Features: 1. Multishot Operation: One UBLK_U_IO_FETCH_IO_CMDS can fetch many I/O commands, with the batch size limited by the provided buffer length. 2. Dynamic Load Balancing: Multiple fetch commands can be submitted simultaneously, but only one is active at any time. This enables efficient load distribution across multiple server task contexts. 3. 
Implicit State Management: The implementation uses three key variables to track state: - evts_fifo: Queue of request tags awaiting processing - fcmd_head: List of available fetch commands - active_fcmd: Currently active fetch command (NULL = none active) States are derived implicitly: - IDLE: No fetch commands available - READY: Fetch commands available, none active - ACTIVE: One fetch command processing events 4. Lockless Reader Optimization: The active fetch command can read from evts_fifo without locking (single reader guarantee), while writers (ublk_queue_rq/ublk_queue_rqs) use evts_lock protection. The memory barrier pairing plays key role for the single lockless reader optimization. Implementation Details: - ublk_queue_rq() and ublk_queue_rqs() save request tags to evts_fifo - __ublk_acquire_fcmd() selects an available fetch command when events arrive and no command is currently active - ublk_batch_dispatch() moves tags from evts_fifo to the fetch command's buffer and posts completion via io_uring_mshot_cmd_post_cqe() - State transitions are coordinated via evts_lock to maintain consistency Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 394 +++++++++++++++++++++++++++++++++- include/uapi/linux/ublk_cmd.h | 7 + 2 files changed, 393 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1b5721c7a536..0a0210f9d417 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -99,6 +99,7 @@ /* ublk batch fetch uring_cmd */ struct ublk_batch_fetch_cmd { + struct list_head node; struct io_uring_cmd *cmd; unsigned short buf_group; }; @@ -123,7 +124,10 @@ struct ublk_uring_cmd_pdu { */ struct ublk_queue *ubq; - u16 tag; + union { + u16 tag; + struct ublk_batch_fetch_cmd *fcmd; /* batch io only */ + }; }; struct ublk_batch_io_data { @@ -245,10 +249,37 @@ struct ublk_queue { * Make sure just one reader for fetching request from task work * function 
to ublk server, so no need to grab the lock in reader * side. + * + * Batch I/O State Management: + * + * The batch I/O system uses implicit state management based on the + * combination of three key variables below. + * + * - IDLE: list_empty(&fcmd_head) && !active_fcmd + * No fetch commands available, events queue in evts_fifo + * + * - READY: !list_empty(&fcmd_head) && !active_fcmd + * Fetch commands available but none processing events + * + * - ACTIVE: active_fcmd + * One fetch command actively processing events from evts_fifo + * + * Key Invariants: + * - At most one active_fcmd at any time (single reader) + * - active_fcmd is always from fcmd_head list when non-NULL + * - evts_fifo can be read locklessly by the single active reader + * - All state transitions require evts_lock protection + * - Multiple writers to evts_fifo require lock protection */ struct { DECLARE_KFIFO_PTR(evts_fifo, unsigned short); spinlock_t evts_lock; + + /* List of fetch commands available to process events */ + struct list_head fcmd_head; + + /* Currently active fetch command (NULL = none active) */ + struct ublk_batch_fetch_cmd *active_fcmd; }____cacheline_aligned_in_smp; struct ublk_io ios[] __counted_by(q_depth); @@ -303,12 +334,20 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); static inline unsigned int ublk_req_build_flags(struct request *req); +static void ublk_batch_dispatch(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd); static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { return false; } +static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) +{ + return false; +} + static inline void ublk_io_lock(struct ublk_io *io) { spin_lock(&io->lock); @@ -664,13 +703,45 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed 
*/
 static DEFINE_MUTEX(ublk_ctl_mutex);
 
+static struct ublk_batch_fetch_cmd *
+ublk_batch_alloc_fcmd(struct io_uring_cmd *cmd)
+{
+	struct ublk_batch_fetch_cmd *fcmd = kzalloc(sizeof(*fcmd), GFP_NOIO);
-static void ublk_batch_deinit_fetch_buf(const struct ublk_batch_io_data *data,
+	if (fcmd) {
+		fcmd->cmd = cmd;
+		fcmd->buf_group = READ_ONCE(cmd->sqe->buf_index);
+	}
+	return fcmd;
+}
+
+static void ublk_batch_free_fcmd(struct ublk_batch_fetch_cmd *fcmd)
+{
+	kfree(fcmd);
+}
+
+static void __ublk_release_fcmd(struct ublk_queue *ubq)
+{
+	WRITE_ONCE(ubq->active_fcmd, NULL);
+}
+
+/*
+ * Nothing can move on, so clear ->active_fcmd, and the caller should stop
+ * dispatching
+ */
+static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq,
+					const struct ublk_batch_io_data *data,
 					struct ublk_batch_fetch_cmd *fcmd,
 					int res)
 {
+	spin_lock(&ubq->evts_lock);
+	list_del(&fcmd->node);
+	WARN_ON_ONCE(fcmd != ubq->active_fcmd);
+	__ublk_release_fcmd(ubq);
+	spin_unlock(&ubq->evts_lock);
+
 	io_uring_cmd_done(fcmd->cmd, res, data->issue_flags);
-	fcmd->cmd = NULL;
+	ublk_batch_free_fcmd(fcmd);
 }
 
 static int ublk_batch_fetch_post_cqe(struct ublk_batch_fetch_cmd *fcmd,
@@ -1637,6 +1708,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
 	bool needs_filter;
 	int ret;
 
+	WARN_ON_ONCE(data->cmd != fcmd->cmd);
+
 	sel = io_uring_cmd_buffer_select(fcmd->cmd, fcmd->buf_group, &len,
 					 data->issue_flags);
 	if (sel.val < 0)
@@ -1700,21 +1773,93 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
 	return ret;
 }
 
-static __maybe_unused void
+static struct ublk_batch_fetch_cmd *__ublk_acquire_fcmd(
+		struct ublk_queue *ubq)
+{
+	struct ublk_batch_fetch_cmd *fcmd;
+
+	lockdep_assert_held(&ubq->evts_lock);
+
+	/*
+	 * Ordering updating ubq->evts_fifo and checking ubq->active_fcmd.
+	 *
+	 * The pair is the smp_mb() in ublk_batch_dispatch().
+	 *
+	 * If ubq->active_fcmd is observed as non-NULL, the new added tags
+	 * can be visible in ublk_batch_dispatch() with the barrier pairing.
+ */ + smp_mb(); + if (READ_ONCE(ubq->active_fcmd)) { + fcmd = NULL; + } else { + fcmd = list_first_entry_or_null(&ubq->fcmd_head, + struct ublk_batch_fetch_cmd, node); + WRITE_ONCE(ubq->active_fcmd, fcmd); + } + return fcmd; +} + +static void ublk_batch_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_batch_io_data data = { + .ub = pdu->ubq->dev, + .cmd = fcmd->cmd, + .issue_flags = issue_flags, + }; + + WARN_ON_ONCE(pdu->ubq->active_fcmd != fcmd); + + ublk_batch_dispatch(pdu->ubq, &data, fcmd); +} + +static void ublk_batch_dispatch(struct ublk_queue *ubq, const struct ublk_batch_io_data *data, struct ublk_batch_fetch_cmd *fcmd) { + struct ublk_batch_fetch_cmd *new_fcmd; + unsigned tried = 0; int ret = 0; +again: while (!ublk_io_evts_empty(ubq)) { ret = __ublk_batch_dispatch(ubq, data, fcmd); if (ret <= 0) break; } - if (ret < 0) - ublk_batch_deinit_fetch_buf(data, fcmd, ret); + if (ret < 0) { + ublk_batch_deinit_fetch_buf(ubq, data, fcmd, ret); + return; + } + + __ublk_release_fcmd(ubq); + /* + * Order clearing ubq->active_fcmd from __ublk_release_fcmd() and + * checking ubq->evts_fifo. + * + * The pair is the smp_mb() in __ublk_acquire_fcmd(). 
+ */ + smp_mb(); + if (likely(ublk_io_evts_empty(ubq))) + return; + + spin_lock(&ubq->evts_lock); + new_fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (!new_fcmd) + return; + + /* Avoid lockup by allowing to handle at most 32 batches */ + if (new_fcmd == fcmd && tried++ < 32) + goto again; + + io_uring_cmd_complete_in_task(new_fcmd->cmd, ublk_batch_tw_cb); } static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) @@ -1726,6 +1871,21 @@ static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) ublk_dispatch_req(ubq, pdu->req); } +static void ublk_batch_queue_cmd(struct ublk_queue *ubq, struct request *rq, bool last) +{ + unsigned short tag = rq->tag; + struct ublk_batch_fetch_cmd *fcmd = NULL; + + spin_lock(&ubq->evts_lock); + kfifo_put(&ubq->evts_fifo, tag); + if (last) + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; @@ -1836,7 +1996,10 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } - ublk_queue_cmd(ubq, rq); + if (ublk_support_batch_io(ubq)) + ublk_batch_queue_cmd(ubq, rq, bd->last); + else + ublk_queue_cmd(ubq, rq); return BLK_STS_OK; } @@ -1848,6 +2011,19 @@ static inline bool ublk_belong_to_same_batch(const struct ublk_io *io, (io->task == io2->task); } +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct ublk_batch_fetch_cmd *fcmd; + + spin_lock(&ubq->evts_lock); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + static void ublk_queue_rqs(struct rq_list *rqlist) { struct rq_list requeue_list = { }; @@ -1876,6 +2052,57 @@ static void ublk_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } 
+static void ublk_batch_queue_cmd_list(struct ublk_queue *ubq, struct rq_list *l) +{ + unsigned short tags[MAX_NR_TAG]; + struct ublk_batch_fetch_cmd *fcmd; + struct request *rq; + unsigned cnt = 0; + + spin_lock(&ubq->evts_lock); + rq_list_for_each(l, rq) { + tags[cnt++] = (unsigned short)rq->tag; + if (cnt >= MAX_NR_TAG) { + kfifo_in(&ubq->evts_fifo, tags, cnt); + cnt = 0; + } + } + if (cnt) + kfifo_in(&ubq->evts_fifo, tags, cnt); + fcmd = __ublk_acquire_fcmd(ubq); + spin_unlock(&ubq->evts_lock); + + rq_list_init(l); + if (fcmd) + io_uring_cmd_complete_in_task(fcmd->cmd, ublk_batch_tw_cb); +} + +static void ublk_batch_queue_rqs(struct rq_list *rqlist) +{ + struct rq_list requeue_list = { }; + struct rq_list submit_list = { }; + struct ublk_queue *ubq = NULL; + struct request *req; + + while ((req = rq_list_pop(rqlist))) { + struct ublk_queue *this_q = req->mq_hctx->driver_data; + + if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) { + rq_list_add_tail(&requeue_list, req); + continue; + } + + if (ubq && this_q != ubq && !rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + ubq = this_q; + rq_list_add_tail(&submit_list, req); + } + + if (!rq_list_empty(&submit_list)) + ublk_batch_queue_cmd_list(ubq, &submit_list); + *rqlist = requeue_list; +} + static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, unsigned int hctx_idx) { @@ -1893,6 +2120,14 @@ static const struct blk_mq_ops ublk_mq_ops = { .timeout = ublk_timeout, }; +static const struct blk_mq_ops ublk_batch_mq_ops = { + .commit_rqs = ublk_commit_rqs, + .queue_rq = ublk_queue_rq, + .queue_rqs = ublk_batch_queue_rqs, + .init_hctx = ublk_init_hctx, + .timeout = ublk_timeout, +}; + static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; @@ -2290,6 +2525,56 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } +static void ublk_batch_cancel_cmd(struct ublk_queue 
*ubq, + struct ublk_batch_fetch_cmd *fcmd, + unsigned int issue_flags) +{ + bool done; + + spin_lock(&ubq->evts_lock); + done = (READ_ONCE(ubq->active_fcmd) != fcmd); + if (done) + list_del(&fcmd->node); + spin_unlock(&ubq->evts_lock); + + if (done) { + io_uring_cmd_done(fcmd->cmd, UBLK_IO_RES_ABORT, issue_flags); + ublk_batch_free_fcmd(fcmd); + } +} + +static void ublk_batch_cancel_queue(struct ublk_queue *ubq) +{ + struct ublk_batch_fetch_cmd *fcmd; + LIST_HEAD(fcmd_list); + + spin_lock(&ubq->evts_lock); + ubq->force_abort = true; + list_splice_init(&ubq->fcmd_head, &fcmd_list); + fcmd = READ_ONCE(ubq->active_fcmd); + if (fcmd) + list_move(&fcmd->node, &ubq->fcmd_head); + spin_unlock(&ubq->evts_lock); + + while (!list_empty(&fcmd_list)) { + fcmd = list_first_entry(&fcmd_list, + struct ublk_batch_fetch_cmd, node); + ublk_batch_cancel_cmd(ubq, fcmd, IO_URING_F_UNLOCKED); + } +} + +static void ublk_batch_cancel_fn(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + struct ublk_batch_fetch_cmd *fcmd = pdu->fcmd; + struct ublk_queue *ubq = pdu->ubq; + + ublk_start_cancel(ubq->dev); + + ublk_batch_cancel_cmd(ubq, fcmd, issue_flags); +} + /* * The ublk char device won't be closed when calling cancel fn, so both * ublk device and queue are guaranteed to be live @@ -2341,6 +2626,11 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) { int i; + if (ublk_support_batch_io(ubq)) { + ublk_batch_cancel_queue(ubq); + return; + } + for (i = 0; i < ubq->q_depth; i++) ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } @@ -3246,6 +3536,79 @@ static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) return ublk_check_batch_cmd_flags(uc); } +static int ublk_batch_attach(struct ublk_queue *ubq, + struct ublk_batch_io_data *data, + struct ublk_batch_fetch_cmd *fcmd) +{ + struct ublk_batch_fetch_cmd *new_fcmd = NULL; + bool free = false; + struct ublk_uring_cmd_pdu *pdu = 
ublk_get_uring_cmd_pdu(data->cmd); + + spin_lock(&ubq->evts_lock); + if (unlikely(ubq->force_abort || ubq->canceling)) { + free = true; + } else { + list_add_tail(&fcmd->node, &ubq->fcmd_head); + new_fcmd = __ublk_acquire_fcmd(ubq); + } + spin_unlock(&ubq->evts_lock); + + if (unlikely(free)) { + ublk_batch_free_fcmd(fcmd); + return -ENODEV; + } + + pdu->ubq = ubq; + pdu->fcmd = fcmd; + io_uring_cmd_mark_cancelable(fcmd->cmd, data->issue_flags); + + if (!new_fcmd) + goto out; + + /* + * If the two fetch commands are originated from same io_ring_ctx, + * run batch dispatch directly. Otherwise, schedule task work for + * doing it. + */ + if (io_uring_cmd_ctx_handle(new_fcmd->cmd) == + io_uring_cmd_ctx_handle(fcmd->cmd)) { + data->cmd = new_fcmd->cmd; + ublk_batch_dispatch(ubq, data, new_fcmd); + } else { + io_uring_cmd_complete_in_task(new_fcmd->cmd, + ublk_batch_tw_cb); + } +out: + return -EIOCBQUEUED; +} + +static int ublk_handle_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + struct ublk_queue *ubq = ublk_get_queue(data->ub, data->header.q_id); + struct ublk_batch_fetch_cmd *fcmd = ublk_batch_alloc_fcmd(data->cmd); + + if (!fcmd) + return -ENOMEM; + + return ublk_batch_attach(ubq, data, fcmd); +} + +static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) +{ + const struct ublk_batch_io *uc = &data->header; + + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) + return -EINVAL; + + if (uc->elem_bytes != sizeof(__u16)) + return -EINVAL; + + if (uc->flags != 0) + return -EINVAL; + + return 0; +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3265,6 +3628,11 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; + if (unlikely(issue_flags & IO_URING_F_CANCEL)) { + ublk_batch_cancel_fn(cmd, issue_flags); + return 0; + } + if (data.header.q_id >= ub->dev_info.nr_hw_queues) goto out; @@ -3281,6 +3649,12 @@ static int 
ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, goto out; ret = ublk_handle_batch_commit_cmd(&data); break; + case UBLK_U_IO_FETCH_IO_CMDS: + ret = ublk_validate_batch_fetch_cmd(&data); + if (ret) + goto out; + ret = ublk_handle_batch_fetch_cmd(&data); + break; default: ret = -EOPNOTSUPP; } @@ -3503,6 +3877,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id) ret = ublk_io_evts_init(ubq, ubq->q_depth, numa_node); if (ret) goto fail; + INIT_LIST_HEAD(&ubq->fcmd_head); } ub->queues[q_id] = ubq; ubq->dev = ub; @@ -3625,7 +4000,10 @@ static void ublk_align_max_io_size(struct ublk_device *ub) static int ublk_add_tag_set(struct ublk_device *ub) { - ub->tag_set.ops = &ublk_mq_ops; + if (ublk_dev_support_batch_io(ub)) + ub->tag_set.ops = &ublk_batch_mq_ops; + else + ub->tag_set.ops = &ublk_mq_ops; ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 3894d676dd02..70d8ebbf4326 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -121,6 +121,13 @@ #define UBLK_U_IO_COMMIT_IO_CMDS \ _IOWR('u', 0x26, struct ublk_batch_io) +/* + * Fetch io commands to provided buffer in multishot style, + * `IORING_URING_CMD_MULTISHOT` is required for this command. + */ +#define UBLK_U_IO_FETCH_IO_CMDS \ + _IOWR('u', 0x27, struct ublk_batch_io) + /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 #define UBLK_IO_RES_NEED_GET_DATA 1 From 3ac4796b888a2574cb982c89076ed717f122289d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:42 +0800 Subject: [PATCH 063/162] ublk: refactor ublk_queue_rq() and add ublk_batch_queue_rq() Extract common request preparation and cancellation logic into __ublk_queue_rq_common() helper function. Add dedicated ublk_batch_queue_rq() for batch mode operations to eliminate runtime check in ublk_queue_rq(). 
Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 56 +++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0a0210f9d417..90a6d6a12303 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1975,16 +1975,22 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq, return BLK_STS_OK; } -static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +/* + * Common helper for queue_rq that handles request preparation and + * cancellation checks. Returns status and sets should_queue to indicate + * whether the caller should proceed with queuing the request. + */ +static inline blk_status_t __ublk_queue_rq_common(struct ublk_queue *ubq, + struct request *rq, + bool *should_queue) { - struct ublk_queue *ubq = hctx->driver_data; - struct request *rq = bd->rq; blk_status_t res; res = ublk_prep_req(ubq, rq, false); - if (res != BLK_STS_OK) + if (res != BLK_STS_OK) { + *should_queue = false; return res; + } /* * ->canceling has to be handled after ->force_abort and ->fail_io @@ -1992,14 +1998,44 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, * of recovery, and cause hang when deleting disk */ if (unlikely(ubq->canceling)) { + *should_queue = false; __ublk_abort_rq(ubq, rq); return BLK_STS_OK; } - if (ublk_support_batch_io(ubq)) - ublk_batch_queue_cmd(ubq, rq, bd->last); - else - ublk_queue_cmd(ubq, rq); + *should_queue = true; + return BLK_STS_OK; +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + + ublk_queue_cmd(ubq, rq); + return BLK_STS_OK; +} 
+ +static blk_status_t ublk_batch_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + bool should_queue; + blk_status_t res; + + res = __ublk_queue_rq_common(ubq, rq, &should_queue); + if (!should_queue) + return res; + + ublk_batch_queue_cmd(ubq, rq, bd->last); return BLK_STS_OK; } @@ -2122,7 +2158,7 @@ static const struct blk_mq_ops ublk_mq_ops = { static const struct blk_mq_ops ublk_batch_mq_ops = { .commit_rqs = ublk_commit_rqs, - .queue_rq = ublk_queue_rq, + .queue_rq = ublk_batch_queue_rq, .queue_rqs = ublk_batch_queue_rqs, .init_hctx = ublk_init_hctx, .timeout = ublk_timeout, From 29d0a927f9efd5f87ac8a2d291d2782384d1bee2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:43 +0800 Subject: [PATCH 064/162] ublk: abort requests filled in event kfifo In case of BATCH_IO, any request filled in event kfifo, they don't get chance to be dispatched any more when releasing ublk char device, so we have to abort them too. Add ublk_abort_batch_queue() for aborting this kind of requests. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 90a6d6a12303..564cf44c238f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2470,7 +2470,8 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, struct request *req) { - WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + WARN_ON_ONCE(!ublk_dev_support_batch_io(ub) && + io->flags & UBLK_IO_FLAG_ACTIVE); if (ublk_nosrv_should_reissue_outstanding(ub)) blk_mq_requeue_request(req, false); @@ -2480,6 +2481,24 @@ static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, } } +/* + * Request tag may just be filled to event kfifo, not get chance to + * dispatch, abort these requests too + */ +static void ublk_abort_batch_queue(struct ublk_device *ub, + struct ublk_queue *ubq) +{ + unsigned short tag; + + while (kfifo_out(&ubq->evts_fifo, &tag, 1)) { + struct request *req = blk_mq_tag_to_rq( + ub->tag_set.tags[ubq->q_id], tag); + + if (!WARN_ON_ONCE(!req || !blk_mq_request_started(req))) + __ublk_fail_req(ub, &ubq->ios[tag], req); + } +} + /* * Called from ublk char device release handler, when any uring_cmd is * done, meantime request queue is "quiesced" since all inflight requests @@ -2498,6 +2517,9 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) __ublk_fail_req(ub, io, io->req); } + + if (ublk_support_batch_io(ubq)) + ublk_abort_batch_queue(ub, ubq); } static void ublk_start_cancel(struct ublk_device *ub) From e2723e6ce6025026b6d79d9a00048386a69e00c3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:44 +0800 Subject: [PATCH 065/162] ublk: add new feature UBLK_F_BATCH_IO Add new feature 
UBLK_F_BATCH_IO which replaces the following two per-io commands:

- UBLK_U_IO_FETCH_REQ
- UBLK_U_IO_COMMIT_AND_FETCH_REQ

with three per-queue batch io uring_cmd:

- UBLK_U_IO_PREP_IO_CMDS
- UBLK_U_IO_COMMIT_IO_CMDS
- UBLK_U_IO_FETCH_IO_CMDS

Then ublk can deliver batch io commands to ublk server in single multishot
uring_cmd, also allows to prepare & commit multiple commands in batch style
via single uring_cmd, communication cost is reduced a lot.

This feature also doesn't limit task context any more for all supported
commands, so any allowed uring_cmd can be issued in any task context. ublk
server implementation becomes much easier. Meantime load balance becomes
much easier to support with this feature. The command
`UBLK_U_IO_FETCH_IO_CMDS` can be issued from multiple task contexts, so
each task can adjust this command's buffer length or number of inflight
commands for controlling how much load is handled by current task.

Later, priority parameter will be added to command
`UBLK_U_IO_FETCH_IO_CMDS` for improving load balance support.

UBLK_U_IO_NEED_GET_DATA isn't supported in batch io yet, but it may be
enabled in future via its batch pair.

Reviewed-by: Caleb Sander Mateos
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c      | 60 ++++++++++++++++++++++++++++++-----
 include/uapi/linux/ublk_cmd.h | 15 +++++++++
 2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 564cf44c238f..bec34b5ab5ab 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -79,7 +79,8 @@
 	| UBLK_F_PER_IO_DAEMON \
 	| UBLK_F_BUF_REG_OFF_DAEMON \
 	| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ?
UBLK_F_INTEGRITY : 0) \ - | UBLK_F_SAFE_STOP_DEV) + | UBLK_F_SAFE_STOP_DEV \ + | UBLK_F_BATCH_IO) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -340,12 +341,12 @@ static void ublk_batch_dispatch(struct ublk_queue *ubq, static inline bool ublk_dev_support_batch_io(const struct ublk_device *ub) { - return false; + return ub->dev_info.flags & UBLK_F_BATCH_IO; } static inline bool ublk_support_batch_io(const struct ublk_queue *ubq) { - return false; + return ubq->flags & UBLK_F_BATCH_IO; } static inline void ublk_io_lock(struct ublk_io *io) @@ -3573,9 +3574,11 @@ static int ublk_check_batch_cmd_flags(const struct ublk_batch_io *uc) static int ublk_check_batch_cmd(const struct ublk_batch_io_data *data) { - const struct ublk_batch_io *uc = &data->header; + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + if (uc->nr_elem > data->ub->dev_info.queue_depth) return -E2BIG; @@ -3655,6 +3658,9 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) { const struct ublk_batch_io *uc = &data->header; + if (uc->q_id >= data->ub->dev_info.nr_hw_queues) + return -EINVAL; + if (!(data->cmd->flags & IORING_URING_CMD_MULTISHOT)) return -EINVAL; @@ -3667,6 +3673,35 @@ static int ublk_validate_batch_fetch_cmd(struct ublk_batch_io_data *data) return 0; } +static int ublk_handle_non_batch_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct ublksrv_io_cmd *ub_cmd = io_uring_sqe_cmd(cmd->sqe); + struct ublk_device *ub = cmd->file->private_data; + unsigned tag = READ_ONCE(ub_cmd->tag); + unsigned q_id = READ_ONCE(ub_cmd->q_id); + unsigned index = READ_ONCE(ub_cmd->addr); + struct ublk_queue *ubq; + struct ublk_io *io; + + if (cmd->cmd_op == UBLK_U_IO_UNREGISTER_IO_BUF) + return ublk_unregister_io_buf(cmd, ub, index, issue_flags); + + if (q_id >= ub->dev_info.nr_hw_queues) + return -EINVAL; + + if (tag >= ub->dev_info.queue_depth) + return -EINVAL; + + if (cmd->cmd_op != 
UBLK_U_IO_REGISTER_IO_BUF) + return -EOPNOTSUPP; + + ubq = ublk_get_queue(ub, q_id); + io = &ubq->ios[tag]; + return ublk_register_io_buf(cmd, ub, q_id, tag, io, index, + issue_flags); +} + static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3691,9 +3726,6 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, return 0; } - if (data.header.q_id >= ub->dev_info.nr_hw_queues) - goto out; - switch (cmd_op) { case UBLK_U_IO_PREP_IO_CMDS: ret = ublk_check_batch_cmd(&data); @@ -3714,7 +3746,8 @@ static int ublk_ch_batch_io_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_handle_batch_fetch_cmd(&data); break; default: - ret = -EOPNOTSUPP; + ret = ublk_handle_non_batch_cmd(cmd, issue_flags); + break; } out: return ret; @@ -4437,6 +4470,10 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) UBLK_F_BUF_REG_OFF_DAEMON | UBLK_F_SAFE_STOP_DEV; + /* So far, UBLK_F_PER_IO_DAEMON won't be exposed for BATCH_IO */ + if (ublk_dev_support_batch_io(ub)) + ub->dev_info.flags &= ~UBLK_F_PER_IO_DAEMON; + /* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_AUTO_BUF_REG)) @@ -4820,6 +4857,13 @@ static int ublk_wait_for_idle_io(struct ublk_device *ub, unsigned int elapsed = 0; int ret; + /* + * For UBLK_F_BATCH_IO ublk server can get notified with existing + * or new fetch command, so needn't wait any more + */ + if (ublk_dev_support_batch_io(ub)) + return 0; + while (elapsed < timeout_ms && !signal_pending(current)) { unsigned int queues_cancelable = 0; int i; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 70d8ebbf4326..743d31491387 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -340,6 +340,21 @@ */ #define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14) +/* + * Support the following commands for delivering & committing io command + * in batch. 
+ * + * - UBLK_U_IO_PREP_IO_CMDS + * - UBLK_U_IO_COMMIT_IO_CMDS + * - UBLK_U_IO_FETCH_IO_CMDS + * - UBLK_U_IO_REGISTER_IO_BUF + * - UBLK_U_IO_UNREGISTER_IO_BUF + * + * The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and + * UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature. + */ +#define UBLK_F_BATCH_IO (1ULL << 15) + /* * ublk device supports requests with integrity/metadata buffer. * Requires UBLK_F_USER_COPY. From 4d8fd7c5592acc2cc89b1759fb01171478c9ee5f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:45 +0800 Subject: [PATCH 066/162] ublk: document feature UBLK_F_BATCH_IO Document feature UBLK_F_BATCH_IO. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 64 +++++++++++++++++++++++++++++++++--- 1 file changed, 60 insertions(+), 4 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 8c4030bcabb6..6ad28039663d 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -260,9 +260,12 @@ The following IO commands are communicated via io_uring passthrough command, and each command is only for forwarding the IO and committing the result with specified IO tag in the command data: -- ``UBLK_IO_FETCH_REQ`` +Traditional Per-I/O Commands +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sent from the server IO pthread for fetching future incoming IO requests +- ``UBLK_U_IO_FETCH_REQ`` + + Sent from the server I/O pthread for fetching future incoming I/O requests destined to ``/dev/ublkb*``. This command is sent only once from the server IO pthread for ublk driver to setup IO forward environment. @@ -278,7 +281,7 @@ with specified IO tag in the command data: supported by the driver, daemons must be per-queue instead - i.e. all I/Os associated to a single qid must be handled by the same task. 
-- ``UBLK_IO_COMMIT_AND_FETCH_REQ`` +- ``UBLK_U_IO_COMMIT_AND_FETCH_REQ`` When an IO request is destined to ``/dev/ublkb*``, the driver stores the IO's ``ublksrv_io_desc`` to the specified mapped area; then the @@ -293,7 +296,7 @@ with specified IO tag in the command data: requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ`` is reused for both fetching request and committing back IO result. -- ``UBLK_IO_NEED_GET_DATA`` +- ``UBLK_U_IO_NEED_GET_DATA`` With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly issued to ublk server without data copy. Then, IO backend of ublk server @@ -322,6 +325,59 @@ with specified IO tag in the command data: ``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy the server buffer (pages) read to the IO request pages. +Batch I/O Commands (UBLK_F_BATCH_IO) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``UBLK_F_BATCH_IO`` feature provides an alternative high-performance +I/O handling model that replaces the traditional per-I/O commands with +per-queue batch commands. This significantly reduces communication overhead +and enables better load balancing across multiple server tasks. + +Key differences from traditional mode: + +- **Per-queue vs Per-I/O**: Commands operate on queues rather than individual I/Os +- **Batch processing**: Multiple I/Os are handled in single operations +- **Multishot commands**: Use io_uring multishot for reduced submission overhead +- **Flexible task assignment**: Any task can handle any I/O (no per-I/O daemons) +- **Better load balancing**: Tasks can adjust their workload dynamically + +Batch I/O Commands: + +- ``UBLK_U_IO_PREP_IO_CMDS`` + + Prepares multiple I/O commands in batch. The server provides a buffer + containing multiple I/O descriptors that will be processed together. + This reduces the number of individual command submissions required. 
+ +- ``UBLK_U_IO_COMMIT_IO_CMDS`` + + Commits results for multiple I/O operations in batch, and prepares the + I/O descriptors to accept new requests. The server provides a buffer + containing the results of multiple completed I/Os, allowing efficient + bulk completion of requests. + +- ``UBLK_U_IO_FETCH_IO_CMDS`` + + **Multishot command** for fetching I/O commands in batch. This is the key + command that enables high-performance batch processing: + + * Uses io_uring multishot capability for reduced submission overhead + * Single command can fetch multiple I/O requests over time + * Buffer size determines maximum batch size per operation + * Multiple fetch commands can be submitted for load balancing + * Only one fetch command is active at any time per queue + * Supports dynamic load balancing across multiple server tasks + + It is one typical multishot io_uring request with provided buffer, and it + won't be completed until any failure is triggered. + + Each task can submit ``UBLK_U_IO_FETCH_IO_CMDS`` with different buffer + sizes to control how much work it handles. This enables sophisticated + load balancing strategies in multi-threaded servers. + +Migration: Applications using traditional commands (``UBLK_U_IO_FETCH_REQ``, +``UBLK_U_IO_COMMIT_AND_FETCH_REQ``) cannot use batch mode simultaneously. + Zero copy --------- From 7aa78d4a3c9c30d15c0107fb652b22376aaacce6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:46 +0800 Subject: [PATCH 067/162] ublk: implement batch request completion via blk_mq_end_request_batch() Reduce overhead when completing multiple requests in batch I/O mode by accumulating them in an io_comp_batch structure and completing them together via blk_mq_end_request_batch(). This minimizes per-request completion overhead and improves performance for high IOPS workloads. The implementation adds an io_comp_batch pointer to struct ublk_io and initializes it in __ublk_fetch(). 
For batch I/O, the pointer is set to the batch structure in ublk_batch_commit_io(). The __ublk_complete_rq() function uses io->iob to call blk_mq_add_to_batch() for batch mode. After processing all batch I/Os, the completion callback is invoked in ublk_handle_batch_commit_cmd() to complete all accumulated requests efficiently. So far this just covers direct completion. For deferred completion (zero copy, auto buffer reg), ublk_io_release() is often delayed in the code path that frees the buffer-consuming io_uring request, so this patch often doesn't work; it is also hard to pass the per-task 'struct io_comp_batch' for deferred completion. Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bec34b5ab5ab..4bbed84232ea 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -136,6 +136,7 @@ struct ublk_batch_io_data { struct io_uring_cmd *cmd; struct ublk_batch_io header; unsigned int issue_flags; + struct io_comp_batch *iob; }; /* @@ -691,7 +692,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq, #endif static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool need_map); + bool need_map, struct io_comp_batch *iob); static dev_t ublk_chr_devt; static const struct class ublk_chr_class = { @@ -1001,7 +1002,7 @@ static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req) return; /* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */ - __ublk_complete_rq(req, io, false); + __ublk_complete_rq(req, io, false, NULL); } static inline bool ublk_sub_req_ref(struct ublk_io *io) @@ -1388,7 +1389,7 @@ static void ublk_end_request(struct request *req, blk_status_t error) /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, - bool
need_map) + bool need_map, struct io_comp_batch *iob) { unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; @@ -1442,8 +1443,11 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, local_bh_enable(); if (requeue) blk_mq_requeue_request(req, true); - else if (likely(!blk_should_fake_timeout(req->q))) + else if (likely(!blk_should_fake_timeout(req->q))) { + if (blk_mq_add_to_batch(req, iob, false, blk_mq_end_request_batch)) + return; __blk_mq_end_request(req, BLK_STS_OK); + } return; exit: @@ -2478,7 +2482,7 @@ static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io, blk_mq_requeue_request(req, false); else { io->res = -EIO; - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); } } @@ -3214,7 +3218,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) - __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(ub), NULL); if (ret) goto out; @@ -3533,11 +3537,11 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq, if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = ublk_batch_zone_lba(uc, elem); if (compl) - __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub)); + __ublk_complete_rq(req, io, ublk_dev_need_map_io(data->ub), data->iob); return 0; } -static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data) +static int ublk_handle_batch_commit_cmd(struct ublk_batch_io_data *data) { const struct ublk_batch_io *uc = &data->header; struct io_uring_cmd *cmd = data->cmd; @@ -3546,10 +3550,15 @@ static int ublk_handle_batch_commit_cmd(const struct ublk_batch_io_data *data) .total = uc->nr_elem * uc->elem_bytes, .elem_bytes = uc->elem_bytes, }; + DEFINE_IO_COMP_BATCH(iob); int ret; + data->iob = &iob; ret = ublk_walk_cmd_buf(&iter, data, ublk_batch_commit_io); + if (iob.complete) + 
iob.complete(&iob); + return iter.done == 0 ? ret : iter.done; } From 3f3850785594e323c5adc6b19ef5907419d3159f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:47 +0800 Subject: [PATCH 068/162] ublk: fix batch I/O recovery -ENODEV error During recovery with batch I/O, UBLK_U_IO_FETCH_IO_CMDS command fails with -ENODEV because ublk_batch_attach() rejects them when ubq->canceling is set. The canceling flag remains set until all queues are ready. Fix this by tracking per-queue readiness and clearing ubq->canceling as soon as each individual queue becomes ready, rather than waiting for all queues. This allows subsequent UBLK_U_IO_FETCH_IO_CMDS commands to succeed during recovery. Changes: - Add ubq->nr_io_ready to track I/Os ready per queue - Add ub->nr_queue_ready to track number of ready queues - Add ublk_queue_ready() helper to check queue readiness - Redefine ublk_dev_ready() based on queue count instead of I/O count - Clear ubq->canceling immediately when queue becomes ready - Add ublk_queue_reset_io_flags() to reset per-queue flags Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 92 ++++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4bbed84232ea..1e374ecbf0f1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -239,6 +239,7 @@ struct ublk_queue { bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */ spinlock_t cancel_lock; struct ublk_device *dev; + u32 nr_io_ready; /* * For supporting UBLK_F_BATCH_IO only. 
@@ -311,7 +312,7 @@ struct ublk_device { struct ublk_params params; struct completion completion; - u32 nr_io_ready; + u32 nr_queue_ready; bool unprivileged_daemons; struct mutex cancel_mutex; bool canceling; @@ -2173,6 +2174,8 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) { int i; + ubq->nr_io_ready = 0; + for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -2221,7 +2224,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub) /* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */ ub->mm = NULL; - ub->nr_io_ready = 0; + ub->nr_queue_ready = 0; ub->unprivileged_daemons = false; ub->ublksrv_tgid = -1; } @@ -2678,11 +2681,14 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } +static inline bool ublk_queue_ready(const struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + static inline bool ublk_dev_ready(const struct ublk_device *ub) { - u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth; - - return ub->nr_io_ready == total; + return ub->nr_queue_ready == ub->dev_info.nr_hw_queues; } static void ublk_cancel_queue(struct ublk_queue *ubq) @@ -2791,37 +2797,52 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } -/* reset ublk io_uring queue & io flags */ -static void ublk_reset_io_flags(struct ublk_device *ub) +/* reset per-queue io flags */ +static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) { - int i, j; + int j; - for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { - struct ublk_queue *ubq = ublk_get_queue(ub, i); - - /* UBLK_IO_FLAG_CANCELED can be cleared now */ - spin_lock(&ubq->cancel_lock); - for (j = 0; j < ubq->q_depth; j++) - ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; - spin_unlock(&ubq->cancel_lock); - ubq->fail_io = false; - } - mutex_lock(&ub->cancel_mutex); - ublk_set_canceling(ub, false); - mutex_unlock(&ub->cancel_mutex); + /* UBLK_IO_FLAG_CANCELED can 
be cleared now */ + spin_lock(&ubq->cancel_lock); + for (j = 0; j < ubq->q_depth; j++) + ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); + ubq->fail_io = false; + ubq->canceling = false; } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub) +static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) __must_hold(&ub->mutex) { + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN)) ub->unprivileged_daemons = true; - ub->nr_io_ready++; + ubq->nr_io_ready++; + + /* Check if this specific queue is now fully ready */ + if (ublk_queue_ready(ubq)) { + ub->nr_queue_ready++; + + /* + * Reset queue flags as soon as this queue is ready. + * This clears the canceling flag, allowing batch FETCH commands + * to succeed during recovery without waiting for all queues. + */ + ublk_queue_reset_io_flags(ubq); + } + + /* Check if all queues are ready */ if (ublk_dev_ready(ub)) { - /* now we are ready for handling ublk io request */ - ublk_reset_io_flags(ub); + /* + * All queues ready - clear device-level canceling flag + * and complete the recovery/initialization. 
+ */ + mutex_lock(&ub->cancel_mutex); + ub->canceling = false; + mutex_unlock(&ub->cancel_mutex); complete_all(&ub->completion); } } @@ -3025,7 +3046,7 @@ static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr) } static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io) + struct ublk_io *io, u16 q_id) { /* UBLK_IO_FETCH_REQ is only allowed before dev is setup */ if (ublk_dev_ready(ub)) @@ -3043,13 +3064,13 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, WRITE_ONCE(io->task, NULL); else WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub); + ublk_mark_io_ready(ub, q_id); return 0; } static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, - struct ublk_io *io, __u64 buf_addr) + struct ublk_io *io, __u64 buf_addr, u16 q_id) { int ret; @@ -3059,7 +3080,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, * FETCH, so it is fine even for IO_URING_F_NONBLOCK. */ mutex_lock(&ub->mutex); - ret = __ublk_fetch(cmd, ub, io); + ret = __ublk_fetch(cmd, ub, io, q_id); if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); mutex_unlock(&ub->mutex); @@ -3165,7 +3186,7 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, ret = ublk_check_fetch_buf(ub, addr); if (ret) goto out; - ret = ublk_fetch(cmd, ub, io, addr); + ret = ublk_fetch(cmd, ub, io, addr, q_id); if (ret) goto out; @@ -3411,7 +3432,14 @@ static int ublk_batch_unprep_io(struct ublk_queue *ubq, { struct ublk_io *io = &ubq->ios[elem->tag]; - data->ub->nr_io_ready--; + /* + * If queue was ready before this decrement, it won't be anymore, + * so we need to decrement the queue ready count too. 
+ */ + if (ublk_queue_ready(ubq)) + data->ub->nr_queue_ready--; + ubq->nr_io_ready--; + ublk_io_lock(io); io->flags = 0; ublk_io_unlock(io); @@ -3451,7 +3479,7 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq, } ublk_io_lock(io); - ret = __ublk_fetch(data->cmd, data->ub, io); + ret = __ublk_fetch(data->cmd, data->ub, io, ubq->q_id); if (!ret) io->buf = buf; ublk_io_unlock(io); From caf84294ff98bb7455722285f30f46c193ffccdd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:48 +0800 Subject: [PATCH 069/162] selftests: ublk: fix user_data truncation for tgt_data >= 256 The build_user_data() function packs multiple fields into a __u64 value using bit shifts. Without explicit __u64 casts before shifting, the shift operations are performed on 32-bit unsigned integers before being promoted to 64-bit, causing data loss. Specifically, when tgt_data >= 256, the expression (tgt_data << 24) shifts on a 32-bit value, truncating the upper 8 bits before promotion to __u64. Since tgt_data can be up to 16 bits (assertion allows up to 65535), values >= 256 would have their high byte lost. Add explicit __u64 casts to both op and tgt_data before shifting to ensure the shift operations happen in 64-bit space, preserving all bits of the input values. user_data_to_tgt_data() is only used by stripe.c, in which the max supported member disks are 4, so won't trigger this issue. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index cb757fd9bf9d..69fd5794f300 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -262,7 +262,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); - return tag | (op << 16) | (tgt_data << 24) | + return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; } From 584709ad5ce359f8b5773eb6af40070412652c51 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:49 +0800 Subject: [PATCH 070/162] selftests: ublk: replace assert() with ublk_assert() Replace assert() with ublk_assert() since it is often triggered in a daemon, and we may get nothing shown in the terminal. Add ublk_assert(), so we can log something to syslog when assert() is triggered.
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/common.c | 2 +- tools/testing/selftests/ublk/file_backed.c | 2 +- tools/testing/selftests/ublk/kublk.c | 2 +- tools/testing/selftests/ublk/kublk.h | 2 +- tools/testing/selftests/ublk/stripe.c | 10 +++++----- tools/testing/selftests/ublk/utils.h | 10 ++++++++++ 6 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c index d9873d4d50d0..530f9877c9dd 100644 --- a/tools/testing/selftests/ublk/common.c +++ b/tools/testing/selftests/ublk/common.c @@ -16,7 +16,7 @@ int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct) { int fd, i; - assert(dev->nr_fds == 1); + ublk_assert(dev->nr_fds == 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) { char *file = dev->tgt.backing_file[i]; diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index c3ce5ff72422..889047bd8fa3 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? 
IORING_OP_WRITE_FIXED : IORING_OP_WRITE; - assert(0); + ublk_assert(0); } static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3472ce7426ba..e98999bea9b1 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -825,7 +825,7 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, } if (cqe->res == UBLK_IO_RES_OK) { - assert(tag < q->q_depth); + ublk_assert(tag < q->q_depth); if (ublk_queue_use_user_copy(q)) ublk_user_copy(io, UBLK_IO_OP_WRITE); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 69fd5794f300..48634d29c084 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -260,7 +260,7 @@ static inline __u64 build_user_data(unsigned tag, unsigned op, { /* we only have 7 bits to encode q_id */ _Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7); - assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); + ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7)); return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) | (__u64)q_id << 56 | (__u64)is_target_io << 63; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index 2be1c36438e7..b967447fe591 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -96,12 +96,12 @@ static void calculate_stripe_array(const struct stripe_conf *conf, this->seq = seq; s->nr += 1; } else { - assert(seq == this->seq); - assert(this->start + this->nr_sects == stripe_off); + ublk_assert(seq == this->seq); + ublk_assert(this->start + this->nr_sects == stripe_off); this->nr_sects += nr_sects; } - assert(this->nr_vec < this->cap); + ublk_assert(this->nr_vec < this->cap); this->vec[this->nr_vec].iov_base = (void *)(base + done); this->vec[this->nr_vec++].iov_len = nr_sects << 9; @@ -120,7 
+120,7 @@ static inline enum io_uring_op stripe_to_uring_op( return zc ? IORING_OP_READV_FIXED : IORING_OP_READV; else if (ublk_op == UBLK_IO_OP_WRITE) return zc ? IORING_OP_WRITEV_FIXED : IORING_OP_WRITEV; - assert(0); + ublk_assert(0); } static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, @@ -322,7 +322,7 @@ static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) return -EINVAL; - assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + ublk_assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); for (i = 0; i < dev->tgt.nr_backing_files; i++) dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index a852e0b7153e..17eefed73690 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -43,6 +43,7 @@ static inline void ublk_err(const char *fmt, ...) va_start(ap, fmt); vfprintf(stderr, fmt, ap); + va_end(ap); } static inline void ublk_log(const char *fmt, ...) @@ -52,6 +53,7 @@ static inline void ublk_log(const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } @@ -62,7 +64,15 @@ static inline void ublk_dbg(int level, const char *fmt, ...) va_start(ap, fmt); vfprintf(stdout, fmt, ap); + va_end(ap); } } +#define ublk_assert(x) do { \ + if (!(x)) { \ + ublk_err("%s %d: assert!\n", __func__, __LINE__); \ + assert(x); \ + } \ +} while (0) + #endif From f1d621b5a04ea41ee90f177db084d00db57e6839 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:50 +0800 Subject: [PATCH 071/162] selftests: ublk: add ublk_io_buf_idx() for returning io buffer index Since UBLK_F_PER_IO_DAEMON is added, io buffer index may depend on current thread because the common way is to use per-pthread io_ring_ctx for issuing ublk uring_cmd. 
Add one helper for returning io buffer index, so we can hide the buffer index implementation details for target code. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 9 +++++---- tools/testing/selftests/ublk/kublk.c | 9 +++++---- tools/testing/selftests/ublk/kublk.h | 10 +++++++++- tools/testing/selftests/ublk/null.c | 18 ++++++++++-------- tools/testing/selftests/ublk/stripe.c | 7 ++++--- 5 files changed, 33 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 889047bd8fa3..228af2580ac6 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -39,6 +39,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, __u32 len = iod->nr_sectors << 9; struct io_uring_sqe *sqe[3]; void *addr = io->buf_addr; + unsigned short buf_index = ublk_io_buf_idx(t, q, tag); if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); @@ -62,7 +63,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, len, offset); if (auto_zc) - sqe[0]->buf_index = tag; + sqe[0]->buf_index = buf_index; io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); /* bit63 marks us as tgt io */ sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); @@ -71,7 +72,7 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -79,11 +80,11 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0, len, offset); - sqe[1]->buf_index = 
tag; + sqe[1]->buf_index = buf_index; sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); - io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e98999bea9b1..9b6f1cd04dc4 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -605,16 +605,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev) close(dev->fds[0]); } -static void ublk_set_auto_buf_reg(const struct ublk_queue *q, +static void ublk_set_auto_buf_reg(const struct ublk_thread *t, + const struct ublk_queue *q, struct io_uring_sqe *sqe, unsigned short tag) { struct ublk_auto_buf_reg buf = {}; if (q->tgt_ops->buf_index) - buf.index = q->tgt_ops->buf_index(q, tag); + buf.index = q->tgt_ops->buf_index(t, q, tag); else - buf.index = q->ios[tag].buf_index; + buf.index = ublk_io_buf_idx(t, q, tag); if (ublk_queue_auto_zc_fallback(q)) buf.flags = UBLK_AUTO_BUF_REG_FALLBACK; @@ -730,7 +731,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io) cmd->addr = 0; if (ublk_queue_use_auto_zc(q)) - ublk_set_auto_buf_reg(q, sqe[0], io->tag); + ublk_set_auto_buf_reg(t, q, sqe[0], io->tag); user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0); io_uring_sqe_set_data64(sqe[0], user_data); diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 48634d29c084..311a75da9b21 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -150,7 +150,8 @@ struct ublk_tgt_ops { void (*usage)(const struct ublk_tgt_ops *ops); /* return buffer index for UBLK_F_AUTO_BUF_REG */ - unsigned short (*buf_index)(const 
struct ublk_queue *, int tag); + unsigned short (*buf_index)(const struct ublk_thread *t, + const struct ublk_queue *, int tag); }; struct ublk_tgt { @@ -393,6 +394,13 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, + const struct ublk_queue *q, + unsigned tag) +{ + return q->ios[tag].buf_index; +} + static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) { return &q->ios[tag]; diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c index 3aa162f08476..7656888f4149 100644 --- a/tools/testing/selftests/ublk/null.c +++ b/tools/testing/selftests/ublk/null.c @@ -44,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) } static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod, - struct io_uring_sqe *sqe, int q_id) + struct io_uring_sqe *sqe, int q_id, unsigned buf_idx) { unsigned ublk_op = ublksrv_get_op(iod); io_uring_prep_nop(sqe); - sqe->buf_index = tag; + sqe->buf_index = buf_idx; sqe->flags |= IOSQE_FIXED_FILE; sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; sqe->len = iod->nr_sectors << 9; /* injected result */ @@ -61,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q, { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe[3]; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); ublk_io_alloc_sqes(t, sqe, 3); - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; - __setup_nop_io(tag, iod, sqe[1], q->q_id); + __setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx); sqe[1]->flags |= IOSQE_IO_HARDLINK; - 
io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index); + io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx); sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1); // buf register is marked as IOSQE_CQE_SKIP_SUCCESS @@ -86,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q, struct io_uring_sqe *sqe[1]; ublk_io_alloc_sqes(t, sqe, 1); - __setup_nop_io(tag, iod, sqe[0], q->q_id); + __setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag)); return 1; } @@ -137,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q, * return invalid buffer index for triggering auto buffer register failure, * then UBLK_IO_RES_NEED_REG_BUF handling is covered */ -static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag) +static unsigned short ublk_null_buf_index(const struct ublk_thread *t, + const struct ublk_queue *q, int tag) { if (ublk_queue_auto_zc_fallback(q)) return (unsigned short)-1; - return q->ios[tag].buf_index; + return ublk_io_buf_idx(t, q, tag); } const struct ublk_tgt_ops null_tgt_ops = { diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c index b967447fe591..dca819f5366e 100644 --- a/tools/testing/selftests/ublk/stripe.c +++ b/tools/testing/selftests/ublk/stripe.c @@ -135,6 +135,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, struct ublk_io *io = ublk_get_io(q, tag); int i, extra = zc ? 
2 : 0; void *base = io->buf_addr; + unsigned short buf_idx = ublk_io_buf_idx(t, q, tag); io->private_data = s; calculate_stripe_array(conf, iod, s, base); @@ -142,7 +143,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, ublk_io_alloc_sqes(t, sqe, s->nr + extra); if (zc) { - io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx); sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; sqe[0]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1); @@ -158,7 +159,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, t->start << 9); io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); if (auto_zc || zc) { - sqe[i]->buf_index = tag; + sqe[i]->buf_index = buf_idx; if (zc) sqe[i]->flags |= IOSQE_IO_HARDLINK; } @@ -168,7 +169,7 @@ static int stripe_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, if (zc) { struct io_uring_sqe *unreg = sqe[s->nr + 1]; - io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, io->buf_index); + io_uring_prep_buf_unregister(unreg, q, tag, q->q_id, buf_idx); unreg->user_data = build_user_data( tag, ublk_cmd_op_nr(unreg->cmd_op), 0, q->q_id, 1); } From dccbfa9d416424fbcbc83a46e84c604bad1db9d0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:51 +0800 Subject: [PATCH 072/162] selftests: ublk: add batch buffer management infrastructure Add the foundational infrastructure for UBLK_F_BATCH_IO buffer management including: - Allocator utility functions for small sized per-thread allocation - Batch buffer allocation and deallocation functions - Buffer index management for commit buffers - Thread state management for batch I/O mode - Buffer size calculation based on device features This prepares the groundwork for handling batch I/O commands by establishing the buffer management layer needed for UBLK_U_IO_PREP_IO_CMDS and UBLK_U_IO_COMMIT_IO_CMDS operations. 
The allocator uses CPU sets for efficient per-thread buffer tracking, and commit buffers are pre-allocated with 2 buffers per thread to handle overlapping command operations. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 152 +++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 26 ++++- tools/testing/selftests/ublk/kublk.h | 53 ++++++++++ tools/testing/selftests/ublk/utils.h | 54 ++++++++++ 4 files changed, 282 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/ublk/batch.c diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c new file mode 100644 index 000000000000..609e6073c9c0 --- /dev/null +++ b/tools/testing/selftests/ublk/batch.c @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: UBLK_F_BATCH_IO buffer management + */ + +#include "kublk.h" + +static inline void *ublk_get_commit_buf(struct ublk_thread *t, + unsigned short buf_idx) +{ + unsigned idx; + + if (buf_idx < t->commit_buf_start || + buf_idx >= t->commit_buf_start + t->nr_commit_buf) + return NULL; + idx = buf_idx - t->commit_buf_start; + return t->commit_buf + idx * t->commit_buf_size; +} + +/* + * Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS + * + * Buffer index is returned. 
+ */ +static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t) +{ + int idx = allocator_get(&t->commit_buf_alloc); + + if (idx >= 0) + return idx + t->commit_buf_start; + return UBLKS_T_COMMIT_BUF_INV_IDX; +} + +/* + * Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or + * UBLK_U_IO_COMMIT_IO_CMDS + */ +static inline void ublk_free_commit_buf(struct ublk_thread *t, + unsigned short i) +{ + unsigned short idx = i - t->commit_buf_start; + + ublk_assert(idx < t->nr_commit_buf); + ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0); + + allocator_put(&t->commit_buf_alloc, idx); +} + +static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev) +{ + if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY | + UBLK_F_AUTO_BUF_REG)) + return 8; + + /* one extra 8bytes for carrying buffer address */ + return 16; +} + +static unsigned ublk_commit_buf_size(struct ublk_thread *t) +{ + struct ublk_dev *dev = t->dev; + unsigned elem_size = ublk_commit_elem_buf_size(dev); + unsigned int total = elem_size * dev->dev_info.queue_depth; + unsigned int page_sz = getpagesize(); + + return round_up(total, page_sz); +} + +static void free_batch_commit_buf(struct ublk_thread *t) +{ + if (t->commit_buf) { + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + + munlock(t->commit_buf, total); + free(t->commit_buf); + } + allocator_deinit(&t->commit_buf_alloc); +} + +static int alloc_batch_commit_buf(struct ublk_thread *t) +{ + unsigned buf_size = ublk_commit_buf_size(t); + unsigned int total = buf_size * t->nr_commit_buf; + unsigned int page_sz = getpagesize(); + void *buf = NULL; + int ret; + + allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); + + t->commit_buf = NULL; + ret = posix_memalign(&buf, page_sz, total); + if (ret || !buf) + goto fail; + + t->commit_buf = buf; + + /* lock commit buffer pages for fast access */ + if (mlock(t->commit_buf, total)) + ublk_err("%s: 
can't lock commit buffer %s\n", __func__, + strerror(errno)); + + return 0; + +fail: + free_batch_commit_buf(t); + return ret; +} + +void ublk_batch_prepare(struct ublk_thread *t) +{ + /* + * We only handle single device in this thread context. + * + * All queues have same feature flags, so use queue 0's for + * calculate uring_cmd flags. + * + * This way looks not elegant, but it works so far. + */ + struct ublk_queue *q = &t->dev->q[0]; + + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); + t->commit_buf_size = ublk_commit_buf_size(t); + t->commit_buf_start = t->nr_bufs; + t->nr_commit_buf = 2; + t->nr_bufs += t->nr_commit_buf; + + t->cmd_flags = 0; + if (ublk_queue_use_auto_zc(q)) { + if (ublk_queue_auto_zc_fallback(q)) + t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK; + } else if (!ublk_queue_no_buf(q)) + t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR; + + t->state |= UBLKS_T_BATCH_IO; + + ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n", + __func__, t->idx, + t->nr_commit_buf, t->commit_buf_size, + t->nr_bufs); +} + +int ublk_batch_alloc_buf(struct ublk_thread *t) +{ + ublk_assert(t->nr_commit_buf < 16); + return alloc_batch_commit_buf(t); +} + +void ublk_batch_free_buf(struct ublk_thread *t) +{ + free_batch_commit_buf(t); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 9b6f1cd04dc4..3864f42e6c29 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -435,6 +435,8 @@ static void ublk_thread_deinit(struct ublk_thread *t) { io_uring_unregister_buffers(&t->ring); + ublk_batch_free_buf(t); + io_uring_unregister_ring_fd(&t->ring); if (t->ring.ring_fd > 0) { @@ -531,15 +533,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues; unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads; max_nr_ios_per_thread += !!(nr_ios % dev->nthreads); - ret 
= io_uring_register_buffers_sparse( - &t->ring, max_nr_ios_per_thread); + + t->nr_bufs = max_nr_ios_per_thread; + } else { + t->nr_bufs = 0; + } + + if (ublk_dev_batch_io(dev)) + ublk_batch_prepare(t); + + if (t->nr_bufs) { + ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs); if (ret) { - ublk_err("ublk dev %d thread %d register spare buffers failed %d", + ublk_err("ublk dev %d thread %d register spare buffers failed %d\n", dev->dev_info.dev_id, t->idx, ret); goto fail; } } + if (ublk_dev_batch_io(dev)) { + ret = ublk_batch_alloc_buf(t); + if (ret) { + ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n", + dev->dev_info.dev_id, t->idx, ret); + goto fail; + } + } + io_uring_register_ring_fd(&t->ring); if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 311a75da9b21..424c333596ac 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -182,15 +182,40 @@ struct ublk_queue { struct ublk_io ios[UBLK_QUEUE_DEPTH]; }; +/* align with `ublk_elem_header` */ +struct ublk_batch_elem { + __u16 tag; + __u16 buf_index; + __s32 result; + __u64 buf_addr; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) +#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */ unsigned state; unsigned int cmd_inflight; unsigned int io_inflight; + + unsigned short nr_bufs; + + /* followings are for BATCH_IO */ + unsigned short commit_buf_start; + unsigned char commit_buf_elem_size; + /* + * We just support single device, so pre-calculate commit/prep flags + */ + unsigned short cmd_flags; + unsigned int nr_commit_buf; + unsigned int commit_buf_size; + void *commit_buf; +#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) + struct allocator commit_buf_alloc; + struct io_uring ring; }; @@ -211,6 +236,27 @@ struct ublk_dev { extern int ublk_queue_io_cmd(struct ublk_thread *t, struct 
ublk_io *io); +static inline int __ublk_use_batch_io(__u64 flags) +{ + return flags & UBLK_F_BATCH_IO; +} + +static inline int ublk_queue_batch_io(const struct ublk_queue *q) +{ + return __ublk_use_batch_io(q->flags); +} + +static inline int ublk_dev_batch_io(const struct ublk_dev *dev) +{ + return __ublk_use_batch_io(dev->dev_info.flags); +} + +/* only work for handle single device in this pthread context */ +static inline int ublk_thread_batch_io(const struct ublk_thread *t) +{ + return t->state & UBLKS_T_BATCH_IO; +} + static inline void ublk_set_integrity_params(const struct dev_ctx *ctx, struct ublk_params *params) { @@ -465,6 +511,13 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* Initialize batch I/O state and calculate buffer parameters */ +void ublk_batch_prepare(struct ublk_thread *t); +/* Allocate and register commit buffers for batch operations */ +int ublk_batch_alloc_buf(struct ublk_thread *t); +/* Free commit buffers and cleanup batch allocator */ +void ublk_batch_free_buf(struct ublk_thread *t); + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index 17eefed73690..aab522f26167 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -21,6 +21,60 @@ #define round_up(val, rnd) \ (((val) + ((rnd) - 1)) & ~((rnd) - 1)) +/* small sized & per-thread allocator */ +struct allocator { + unsigned int size; + cpu_set_t *set; +}; + +static inline int allocator_init(struct allocator *a, unsigned size) +{ + a->set = CPU_ALLOC(size); + a->size = size; + + if (a->set) + return 0; + return -ENOMEM; +} + +static inline void allocator_deinit(struct allocator *a) +{ + CPU_FREE(a->set); + a->set = NULL; + a->size = 0; +} + +static inline int allocator_get(struct 
allocator *a) +{ + int i; + + for (i = 0; i < a->size; i += 1) { + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (!CPU_ISSET_S(i, set_size, a->set)) { + CPU_SET_S(i, set_size, a->set); + return i; + } + } + + return -1; +} + +static inline void allocator_put(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + if (i >= 0 && i < a->size) + CPU_CLR_S(i, set_size, a->set); +} + +static inline int allocator_get_val(struct allocator *a, int i) +{ + size_t set_size = CPU_ALLOC_SIZE(a->size); + + return CPU_ISSET_S(i, set_size, a->set); +} + static inline unsigned int ilog2(unsigned int x) { if (x == 0) From d468930a019df71951a80fde20f6348136a2175d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:52 +0800 Subject: [PATCH 073/162] selftests: ublk: handle UBLK_U_IO_PREP_IO_CMDS Implement support for UBLK_U_IO_PREP_IO_CMDS in the batch I/O framework: - Add batch command initialization and setup functions - Implement prep command queueing with proper buffer management - Add command completion handling for prep and commit commands - Integrate batch I/O setup into thread initialization - Update CQE handling to support batch commands The implementation uses the previously established buffer management infrastructure to queue UBLK_U_IO_PREP_IO_CMDS commands. Commands are prepared in the first thread context and use commit buffers for efficient command batching. 
Key changes: - ublk_batch_queue_prep_io_cmds() prepares I/O command batches - ublk_batch_compl_cmd() handles batch command completions - Modified thread setup to use batch operations when enabled - Enhanced buffer index calculation for batch mode Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 114 +++++++++++++++++++++++++++ tools/testing/selftests/ublk/kublk.c | 50 +++++++++--- tools/testing/selftests/ublk/kublk.h | 22 ++++++ 3 files changed, 174 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 609e6073c9c0..079cae77add1 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -150,3 +150,117 @@ void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); } + +static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, + struct io_uring_sqe *sqe, unsigned op, + unsigned short elem_bytes, + unsigned short nr_elem, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + __u64 user_data; + + cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe); + + ublk_set_sqe_cmd_op(sqe, op); + + sqe->fd = 0; /* dev->fds[0] */ + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags = IOSQE_FIXED_FILE; + + cmd->q_id = q_id; + cmd->flags = 0; + cmd->reserved = 0; + cmd->elem_bytes = elem_bytes; + cmd->nr_elem = nr_elem; + + user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + io_uring_sqe_set_data64(sqe, user_data); + + t->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx " + "nr_elem %u elem_bytes %u buf_size %u buf_idx %d " + "cmd_inflight %u\n", + __func__, t->idx, q_id, op, user_data, + cmd->nr_elem, cmd->elem_bytes, + nr_elem * elem_bytes, buf_idx, t->cmd_inflight); +} + +static void ublk_setup_commit_sqe(struct ublk_thread *t, + struct io_uring_sqe *sqe, + unsigned short buf_idx) +{ + struct ublk_batch_io *cmd; + + cmd = (struct ublk_batch_io 
*)ublk_get_sqe_cmd(sqe); + + /* Use plain user buffer instead of fixed buffer */ + cmd->flags |= t->cmd_flags; +} + +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + unsigned short nr_elem = q->q_depth; + unsigned short buf_idx = ublk_alloc_commit_buf(t); + struct io_uring_sqe *sqe; + void *buf; + int i; + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_assert(nr_elem == q->q_depth); + buf = ublk_get_commit_buf(t, buf_idx); + for (i = 0; i < nr_elem; i++) { + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)( + buf + i * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[i]; + + elem->tag = i; + elem->result = 0; + + if (ublk_queue_use_auto_zc(q)) + elem->buf_index = ublk_batch_io_buf_idx(t, q, i); + else if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64)io->buf_addr; + } + + sqe->addr = (__u64)buf; + sqe->len = t->commit_buf_elem_size * nr_elem; + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + ublk_setup_commit_sqe(t, sqe, buf_idx); + return 0; +} + +static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe, + unsigned op) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) + ublk_assert(cqe->res == 0); + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) + ;//assert(cqe->res == t->commit_buf_size); + else + ublk_assert(0); + + ublk_free_commit_buf(t, buf_idx); +} + +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + + if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || + op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + t->cmd_inflight--; + ublk_batch_compl_commit_cmd(t, cqe, op); + return; + } +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 3864f42e6c29..dba912a44eb3 100644 --- 
a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -840,6 +840,8 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t, unsigned tag = user_data_to_tag(cqe->user_data); struct ublk_io *io = &q->ios[tag]; + t->cmd_inflight--; + if (!fetch) { t->state |= UBLKS_T_STOPPING; io->flags &= ~UBLKS_IO_NEED_FETCH_RQ; @@ -874,28 +876,30 @@ static void ublk_handle_cqe(struct ublk_thread *t, { struct ublk_dev *dev = t->dev; unsigned q_id = user_data_to_q_id(cqe->user_data); - struct ublk_queue *q = &dev->q[q_id]; unsigned cmd_op = user_data_to_op(cqe->user_data); if (cqe->res < 0 && cqe->res != -ENODEV) - ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, - cqe->res, cqe->user_data, q->flags); + ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, + cqe->res, cqe->user_data, t->state); - ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", - __func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data), - cmd_op, is_target_io(cqe->user_data), + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x " + "data %lx target %d/%d) stopping %d\n", + __func__, cqe->res, t->idx, q_id, + user_data_to_tag(cqe->user_data), + cmd_op, cqe->user_data, is_target_io(cqe->user_data), user_data_to_tgt_data(cqe->user_data), (t->state & UBLKS_T_STOPPING)); /* Don't retrieve io in case of target io */ if (is_target_io(cqe->user_data)) { - ublksrv_handle_tgt_cqe(t, q, cqe); + ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe); return; } - t->cmd_inflight--; - - ublk_handle_uring_cmd(t, q, cqe); + if (ublk_thread_batch_io(t)) + ublk_batch_compl_cmd(t, cqe); + else + ublk_handle_uring_cmd(t, &dev->q[q_id], cqe); } static int ublk_reap_events_uring(struct ublk_thread *t) @@ -952,6 +956,22 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) info->dev->dev_info.dev_id, info->idx); } +static void ublk_batch_setup_queues(struct ublk_thread *t) +{ + int i; + + 
/* setup all queues in the 1st thread */ + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + struct ublk_queue *q = &t->dev->q[i]; + int ret; + + ret = ublk_batch_queue_prep_io_cmds(t, q); + ublk_assert(ret == 0); + ret = ublk_process_io(t); + ublk_assert(ret >= 0); + } +} + static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info) { struct ublk_thread t = { @@ -972,8 +992,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n", gettid(), dev_id, t.idx); - /* submit all io commands to ublk driver */ - ublk_submit_fetch_commands(&t); + if (!ublk_thread_batch_io(&t)) { + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(&t); + } else if (!t.idx) { + /* prepare all io commands in the 1st thread context */ + ublk_batch_setup_queues(&t); + } + do { if (ublk_process_io(&t) < 0) break; diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 424c333596ac..08320d44c7c2 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -440,10 +440,16 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) addr[1] = 0; } +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned tag); + static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { + if (ublk_queue_batch_io(q)) + return ublk_batch_io_buf_idx(t, q, tag); return q->ios[tag].buf_index; } @@ -511,6 +517,22 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +/* + * Each IO's buffer index has to be calculated by this helper for + * UBLKS_T_BATCH_IO + */ +static inline unsigned short ublk_batch_io_buf_idx( + const struct ublk_thread *t, const struct ublk_queue *q, + unsigned 
tag) +{ + return tag; +} + +/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Handle completion of batch I/O commands (prep/commit) */ +void ublk_batch_compl_cmd(struct ublk_thread *t, + const struct io_uring_cqe *cqe); /* Initialize batch I/O state and calculate buffer parameters */ void ublk_batch_prepare(struct ublk_thread *t); /* Allocate and register commit buffers for batch operations */ From dee7024ffecba291891503e425373d9f2a1d01b6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:53 +0800 Subject: [PATCH 074/162] selftests: ublk: handle UBLK_U_IO_COMMIT_IO_CMDS Implement UBLK_U_IO_COMMIT_IO_CMDS to enable efficient batched completion of I/O operations in the batch I/O framework. This completes the batch I/O infrastructure by adding the commit phase that notifies the kernel about completed I/O operations: Key features: - Batch multiple I/O completions into single UBLK_U_IO_COMMIT_IO_CMDS - Dynamic commit buffer allocation and management per thread - Automatic commit buffer preparation before processing events - Commit buffer submission after processing completed I/Os - Integration with existing completion workflows Implementation details: - ublk_batch_prep_commit() allocates and initializes commit buffers - ublk_batch_complete_io() adds completed I/Os to current batch - ublk_batch_commit_io_cmds() submits batched completions to kernel - Modified ublk_process_io() to handle batch commit lifecycle - Enhanced ublk_complete_io() to route to batch or legacy completion The commit buffer stores completion information (tag, result, buffer details) for multiple I/Os, then submits them all at once, significantly reducing syscall overhead compared to individual I/O completions. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 74 ++++++++++++++++++++++++++-- tools/testing/selftests/ublk/kublk.c | 8 ++- tools/testing/selftests/ublk/kublk.h | 69 +++++++++++++++++--------- 3 files changed, 122 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 079cae77add1..9c4db7335d44 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -174,7 +174,7 @@ static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, cmd->elem_bytes = elem_bytes; cmd->nr_elem = nr_elem; - user_data = build_user_data(buf_idx, _IOC_NR(op), 0, q_id, 0); + user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0); io_uring_sqe_set_data64(sqe, user_data); t->cmd_inflight += 1; @@ -244,9 +244,11 @@ static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS)) ublk_assert(cqe->res == 0); - else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) - ;//assert(cqe->res == t->commit_buf_size); - else + else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { + int nr_elem = user_data_to_tgt_data(cqe->user_data); + + ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem); + } else ublk_assert(0); ublk_free_commit_buf(t, buf_idx); @@ -264,3 +266,67 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, return; } } + +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + struct io_uring_sqe *sqe; + unsigned short buf_idx; + unsigned short nr_elem = t->commit.done; + + /* nothing to commit */ + if (!nr_elem) { + ublk_free_commit_buf(t, t->commit.buf_idx); + return; + } + + ublk_io_alloc_sqes(t, &sqe, 1); + buf_idx = t->commit.buf_idx; + sqe->addr = (__u64)t->commit.elem; + sqe->len = nr_elem * t->commit_buf_elem_size; + + /* commit isn't per-queue command */ + ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + t->commit_buf_elem_size, nr_elem, buf_idx); + 
ublk_setup_commit_sqe(t, sqe, buf_idx); +} + +static void ublk_batch_init_commit(struct ublk_thread *t, + unsigned short buf_idx) +{ + /* so far only support 1:1 queue/thread mapping */ + t->commit.q_id = t->idx; + t->commit.buf_idx = buf_idx; + t->commit.elem = ublk_get_commit_buf(t, buf_idx); + t->commit.done = 0; + t->commit.count = t->commit_buf_size / + t->commit_buf_elem_size; +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + unsigned short buf_idx = ublk_alloc_commit_buf(t); + + ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); + ublk_batch_init_commit(t, buf_idx); +} + +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res) +{ + struct batch_commit_buf *cb = &t->commit; + struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + + cb->done * t->commit_buf_elem_size); + struct ublk_io *io = &q->ios[tag]; + + ublk_assert(q->q_id == t->commit.q_id); + + elem->tag = tag; + elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); + elem->result = res; + + if (!ublk_queue_no_buf(q)) + elem->buf_addr = (__u64) (uintptr_t) io->buf_addr; + + cb->done += 1; + ublk_assert(cb->done <= cb->count); +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index dba912a44eb3..bf217d30c15f 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -931,7 +931,13 @@ static int ublk_process_io(struct ublk_thread *t) return -ENODEV; ret = io_uring_submit_and_wait(&t->ring, 1); - reapped = ublk_reap_events_uring(t); + if (ublk_thread_batch_io(t)) { + ublk_batch_prep_commit(t); + reapped = ublk_reap_events_uring(t); + ublk_batch_commit_io_cmds(t); + } else { + reapped = ublk_reap_events_uring(t); + } ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n", ret, reapped, (t->state & UBLKS_T_STOPPING), diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 08320d44c7c2..5b05f6d7d808 100644 
--- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -190,6 +190,14 @@ struct ublk_batch_elem { __u64 buf_addr; }; +struct batch_commit_buf { + unsigned short q_id; + unsigned short buf_idx; + void *elem; + unsigned short done; + unsigned short count; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -215,6 +223,7 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; + struct batch_commit_buf commit; struct io_uring ring; }; @@ -458,30 +467,6 @@ static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) return &q->ios[tag]; } -static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int res) -{ - struct ublk_io *io = &q->ios[tag]; - - ublk_mark_io_done(io, res); - - return ublk_queue_io_cmd(t, io); -} - -static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, - unsigned tag, int queued) -{ - if (queued < 0) - ublk_complete_io(t, q, tag, queued); - else { - struct ublk_io *io = ublk_get_io(q, tag); - - t->io_inflight += queued; - io->tgt_ios = queued; - io->result = 0; - } -} - static inline int ublk_completed_tgt_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag) { @@ -540,6 +525,42 @@ int ublk_batch_alloc_buf(struct ublk_thread *t); /* Free commit buffers and cleanup batch allocator */ void ublk_batch_free_buf(struct ublk_thread *t); +/* Prepare a new commit buffer for batching completed I/O operations */ +void ublk_batch_prep_commit(struct ublk_thread *t); +/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */ +void ublk_batch_commit_io_cmds(struct ublk_thread *t); +/* Add a completed I/O operation to the current batch commit buffer */ +void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int res); + +static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, + 
unsigned tag, int res) +{ + if (ublk_queue_batch_io(q)) { + ublk_batch_complete_io(t, q, tag, res); + return 0; + } else { + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + return ublk_queue_io_cmd(t, io); + } +} + +static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q, + unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(t, q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + t->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; From cb5a6b308700c65c29baccbb6b9b07f306633ad5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:54 +0800 Subject: [PATCH 075/162] selftests: ublk: handle UBLK_U_IO_FETCH_IO_CMDS Add support for UBLK_U_IO_FETCH_IO_CMDS to enable efficient batch fetching of I/O commands using multishot io_uring operations. Key improvements: - Implement multishot UBLK_U_IO_FETCH_IO_CMDS for continuous command fetching - Add fetch buffer management with page-aligned, mlocked buffers - Process fetched I/O command tags from kernel-provided buffers - Integrate fetch operations with existing batch I/O infrastructure - Significantly reduce uring_cmd issuing overhead through batching The implementation uses two fetch buffers per thread with automatic requeuing to maintain continuous I/O command flow. Each fetch operation retrieves multiple command tags in a single syscall, dramatically improving performance compared to individual command fetching. 
Technical details: - Fetch buffers are page-aligned and mlocked for optimal performance - Uses IORING_URING_CMD_MULTISHOT for continuous operation - Automatic buffer management and requeuing on completion - Enhanced CQE handling for fetch command completions Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/batch.c | 136 ++++++++++++++++++++++++++- tools/testing/selftests/ublk/kublk.c | 14 ++- tools/testing/selftests/ublk/kublk.h | 13 +++ 3 files changed, 159 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/batch.c b/tools/testing/selftests/ublk/batch.c index 9c4db7335d44..5f9587210b12 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -140,15 +140,63 @@ void ublk_batch_prepare(struct ublk_thread *t) t->nr_bufs); } +static void free_batch_fetch_buf(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); + munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); + free(t->fetch[i].fetch_buf); + } +} + +static int alloc_batch_fetch_buf(struct ublk_thread *t) +{ + /* page aligned fetch buffer, and it is mlocked for speedup delivery */ + unsigned pg_sz = getpagesize(); + unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz); + int ret; + int i = 0; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + t->fetch[i].fetch_buf_size = buf_size; + + if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, + t->fetch[i].fetch_buf_size)) + return -ENOMEM; + + /* lock fetch buffer page for fast fetching */ + if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size)) + ublk_err("%s: can't lock fetch buffer %s\n", __func__, + strerror(errno)); + t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1, + i, IOU_PBUF_RING_INC, &ret); + if (!t->fetch[i].br) { + ublk_err("Buffer ring register failed %d\n", ret); + return ret; + } + } + + return 0; +} + int 
ublk_batch_alloc_buf(struct ublk_thread *t) { + int ret; + ublk_assert(t->nr_commit_buf < 16); - return alloc_batch_commit_buf(t); + + ret = alloc_batch_commit_buf(t); + if (ret) + return ret; + return alloc_batch_fetch_buf(t); } void ublk_batch_free_buf(struct ublk_thread *t) { free_batch_commit_buf(t); + free_batch_fetch_buf(t); } static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id, @@ -199,6 +247,76 @@ static void ublk_setup_commit_sqe(struct ublk_thread *t, cmd->flags |= t->cmd_flags; } +static void ublk_batch_queue_fetch(struct ublk_thread *t, + struct ublk_queue *q, + unsigned short buf_idx) +{ + unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2; + struct io_uring_sqe *sqe; + + io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf, + t->fetch[buf_idx].fetch_buf_size, + 0, 0, 0); + io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1); + + ublk_io_alloc_sqes(t, &sqe, 1); + + ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem, + buf_idx); + + sqe->rw_flags= IORING_URING_CMD_MULTISHOT; + sqe->buf_group = buf_idx; + sqe->flags |= IOSQE_BUFFER_SELECT; + + t->fetch[buf_idx].fetch_buf_off = 0; +} + +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q) +{ + int i; + + for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) + ublk_batch_queue_fetch(t, q, i); +} + +static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, + struct ublk_queue *q, + const struct io_uring_cqe *cqe) +{ + unsigned short buf_idx = user_data_to_tag(cqe->user_data); + unsigned start = t->fetch[buf_idx].fetch_buf_off; + unsigned end = start + cqe->res; + void *buf = t->fetch[buf_idx].fetch_buf; + int i; + + if (cqe->res < 0) + return buf_idx; + + if ((end - start) / 2 > q->q_depth) { + ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res); + + for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + ublk_err("%u ", tag); + } + ublk_err("\n"); + } + 
+ for (i = start; i < end; i += 2) { + unsigned short tag = *(unsigned short *)(buf + i); + + if (tag >= q->q_depth) + ublk_err("%s: bad tag %u\n", __func__, tag); + + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(t, q, tag); + } + t->fetch[buf_idx].fetch_buf_off = end; + return buf_idx; +} + int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; @@ -258,6 +376,9 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe) { unsigned op = user_data_to_op(cqe->user_data); + struct ublk_queue *q; + unsigned buf_idx; + unsigned q_id; if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) || op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) { @@ -265,6 +386,19 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, ublk_batch_compl_commit_cmd(t, cqe, op); return; } + + /* FETCH command is per queue */ + q_id = user_data_to_q_id(cqe->user_data); + q = &t->dev->q[q_id]; + buf_idx = ublk_compl_batch_fetch(t, q, cqe); + + if (cqe->res < 0 && cqe->res != -ENOBUFS) { + t->cmd_inflight--; + t->state |= UBLKS_T_STOPPING; + } else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) { + t->cmd_inflight--; + ublk_batch_queue_fetch(t, q, buf_idx); + } } void ublk_batch_commit_io_cmds(struct ublk_thread *t) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index bf217d30c15f..c77205bac7a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -519,6 +519,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; int ret; + /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ + if (ublk_dev_batch_io(dev)) + cq_depth += dev->dev_info.queue_depth; + ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | IORING_SETUP_SINGLE_ISSUER | @@ -878,7 +882,7 @@ static void ublk_handle_cqe(struct ublk_thread 
*t, unsigned q_id = user_data_to_q_id(cqe->user_data); unsigned cmd_op = user_data_to_op(cqe->user_data); - if (cqe->res < 0 && cqe->res != -ENODEV) + if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS) ublk_err("%s: res %d userdata %llx thread state %x\n", __func__, cqe->res, cqe->user_data, t->state); @@ -1001,9 +1005,13 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf if (!ublk_thread_batch_io(&t)) { /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); - } else if (!t.idx) { + } else { + struct ublk_queue *q = &t.dev->q[t.idx]; + /* prepare all io commands in the 1st thread context */ - ublk_batch_setup_queues(&t); + if (!t.idx) + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t, q); } do { diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 5b05f6d7d808..950e99c02e8b 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -198,6 +198,13 @@ struct batch_commit_buf { unsigned short count; }; +struct batch_fetch_buf { + struct io_uring_buf_ring *br; + void *fetch_buf; + unsigned int fetch_buf_size; + unsigned int fetch_buf_off; +}; + struct ublk_thread { struct ublk_dev *dev; unsigned idx; @@ -224,6 +231,9 @@ struct ublk_thread { #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; struct batch_commit_buf commit; + /* FETCH_IO_CMDS buffer */ +#define UBLKS_T_NR_FETCH_BUF 2 + struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; struct io_uring ring; }; @@ -515,6 +525,9 @@ static inline unsigned short ublk_batch_io_buf_idx( /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); +/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ +void ublk_batch_start_fetch(struct ublk_thread *t, + struct ublk_queue *q); /* Handle completion of batch I/O commands 
(prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); From 4968fb7cc60676040258c8867f22931c8735126f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:55 +0800 Subject: [PATCH 076/162] selftests: ublk: increase timeout to 150 seconds More tests need to be covered in the existing generic tests, and the default 45-second timeout isn't enough: timeouts are often triggered. Increase the timeout by adding a settings file. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 ++ tools/testing/selftests/ublk/settings | 1 + 2 files changed, 3 insertions(+) create mode 100644 tools/testing/selftests/ublk/settings diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 3a2498089b15..f2da8b403537 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -52,6 +52,8 @@ TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_FILES := settings + TEST_GEN_PROGS_EXTENDED = kublk metadata_size STANDALONE_UTILS := metadata_size.c diff --git a/tools/testing/selftests/ublk/settings b/tools/testing/selftests/ublk/settings new file mode 100644 index 000000000000..682a40f1c8e6 --- /dev/null +++ b/tools/testing/selftests/ublk/settings @@ -0,0 +1 @@ +timeout=150 From 20aeab0b08a175d9ceb4ad327f55ba5c29a79888 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:56 +0800 Subject: [PATCH 077/162] selftests: ublk: add --batch/-b for enabling F_BATCH_IO Add --batch/-b for enabling F_BATCH_IO. Add batch_01 for covering its basic function. Add stress_08 and stress_09 for covering stress test. Add recovery test for F_BATCH_IO in generic_04 and generic_05. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 4 ++ tools/testing/selftests/ublk/kublk.c | 15 ++++++- tools/testing/selftests/ublk/test_batch_01.sh | 32 +++++++++++++ .../testing/selftests/ublk/test_generic_04.sh | 5 +++ .../testing/selftests/ublk/test_generic_05.sh | 5 +++ .../testing/selftests/ublk/test_stress_08.sh | 45 +++++++++++++++++++ .../testing/selftests/ublk/test_stress_09.sh | 44 ++++++++++++++++++ 7 files changed, 148 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_01.sh create mode 100755 tools/testing/selftests/ublk/test_stress_08.sh create mode 100755 tools/testing/selftests/ublk/test_stress_09.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index f2da8b403537..520e18e224f2 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -25,6 +25,8 @@ TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_batch_01.sh + TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh @@ -51,6 +53,8 @@ TEST_PROGS += test_stress_04.sh TEST_PROGS += test_stress_05.sh TEST_PROGS += test_stress_06.sh TEST_PROGS += test_stress_07.sh +TEST_PROGS += test_stress_08.sh +TEST_PROGS += test_stress_09.sh TEST_FILES := settings diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index c77205bac7a9..5d84000872a0 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1593,7 +1593,8 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_PER_IO_DAEMON), FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON), FEAT_NAME(UBLK_F_INTEGRITY), - FEAT_NAME(UBLK_F_SAFE_STOP_DEV) + FEAT_NAME(UBLK_F_SAFE_STOP_DEV), + FEAT_NAME(UBLK_F_BATCH_IO), }; struct ublk_dev *dev; __u64 features = 0; @@ -1691,6 +1692,7 @@ static void __cmd_create_help(char *exe, bool 
recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); + printf("\t[--batch|-b]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1763,6 +1765,7 @@ int main(int argc, char *argv[]) { "csum_type", 1, NULL, 0 }, { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, + { "batch", 0, NULL, 'b'}, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1785,12 +1788,15 @@ int main(int argc, char *argv[]) opterr = 0; optind = 2; - while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu", + while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub", longopts, &option_idx)) != -1) { switch (opt) { case 'a': ctx.all = 1; break; + case 'b': + ctx.flags |= UBLK_F_BATCH_IO; + break; case 'n': ctx.dev_id = strtol(optarg, NULL, 10); break; @@ -1895,6 +1901,11 @@ int main(int argc, char *argv[]) } } + if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) { + ublk_err("per_io_task and F_BATCH_IO conflict\n"); + return -EINVAL; + } + /* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */ if (ctx.auto_zc_fallback && !((ctx.flags & UBLK_F_AUTO_BUF_REG) && diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh new file mode 100755 index 000000000000..9fa9fff5c62f --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_01" +ERR_CODE=0 + +if ! 
_have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test basic function of UBLK_F_BATCH_IO" + +_create_backfile 0 256M +_create_backfile 1 256M + +dev_id=$(_add_ublk_dev -t loop -q 2 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +if ! _mkfs_mount_test /dev/ublkb"${dev_id}"; then + _cleanup_test "generic" + _show_result $TID 255 +fi + +dev_id=$(_add_ublk_dev -t stripe -b --auto_zc "${UBLK_BACKFILES[0]}" "${UBLK_BACKFILES[1]}") +_check_add_dev $TID $? +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index baf5b156193d..be2292822bbe 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -26,6 +26,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -b & +ublk_run_recover_test -t loop -q 2 -r 1 -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 & ublk_run_recover_test -t loop -q 2 -r 1 "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 7b5083afc02a..9b7f71c16d82 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -30,6 +30,11 @@ _create_backfile 0 256M _create_backfile 1 128M _create_backfile 2 128M +ublk_run_recover_test -t null -q 2 -r 1 -z -b & +ublk_run_recover_test -t loop -q 2 -r 1 -z -b "${UBLK_BACKFILES[0]}" & +ublk_run_recover_test -t stripe -q 2 -r 1 -z -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +wait + ublk_run_recover_test -t null -q 2 -r 1 -z & 
ublk_run_recover_test -t loop -q 2 -r 1 -z "${UBLK_BACKFILES[0]}" & ublk_run_recover_test -t stripe -q 2 -r 1 -z "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh new file mode 100755 index 000000000000..190db0b4f2ad --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_08" +ERR_CODE=0 + +ublk_io_and_remove() +{ + run_io_and_remove "$@" + ERR_CODE=$? + if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and remove device(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_remove 8G -t null -q 4 -b & +ublk_io_and_remove 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_remove 256M -t stripe -q 4 --auto_zc -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_remove 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh new file mode 100755 index 000000000000..1b6bdb31da03 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_09" +ERR_CODE=0 + +ublk_io_and_kill_daemon() +{ + run_io_and_kill_daemon "$@" + ERR_CODE=$? 
+ if [ ${ERR_CODE} -ne 0 ]; then + echo "$TID failure: $*" + _show_result $TID $ERR_CODE + fi +} + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "ZERO_COPY"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "AUTO_BUF_REG"; then + exit "$UBLK_SKIP_CODE" +fi +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "stress" "run IO and kill ublk server(zero copy)" + +_create_backfile 0 256M +_create_backfile 1 128M +_create_backfile 2 128M + +ublk_io_and_kill_daemon 8G -t null -q 4 -z -b & +ublk_io_and_kill_daemon 256M -t loop -q 4 --auto_zc -b "${UBLK_BACKFILES[0]}" & +ublk_io_and_kill_daemon 256M -t stripe -q 4 -b "${UBLK_BACKFILES[1]}" "${UBLK_BACKFILES[2]}" & +ublk_io_and_kill_daemon 8G -t null -q 4 -z --auto_zc --auto_zc_fallback -b & +wait + +_cleanup_test "stress" +_show_result $TID $ERR_CODE From e8cd481cc665d5db8e918e84740db22bc213059e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jan 2026 22:18:57 +0800 Subject: [PATCH 078/162] selftests: ublk: support arbitrary threads/queues combination Enable flexible thread-to-queue mapping in batch I/O mode to support arbitrary combinations of threads and queues, improving resource utilization and scalability. 
Key improvements: - Support N:M thread-to-queue mapping (previously limited to 1:1) - Dynamic buffer allocation based on actual queue assignment per thread - Thread-safe queue preparation with spinlock protection - Intelligent buffer index calculation for multi-queue scenarios - Enhanced validation for thread/queue combination constraints Implementation details: - Add q_thread_map matrix to track queue-to-thread assignments - Dynamic allocation of commit and fetch buffers per thread - Round-robin queue assignment algorithm for load balancing - Per-queue spinlock to prevent race conditions during prep - Updated buffer index calculation using queue position within thread This enables efficient configurations like: - Any other N:M combinations for optimal resource matching Testing: - Added test_batch_02.sh: 4 threads vs 1 queue - Added test_batch_03.sh: 1 thread vs 4 queues - Validates correctness across different mapping scenarios Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/batch.c | 199 +++++++++++++++--- tools/testing/selftests/ublk/kublk.c | 49 ++++- tools/testing/selftests/ublk/kublk.h | 40 +++- tools/testing/selftests/ublk/test_batch_02.sh | 30 +++ tools/testing/selftests/ublk/test_batch_03.sh | 30 +++ 6 files changed, 302 insertions(+), 48 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_batch_02.sh create mode 100755 tools/testing/selftests/ublk/test_batch_03.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 520e18e224f2..e39a6f871fcc 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -26,6 +26,8 @@ TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh +TEST_PROGS += test_batch_02.sh +TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh diff --git a/tools/testing/selftests/ublk/batch.c 
b/tools/testing/selftests/ublk/batch.c index 5f9587210b12..a54025b00917 100644 --- a/tools/testing/selftests/ublk/batch.c +++ b/tools/testing/selftests/ublk/batch.c @@ -76,6 +76,7 @@ static void free_batch_commit_buf(struct ublk_thread *t) free(t->commit_buf); } allocator_deinit(&t->commit_buf_alloc); + free(t->commit); } static int alloc_batch_commit_buf(struct ublk_thread *t) @@ -84,7 +85,13 @@ static int alloc_batch_commit_buf(struct ublk_thread *t) unsigned int total = buf_size * t->nr_commit_buf; unsigned int page_sz = getpagesize(); void *buf = NULL; - int ret; + int i, ret, j = 0; + + t->commit = calloc(t->nr_queues, sizeof(*t->commit)); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) + t->commit[j++].q_id = i; + } allocator_init(&t->commit_buf_alloc, t->nr_commit_buf); @@ -107,6 +114,17 @@ fail: return ret; } +static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t) +{ + int i; + int ret = 0; + + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) + ret += !!t->q_map[i]; + + return ret; +} + void ublk_batch_prepare(struct ublk_thread *t) { /* @@ -119,10 +137,13 @@ void ublk_batch_prepare(struct ublk_thread *t) */ struct ublk_queue *q = &t->dev->q[0]; + /* cache nr_queues because we don't support dynamic load-balance yet */ + t->nr_queues = ublk_thread_nr_queues(t); + t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev); t->commit_buf_size = ublk_commit_buf_size(t); t->commit_buf_start = t->nr_bufs; - t->nr_commit_buf = 2; + t->nr_commit_buf = 2 * t->nr_queues; t->nr_bufs += t->nr_commit_buf; t->cmd_flags = 0; @@ -144,11 +165,12 @@ static void free_batch_fetch_buf(struct ublk_thread *t) { int i; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + for (i = 0; i < t->nr_fetch_bufs; i++) { io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i); munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size); free(t->fetch[i].fetch_buf); } + free(t->fetch); } static int alloc_batch_fetch_buf(struct ublk_thread *t) @@ 
-159,7 +181,12 @@ static int alloc_batch_fetch_buf(struct ublk_thread *t) int ret; int i = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) { + /* double fetch buffer for each queue */ + t->nr_fetch_bufs = t->nr_queues * 2; + t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch)); + + /* allocate one buffer for each queue */ + for (i = 0; i < t->nr_fetch_bufs; i++) { t->fetch[i].fetch_buf_size = buf_size; if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz, @@ -185,7 +212,7 @@ int ublk_batch_alloc_buf(struct ublk_thread *t) { int ret; - ublk_assert(t->nr_commit_buf < 16); + ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES); ret = alloc_batch_commit_buf(t); if (ret) @@ -271,13 +298,20 @@ static void ublk_batch_queue_fetch(struct ublk_thread *t, t->fetch[buf_idx].fetch_buf_off = 0; } -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q) +void ublk_batch_start_fetch(struct ublk_thread *t) { int i; + int j = 0; - for (i = 0; i < UBLKS_T_NR_FETCH_BUF; i++) - ublk_batch_queue_fetch(t, q, i); + for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { + if (t->q_map[i]) { + struct ublk_queue *q = &t->dev->q[i]; + + /* submit two fetch commands for each queue */ + ublk_batch_queue_fetch(t, q, j++); + ublk_batch_queue_fetch(t, q, j++); + } + } } static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, @@ -317,7 +351,7 @@ static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t, return buf_idx; } -int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) { unsigned short nr_elem = q->q_depth; unsigned short buf_idx = ublk_alloc_commit_buf(t); @@ -354,6 +388,22 @@ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) return 0; } +int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q) +{ + int ret = 0; + + pthread_spin_lock(&q->lock); + if (q->flags & UBLKS_Q_PREPARED) + goto 
unlock; + ret = __ublk_batch_queue_prep_io_cmds(t, q); + if (!ret) + q->flags |= UBLKS_Q_PREPARED; +unlock: + pthread_spin_unlock(&q->lock); + + return ret; +} + static void ublk_batch_compl_commit_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe, unsigned op) @@ -401,59 +451,89 @@ void ublk_batch_compl_cmd(struct ublk_thread *t, } } -void ublk_batch_commit_io_cmds(struct ublk_thread *t) +static void __ublk_batch_commit_io_cmds(struct ublk_thread *t, + struct batch_commit_buf *cb) { struct io_uring_sqe *sqe; unsigned short buf_idx; - unsigned short nr_elem = t->commit.done; + unsigned short nr_elem = cb->done; /* nothing to commit */ if (!nr_elem) { - ublk_free_commit_buf(t, t->commit.buf_idx); + ublk_free_commit_buf(t, cb->buf_idx); return; } ublk_io_alloc_sqes(t, &sqe, 1); - buf_idx = t->commit.buf_idx; - sqe->addr = (__u64)t->commit.elem; + buf_idx = cb->buf_idx; + sqe->addr = (__u64)cb->elem; sqe->len = nr_elem * t->commit_buf_elem_size; /* commit isn't per-queue command */ - ublk_init_batch_cmd(t, t->commit.q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, + ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS, t->commit_buf_elem_size, nr_elem, buf_idx); ublk_setup_commit_sqe(t, sqe, buf_idx); } -static void ublk_batch_init_commit(struct ublk_thread *t, - unsigned short buf_idx) +void ublk_batch_commit_io_cmds(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) { + struct batch_commit_buf *cb = &t->commit[i]; + + if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX) + __ublk_batch_commit_io_cmds(t, cb); + } + +} + +static void __ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb, + unsigned short buf_idx) { /* so far only support 1:1 queue/thread mapping */ - t->commit.q_id = t->idx; - t->commit.buf_idx = buf_idx; - t->commit.elem = ublk_get_commit_buf(t, buf_idx); - t->commit.done = 0; - t->commit.count = t->commit_buf_size / + cb->buf_idx = buf_idx; + cb->elem = ublk_get_commit_buf(t, buf_idx); + cb->done = 0; + 
cb->count = t->commit_buf_size / t->commit_buf_elem_size; } -void ublk_batch_prep_commit(struct ublk_thread *t) +/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */ +static void ublk_batch_init_commit(struct ublk_thread *t, + struct batch_commit_buf *cb) { unsigned short buf_idx = ublk_alloc_commit_buf(t); ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX); - ublk_batch_init_commit(t, buf_idx); + ublk_assert(!ublk_batch_commit_prepared(cb)); + + __ublk_batch_init_commit(t, cb, buf_idx); +} + +void ublk_batch_prep_commit(struct ublk_thread *t) +{ + int i; + + for (i = 0; i < t->nr_queues; i++) + t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX; } void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) { - struct batch_commit_buf *cb = &t->commit; - struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(cb->elem + - cb->done * t->commit_buf_elem_size); + unsigned q_t_idx = ublk_queue_idx_in_thread(t, q); + struct batch_commit_buf *cb = &t->commit[q_t_idx]; + struct ublk_batch_elem *elem; struct ublk_io *io = &q->ios[tag]; - ublk_assert(q->q_id == t->commit.q_id); + if (!ublk_batch_commit_prepared(cb)) + ublk_batch_init_commit(t, cb); + ublk_assert(q->q_id == cb->q_id); + + elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size); elem->tag = tag; elem->buf_index = ublk_batch_io_buf_idx(t, q, tag); elem->result = res; @@ -464,3 +544,64 @@ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, cb->done += 1; ublk_assert(cb->done <= cb->count); } + +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues) +{ + int i, j; + + /* + * Setup round-robin queue-to-thread mapping for arbitrary N:M combinations. + * + * This algorithm distributes queues across threads (and threads across queues) + * in a balanced round-robin fashion to ensure even load distribution. 
+ * + * Examples: + * - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3] + * - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1] + * - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping) + * + * Phase 1: Mark which queues each thread handles (boolean mapping) + */ + for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) { + q_thread_map[j % nthreads][i % queues] = 1; + } + + /* + * Phase 2: Convert boolean mapping to sequential indices within each thread. + * + * Transform from: q_thread_map[thread][queue] = 1 (handles queue) + * To: q_thread_map[thread][queue] = N (queue index within thread) + * + * This allows each thread to know the local index of each queue it handles, + * which is essential for buffer allocation and management. For example: + * - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2 + * - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2 + */ + for (j = 0; j < nthreads; j++) { + unsigned char seq = 1; + + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + q_thread_map[j][i] = seq++; + } + } + +#if 0 + for (j = 0; j < nthreads; j++) { + printf("thread %0d: ", j); + for (i = 0; i < queues; i++) { + if (q_thread_map[j][i]) + printf("%03u ", i); + } + printf("\n"); + } + printf("\n"); + for (j = 0; j < nthreads; j++) { + for (i = 0; i < queues; i++) { + printf("%03u ", q_thread_map[j][i]); + } + printf("\n"); + } +#endif +} diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 5d84000872a0..2da37557e1a9 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -455,6 +455,7 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags, int cmd_buf_size, io_buf_size, integrity_size; unsigned long off; + pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE); q->tgt_ops = dev->tgt.ops; q->flags = 0; q->q_depth = depth; @@ -521,7 +522,7 @@ static int ublk_thread_init(struct 
ublk_thread *t, unsigned long long extra_flag /* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */ if (ublk_dev_batch_io(dev)) - cq_depth += dev->dev_info.queue_depth; + cq_depth += dev->dev_info.queue_depth * 2; ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth, IORING_SETUP_COOP_TASKRUN | @@ -957,6 +958,7 @@ struct ublk_thread_info { sem_t *ready; cpu_set_t *affinity; unsigned long long extra_flags; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES]; }; static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info) @@ -970,14 +972,18 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) { int i; - /* setup all queues in the 1st thread */ for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) { struct ublk_queue *q = &t->dev->q[i]; int ret; + /* + * Only prepare io commands in the mapped thread context, + * otherwise io command buffer index may not work as expected + */ + if (t->q_map[i] == 0) + continue; + ret = ublk_batch_queue_prep_io_cmds(t, q); - ublk_assert(ret == 0); - ret = ublk_process_io(t); ublk_assert(ret >= 0); } } @@ -991,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf int dev_id = info->dev->dev_info.dev_id; int ret; + /* Copy per-thread queue mapping into thread-local variable */ + if (info->q_thread_map) + memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map)); + ret = ublk_thread_init(&t, info->extra_flags); if (ret) { ublk_err("ublk dev %d thread %u init failed\n", @@ -1006,12 +1016,8 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf /* submit all io commands to ublk driver */ ublk_submit_fetch_commands(&t); } else { - struct ublk_queue *q = &t.dev->q[t.idx]; - - /* prepare all io commands in the 1st thread context */ - if (!t.idx) - ublk_batch_setup_queues(&t); - ublk_batch_start_fetch(&t, q); + ublk_batch_setup_queues(&t); + ublk_batch_start_fetch(&t); } do { @@ -1085,6 +1091,7 @@ static int ublk_start_daemon(const 
struct dev_ctx *ctx, struct ublk_dev *dev) struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t *affinity_buf; + unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; void *thread_ret; sem_t ready; int ret, i; @@ -1104,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) if (ret) return ret; + if (ublk_dev_batch_io(dev)) { + q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map)); + if (!q_thread_map) { + ret = -ENOMEM; + goto fail; + } + ublk_batch_setup_map(q_thread_map, dev->nthreads, + dinfo->nr_hw_queues); + } + if (ctx->auto_zc_fallback) extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK; if (ctx->no_ublk_fixed_fd) @@ -1127,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) tinfo[i].idx = i; tinfo[i].ready = &ready; tinfo[i].extra_flags = extra_flags; + tinfo[i].q_thread_map = q_thread_map; /* * If threads are not tied 1:1 to queues, setting thread @@ -1146,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) for (i = 0; i < dev->nthreads; i++) sem_wait(&ready); free(affinity_buf); + free(q_thread_map); /* everything is fine now, start us */ if (ctx->recovery) @@ -1314,7 +1333,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx) goto fail; } - if (nthreads != nr_queues && !ctx->per_io_tasks) { + if (nthreads != nr_queues && (!ctx->per_io_tasks && + !(ctx->flags & UBLK_F_BATCH_IO))) { ublk_err("%s: threads %u must be same as queues %u if " "not using per_io_tasks\n", __func__, nthreads, nr_queues); @@ -1940,6 +1960,13 @@ int main(int argc, char *argv[]) return -EINVAL; } + if ((ctx.flags & UBLK_F_AUTO_BUF_REG) && + (ctx.flags & UBLK_F_BATCH_IO) && + (ctx.nthreads > ctx.nr_hw_queues)) { + ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n"); + return -EINVAL; + } + i = optind; while (i < argc && ctx.nr_files < MAX_BACK_FILES) { ctx.files[ctx.nr_files++] = argv[i++]; diff --git 
a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 950e99c02e8b..ca97deb5e208 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -173,13 +173,17 @@ struct ublk_queue { const struct ublk_tgt_ops *tgt_ops; struct ublksrv_io_desc *io_cmd_buf; -/* borrow one bit of ublk uapi flags, which may never be used */ +/* borrow three bit of ublk uapi flags, which may never be used */ #define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63) #define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62) +#define UBLKS_Q_PREPARED (1ULL << 61) __u64 flags; int ublk_fd; /* cached ublk char device fd */ __u8 metadata_size; struct ublk_io ios[UBLK_QUEUE_DEPTH]; + + /* used for prep io commands */ + pthread_spinlock_t lock; }; /* align with `ublk_elem_header` */ @@ -206,8 +210,12 @@ struct batch_fetch_buf { }; struct ublk_thread { + /* Thread-local copy of queue-to-thread mapping for this thread */ + unsigned char q_map[UBLK_MAX_QUEUES]; + struct ublk_dev *dev; - unsigned idx; + unsigned short idx; + unsigned short nr_queues; #define UBLKS_T_STOPPING (1U << 0) #define UBLKS_T_IDLE (1U << 1) @@ -230,10 +238,10 @@ struct ublk_thread { void *commit_buf; #define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1) struct allocator commit_buf_alloc; - struct batch_commit_buf commit; + struct batch_commit_buf *commit; /* FETCH_IO_CMDS buffer */ -#define UBLKS_T_NR_FETCH_BUF 2 - struct batch_fetch_buf fetch[UBLKS_T_NR_FETCH_BUF]; + unsigned short nr_fetch_bufs; + struct batch_fetch_buf *fetch; struct io_uring ring; }; @@ -512,6 +520,21 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q) return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q); } +static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb) +{ + return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX; +} + +static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t, + const struct ublk_queue *q) +{ + unsigned char idx; + + idx = 
t->q_map[q->q_id]; + ublk_assert(idx != 0); + return idx - 1; +} + /* * Each IO's buffer index has to be calculated by this helper for * UBLKS_T_BATCH_IO @@ -520,14 +543,13 @@ static inline unsigned short ublk_batch_io_buf_idx( const struct ublk_thread *t, const struct ublk_queue *q, unsigned tag) { - return tag; + return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag; } /* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */ int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q); /* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */ -void ublk_batch_start_fetch(struct ublk_thread *t, - struct ublk_queue *q); +void ublk_batch_start_fetch(struct ublk_thread *t); /* Handle completion of batch I/O commands (prep/commit) */ void ublk_batch_compl_cmd(struct ublk_thread *t, const struct io_uring_cqe *cqe); @@ -545,6 +567,8 @@ void ublk_batch_commit_io_cmds(struct ublk_thread *t); /* Add a completed I/O operation to the current batch commit buffer */ void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res); +void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES], + int nthreads, int queues); static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q, unsigned tag, int res) diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh new file mode 100755 index 000000000000..b477f91359e1 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_02" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 4_threads vs. 
1_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 1 --nthreads 4 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh new file mode 100755 index 000000000000..13a2b3d3a1b9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="batch_03" +ERR_CODE=0 + +if ! _have_feature "BATCH_IO"; then + exit "$UBLK_SKIP_CODE" +fi + +if ! _have_program fio; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_BATCH_IO with 1_threads vs. 4_queues" + +_create_backfile 0 512M + +dev_id=$(_add_ublk_dev -t loop -q 4 --nthreads 1 -b "${UBLK_BACKFILES[0]}") +_check_add_dev $TID $? + +# run fio over the ublk disk +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite \ + --iodepth=32 --size=100M --numjobs=4 > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "generic" +_show_result $TID $ERR_CODE From dbc635c4be7eba1d0e0fe0275a289ee3ccc63d72 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 17:15:42 +0800 Subject: [PATCH 079/162] ublk: move ublk_mark_io_ready() out of __ublk_fetch() ublk_batch_prep_io() calls __ublk_fetch() while holding io->lock spinlock. When the last IO makes the device ready, ublk_mark_io_ready() tries to acquire ub->cancel_mutex which can sleep, causing a sleeping-while-atomic bug. Fix by moving ublk_mark_io_ready() out of __ublk_fetch() and into the callers (ublk_fetch and ublk_batch_prep_io) after the spinlock is released. 
Reported-by: Jens Axboe Fixes: b256795b3606 ("ublk: handle UBLK_U_IO_PREP_IO_CMDS") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1e374ecbf0f1..31279a8238b8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3064,7 +3064,6 @@ static int __ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, WRITE_ONCE(io->task, NULL); else WRITE_ONCE(io->task, get_task_struct(current)); - ublk_mark_io_ready(ub, q_id); return 0; } @@ -3083,6 +3082,8 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, ret = __ublk_fetch(cmd, ub, io, q_id); if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); + if (!ret) + ublk_mark_io_ready(ub, q_id); mutex_unlock(&ub->mutex); return ret; } @@ -3484,6 +3485,9 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq, io->buf = buf; ublk_io_unlock(io); + if (!ret) + ublk_mark_io_ready(data->ub, ubq->q_id); + return ret; } From e4c4bfec2bb8db9963d87e8ccdf89cd9e485d7b6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 17:15:43 +0800 Subject: [PATCH 080/162] ublk: fix canceling flag handling in batch I/O recovery Two issues with ubq->canceling flag handling: 1) In ublk_queue_reset_io_flags(), ubq->canceling is set outside cancel_lock, violating the locking requirement. Move it inside the spinlock-protected section. 2) In ublk_batch_unprep_io(), when rolling back after a batch prep failure, if the queue became ready during prep (which cleared canceling), the flag is not restored when the queue becomes not-ready again. This allows new requests to be queued to uninitialized IO slots. Fix by restoring ubq->canceling = true under cancel_lock when the queue transitions from ready to not-ready during rollback. 
Reported-by: Jens Axboe Fixes: 3f3850785594 ("ublk: fix batch I/O recovery -ENODEV error") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 31279a8238b8..31fda782c47c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2806,9 +2806,9 @@ static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) spin_lock(&ubq->cancel_lock); for (j = 0; j < ubq->q_depth; j++) ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; + ubq->canceling = false; spin_unlock(&ubq->cancel_lock); ubq->fail_io = false; - ubq->canceling = false; } /* device can only be started after all IOs are ready */ @@ -3435,10 +3435,15 @@ static int ublk_batch_unprep_io(struct ublk_queue *ubq, /* * If queue was ready before this decrement, it won't be anymore, - * so we need to decrement the queue ready count too. + * so we need to decrement the queue ready count and restore the + * canceling flag to prevent new requests from being queued. */ - if (ublk_queue_ready(ubq)) + if (ublk_queue_ready(ubq)) { data->ub->nr_queue_ready--; + spin_lock(&ubq->cancel_lock); + ubq->canceling = true; + spin_unlock(&ubq->cancel_lock); + } ubq->nr_io_ready--; ublk_io_lock(io); From e4d3fc6a22f53e5bbe51e28b43cb32bc130d9f87 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 17:15:44 +0800 Subject: [PATCH 081/162] selftests: ublk: fix test names Fix the two added test names.
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_stress_08.sh | 2 +- tools/testing/selftests/ublk/test_stress_09.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 190db0b4f2ad..9abb50ee3d00 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" +TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 1b6bdb31da03..87b92b0a2410 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" +TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() From f50af896932f5edb1ff7b407753ecfa285c30b7a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Jan 2026 21:51:58 +0800 Subject: [PATCH 082/162] ublk: rename auto buffer registration helpers Rename the auto buffer registration functions for clarity: - __ublk_do_auto_buf_reg() -> ublk_auto_buf_register() - ublk_prep_auto_buf_reg_io() -> ublk_auto_buf_io_setup() - ublk_do_auto_buf_reg() -> ublk_auto_buf_dispatch() Add comments documenting the locking requirements for each function. No functional change. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 31fda782c47c..7981decd1cee 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1509,10 +1509,16 @@ enum auto_buf_reg_res { AUTO_BUF_REG_OK, }; -static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, - struct request *req, struct ublk_io *io, - struct io_uring_cmd *cmd, - enum auto_buf_reg_res res) +/* + * Setup io state after auto buffer registration. + * + * Must be called after ublk_auto_buf_register() is done. + * Caller must hold io->lock in batch context. + */ +static void ublk_auto_buf_io_setup(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + enum auto_buf_reg_res res) { if (res == AUTO_BUF_REG_OK) { io->task_registered_buffers = 1; @@ -1523,8 +1529,9 @@ static void ublk_prep_auto_buf_reg_io(const struct ublk_queue *ubq, __ublk_prep_compl_io_cmd(io, req); } +/* Register request bvec to io_uring for auto buffer registration. */ static enum auto_buf_reg_res -__ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, +ublk_auto_buf_register(const struct ublk_queue *ubq, struct request *req, struct ublk_io *io, struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1544,15 +1551,21 @@ __ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, return AUTO_BUF_REG_OK; } -static void ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, - struct ublk_io *io, struct io_uring_cmd *cmd, - unsigned int issue_flags) +/* + * Dispatch IO to userspace with auto buffer registration. + * + * Only called in non-batch context from task work, io->lock not held. 
+ */ +static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq, + struct request *req, struct ublk_io *io, + struct io_uring_cmd *cmd, + unsigned int issue_flags) { - enum auto_buf_reg_res res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + enum auto_buf_reg_res res = ublk_auto_buf_register(ubq, req, io, cmd, issue_flags); if (res != AUTO_BUF_REG_FAIL) { - ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_auto_buf_io_setup(ubq, req, io, cmd, res); io_uring_cmd_done(cmd, UBLK_IO_RES_OK, issue_flags); } } @@ -1627,7 +1640,7 @@ static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) return; if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { - ublk_do_auto_buf_reg(ubq, req, io, io->cmd, issue_flags); + ublk_auto_buf_dispatch(ubq, req, io, io->cmd, issue_flags); } else { ublk_init_req_ref(ubq, io); ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); @@ -1648,7 +1661,7 @@ static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, return false; if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req)) { - res = __ublk_do_auto_buf_reg(ubq, req, io, cmd, + res = ublk_auto_buf_register(ubq, req, io, cmd, data->issue_flags); if (res == AUTO_BUF_REG_FAIL) @@ -1656,7 +1669,7 @@ static bool __ublk_batch_prep_dispatch(struct ublk_queue *ubq, } ublk_io_lock(io); - ublk_prep_auto_buf_reg_io(ubq, req, io, cmd, res); + ublk_auto_buf_io_setup(ubq, req, io, cmd, res); ublk_io_unlock(io); return true; From 8e5bcc3a955a2cc4460b391f55d3b49905eb248e Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Sun, 25 Jan 2026 08:57:46 +0000 Subject: [PATCH 083/162] selftests: ublk: add missing gitignore for metadata_size binary A new utility metadata_size was added in commit 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") but it was not added to .gitignore. Fix that by adding it there. While at it sort all entries alphabetically and add a SPDX license header. 
Reviewed-by: Caleb Sander Mateos Fixes: 261b67f4e347 ("selftests: ublk: add utility to get block device metadata size") Signed-off-by: Alexander Atanasov Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/.gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore index 8b2871ea7751..e17bd28f27e0 100644 --- a/tools/testing/selftests/ublk/.gitignore +++ b/tools/testing/selftests/ublk/.gitignore @@ -1,3 +1,5 @@ -kublk -/tools +# SPDX-License-Identifier: GPL-2.0 *-verify.state +/tools +kublk +metadata_size From 2d9f7150ac197ce79c9c917a004d4cf0b26ad7e0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 01:12:29 +0800 Subject: [PATCH 084/162] md/raid5: fix raid5_run() to return error when log_init() fails Since commit f63f17350e53 ("md/raid5: use the atomic queue limit update APIs"), the abort path in raid5_run() returns 'ret' instead of -EIO. However, if log_init() fails, 'ret' is still 0 from the previous successful call, causing raid5_run() to return success despite the failure. Fix this by capturing the return value from log_init(). 
Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-2-yukuai@fnnas.com Fixes: f63f17350e53 ("md/raid5: use the atomic queue limit update APIs") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202601130531.LGfcZsa4-lkp@intel.com/ Signed-off-by: Yu Kuai Reviewed-by: Li Nan Reviewed-by: Xiao Ni Reviewed-by: Christoph Hellwig --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8dc98f545969..a85878b009f9 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8057,7 +8057,8 @@ static int raid5_run(struct mddev *mddev) goto abort; } - if (log_init(conf, journal_dev, raid5_has_ppl(conf))) + ret = log_init(conf, journal_dev, raid5_has_ppl(conf)); + if (ret) goto abort; return 0; From fba4a980403d2f489bc680dbff7d7d2514e669f9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 01:12:30 +0800 Subject: [PATCH 085/162] md: merge mddev has_superblock into mddev_flags There is no need to use a separate field in struct mddev; there are no functional changes. Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-3-yukuai@fnnas.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md.c | 6 +++--- drivers/md/md.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6d73f6e196a9..bf7666e227a1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6502,7 +6502,7 @@ int md_run(struct mddev *mddev) * the only valid external interface is through the md * device. */ - mddev->has_superblocks = false; + clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags); rdev_for_each(rdev, mddev) { if (test_bit(Faulty, &rdev->flags)) continue; @@ -6515,7 +6515,7 @@ } if (rdev->sb_page)
* We don't want the data to overlap the metadata, @@ -9125,7 +9125,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) rcu_read_unlock(); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); - if (!mddev->has_superblocks) + if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags)) return; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 6985f2829bbd..b4c9aa600edd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -340,6 +340,7 @@ struct md_cluster_operations; * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted + * @MD_HAS_SUPERBLOCK: There is persistence sb in member disks. * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -356,6 +357,7 @@ enum mddev_flags { MD_BROKEN, MD_DO_DELETE, MD_DELETED, + MD_HAS_SUPERBLOCK, }; enum mddev_sb_flags { @@ -623,7 +625,6 @@ struct mddev { /* The sequence number for sync thread */ atomic_t sync_seq; - bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; }; From 4f6d2e648cbe963b328cb8815290676da3866434 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 01:12:31 +0800 Subject: [PATCH 086/162] md: merge mddev faillast_dev into mddev_flags There is not need to use a separate field in struct mddev, there are no functional changes. 
Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-4-yukuai@fnnas.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md.c | 10 ++++++---- drivers/md/md.h | 3 ++- drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 4 ++-- drivers/md/raid5.c | 5 ++++- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index bf7666e227a1..b955eba7f461 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5864,11 +5864,11 @@ __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show, static ssize_t fail_last_dev_show(struct mddev *mddev, char *page) { - return sprintf(page, "%d\n", mddev->fail_last_dev); + return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags)); } /* - * Setting fail_last_dev to true to allow last device to be forcibly removed + * Setting MD_FAILLAST_DEV to allow last device to be forcibly removed * from RAID1/RAID10. */ static ssize_t @@ -5881,8 +5881,10 @@ fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len) if (ret) return ret; - if (value != mddev->fail_last_dev) - mddev->fail_last_dev = value; + if (value) + set_bit(MD_FAILLAST_DEV, &mddev->flags); + else + clear_bit(MD_FAILLAST_DEV, &mddev->flags); return len; } diff --git a/drivers/md/md.h b/drivers/md/md.h index b4c9aa600edd..297a104fba88 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -341,6 +341,7 @@ struct md_cluster_operations; * @MD_BROKEN: This is used to stop writes and mark array as failed. * @MD_DELETED: This device is being deleted * @MD_HAS_SUPERBLOCK: There is persistence sb in member disks. + * @MD_FAILLAST_DEV: Allow last rdev to be removed. 
* * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -358,6 +359,7 @@ enum mddev_flags { MD_DO_DELETE, MD_DELETED, MD_HAS_SUPERBLOCK, + MD_FAILLAST_DEV, }; enum mddev_sb_flags { @@ -625,7 +627,6 @@ struct mddev { /* The sequence number for sync thread */ atomic_t sync_seq; - bool fail_last_dev:1; bool serialize_policy:1; }; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 985c377356eb..4d567fcf6a7c 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -27,7 +27,8 @@ module_param(default_layout, int, 0644); (1L << MD_JOURNAL_CLEAN) | \ (1L << MD_FAILFAST_SUPPORTED) |\ (1L << MD_HAS_PPL) | \ - (1L << MD_HAS_MULTIPLE_PPLS)) + (1L << MD_HAS_MULTIPLE_PPLS) | \ + (1L << MD_FAILLAST_DEV)) /* * inform the user of the raid configuration diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 57d50465eed1..98b5c93810bb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1746,7 +1746,7 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev) * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and - * &mddev->fail_last_dev is off. + * MD_FAILLAST_DEV is not set. */ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) { @@ -1759,7 +1759,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) (conf->raid_disks - mddev->degraded) == 1) { set_bit(MD_BROKEN, &mddev->flags); - if (!mddev->fail_last_dev) { + if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) { conf->recovery_disabled = mddev->recovery_disabled; spin_unlock_irqrestore(&conf->device_lock, flags); return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 84be4cc7e873..09328e032f14 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1990,7 +1990,7 @@ static int enough(struct r10conf *conf, int ignore) * - &mddev->degraded is bumped. * * @rdev is marked as &Faulty excluding case when array is failed and - * &mddev->fail_last_dev is off. 
+ * MD_FAILLAST_DEV is not set. */ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) { @@ -2002,7 +2002,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev) if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { set_bit(MD_BROKEN, &mddev->flags); - if (!mddev->fail_last_dev) { + if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) { spin_unlock_irqrestore(&conf->device_lock, flags); return; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a85878b009f9..055293e56a7e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -56,7 +56,10 @@ #include "md-bitmap.h" #include "raid5-log.h" -#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) +#define UNSUPPORTED_MDDEV_FLAGS \ + ((1L << MD_FAILFAST_SUPPORTED) | \ + (1L << MD_FAILLAST_DEV)) + #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE From 10787568cc1f3f80afc510b2728751989dfa0ae6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 01:12:32 +0800 Subject: [PATCH 087/162] md: merge mddev serialize_policy into mddev_flags There is no need to use a separate field in struct mddev; there are no functional changes.
Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-5-yukuai@fnnas.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md-bitmap.c | 4 ++-- drivers/md/md.c | 20 ++++++++++++-------- drivers/md/md.h | 4 ++-- drivers/md/raid0.c | 3 ++- drivers/md/raid1.c | 4 ++-- drivers/md/raid5.c | 3 ++- 6 files changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 84b7e2af6dba..dbe4c4b9a1da 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2085,7 +2085,7 @@ static void bitmap_destroy(struct mddev *mddev) return; bitmap_wait_behind_writes(mddev); - if (!mddev->serialize_policy) + if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) mddev_destroy_serial_pool(mddev, NULL); mutex_lock(&mddev->bitmap_info.mutex); @@ -2809,7 +2809,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.max_write_behind = backlog; if (!backlog && mddev->serial_info_pool) { /* serial_info_pool is not needed if backlog is zero */ - if (!mddev->serialize_policy) + if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) mddev_destroy_serial_pool(mddev, NULL); } else if (backlog && !mddev->serial_info_pool) { /* serial_info_pool is needed since backlog is not zero */ diff --git a/drivers/md/md.c b/drivers/md/md.c index b955eba7f461..43791d72c0c3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -279,7 +279,8 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev) rdev_for_each(temp, mddev) { if (!rdev) { - if (!mddev->serialize_policy || + if (!test_bit(MD_SERIALIZE_POLICY, + &mddev->flags) || !rdev_need_serial(temp)) rdev_uninit_serial(temp); else @@ -5897,11 +5898,12 @@ static ssize_t serialize_policy_show(struct mddev *mddev, char *page) if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) return sprintf(page, "n/a\n"); else - return sprintf(page, "%d\n", mddev->serialize_policy); + return sprintf(page, "%d\n", + test_bit(MD_SERIALIZE_POLICY, 
&mddev->flags)); } /* - * Setting serialize_policy to true to enforce write IO is not reordered + * Setting MD_SERIALIZE_POLICY enforce write IO is not reordered * for raid1. */ static ssize_t @@ -5914,7 +5916,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; - if (value == mddev->serialize_policy) + if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) return len; err = mddev_suspend_and_lock(mddev); @@ -5926,11 +5928,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; } - if (value) + if (value) { mddev_create_serial_pool(mddev, NULL); - else + set_bit(MD_SERIALIZE_POLICY, &mddev->flags); + } else { mddev_destroy_serial_pool(mddev, NULL); - mddev->serialize_policy = value; + clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); + } unlock: mddev_unlock_and_resume(mddev); return err ?: len; @@ -6867,7 +6871,7 @@ static void __md_stop_writes(struct mddev *mddev) md_update_sb(mddev, 1); } /* disable policy to guarantee rdevs free resources for serialization */ - mddev->serialize_policy = 0; + clear_bit(MD_SERIALIZE_POLICY, &mddev->flags); mddev_destroy_serial_pool(mddev, NULL); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 297a104fba88..6ee18045f41c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -342,6 +342,7 @@ struct md_cluster_operations; * @MD_DELETED: This device is being deleted * @MD_HAS_SUPERBLOCK: There is persistence sb in member disks. * @MD_FAILLAST_DEV: Allow last rdev to be removed. + * @MD_SERIALIZE_POLICY: Enforce write IO is not reordered, just used by raid1. 
* * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -360,6 +361,7 @@ enum mddev_flags { MD_DELETED, MD_HAS_SUPERBLOCK, MD_FAILLAST_DEV, + MD_SERIALIZE_POLICY, }; enum mddev_sb_flags { @@ -626,8 +628,6 @@ struct mddev { /* The sequence number for sync thread */ atomic_t sync_seq; - - bool serialize_policy:1; }; enum recovery_flags { diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 4d567fcf6a7c..d83b2b1c0049 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -28,7 +28,8 @@ module_param(default_layout, int, 0644); (1L << MD_FAILFAST_SUPPORTED) |\ (1L << MD_HAS_PPL) | \ (1L << MD_HAS_MULTIPLE_PPLS) | \ - (1L << MD_FAILLAST_DEV)) + (1L << MD_FAILLAST_DEV) | \ + (1L << MD_SERIALIZE_POLICY)) /* * inform the user of the raid configuration diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 98b5c93810bb..f4c7004888af 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -542,7 +542,7 @@ static void raid1_end_write_request(struct bio *bio) call_bio_endio(r1_bio); } } - } else if (rdev->mddev->serialize_policy) + } else if (test_bit(MD_SERIALIZE_POLICY, &rdev->mddev->flags)) remove_serial(rdev, lo, hi); if (r1_bio->bios[mirror] == NULL) rdev_dec_pending(rdev, conf->mddev); @@ -1644,7 +1644,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); - if (mddev->serialize_policy) + if (test_bit(MD_SERIALIZE_POLICY, &mddev->flags)) wait_for_serialization(rdev, r1_bio); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 055293e56a7e..6d44609f62f3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -58,7 +58,8 @@ #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_FAILFAST_SUPPORTED) | \ - (1L << MD_FAILLAST_DEV)) + (1L << MD_FAILLAST_DEV) | \ + (1L << MD_SERIALIZE_POLICY)) #define cpu_to_group(cpu) cpu_to_node(cpu) From 9340a95d489ab5ff3c2d18c78283211b03a4265a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 
01:12:33 +0800 Subject: [PATCH 088/162] md/raid5: use mempool to allocate stripe_request_ctx On the one hand, stripe_request_ctx is 72 bytes, which is a bit large for a stack variable. On the other hand, the bitmap sectors_to_do has a fixed size, which limits max_hw_sector_kb of a raid5 array to at most 256 * 4k = 1Mb, and this makes full stripe IO impossible for arrays where chunk_size * data_disks is bigger. Allocating ctx at runtime will make it possible to get rid of this limit. Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-6-yukuai@fnnas.com Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md.h | 4 +++ drivers/md/raid1-10.c | 5 ---- drivers/md/raid5.c | 61 +++++++++++++++++++++++++++---------------- drivers/md/raid5.h | 2 ++ 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ee18045f41c..b8c5dec12b62 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -22,6 +22,10 @@ #include #define MaxSector (~(sector_t)0) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 enum md_submodule_type { MD_PERSONALITY = 0, diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 521625756128..c33099925f23 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,11 +3,6 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -/* - * Number of guaranteed raid bios in case of extreme VM load: - */ -#define NR_RAID_BIOS 256 - /* when we get a read error on a read-only array, we redirect to another * device without failing the first device, or trying to over-write to * correct the read error.
To keep track of bad blocks on a per-bio diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6d44609f62f3..2fa63bd2431a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6084,13 +6084,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); - bool on_wq; struct r5conf *conf = mddev->private; - sector_t logical_sector; - struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); + struct stripe_request_ctx *ctx; + sector_t logical_sector; enum stripe_result res; int s, stripe_cnt; + bool on_wq; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -6102,11 +6102,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } /* ret == -EAGAIN, fallback */ - /* - * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, - * we need to flush journal device - */ - ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } md_write_start(mddev, bi); @@ -6129,16 +6124,25 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - ctx.first_sector = logical_sector; - ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; - stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO); + memset(ctx, 0, sizeof(*ctx)); + ctx->first_sector = logical_sector; + ctx->last_sector = bio_end_sector(bi); + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) + ctx->do_flush = true; + + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector, RAID5_STRIPE_SECTORS(conf)); - bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + bitmap_set(ctx->sectors_to_do, 0, stripe_cnt); pr_debug("raid456: %s, logical %llu to %llu\n", 
__func__, - bi->bi_iter.bi_sector, ctx.last_sector); + bi->bi_iter.bi_sector, ctx->last_sector); /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && @@ -6146,6 +6150,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); + mempool_free(ctx, conf->ctx_pool); return true; } md_account_bio(mddev, &bi); @@ -6164,10 +6169,10 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) add_wait_queue(&conf->wait_for_reshape, &wait); on_wq = true; } - s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); + s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf); while (1) { - res = make_stripe_request(mddev, conf, &ctx, logical_sector, + res = make_stripe_request(mddev, conf, ctx, logical_sector, bi); if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) break; @@ -6184,9 +6189,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * raid5_activate_delayed() from making progress * and thus deadlocking. 
*/ - if (ctx.batch_last) { - raid5_release_stripe(ctx.batch_last); - ctx.batch_last = NULL; + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last = NULL; } wait_woken(&wait, TASK_UNINTERRUPTIBLE, @@ -6194,21 +6199,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; } - s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); + s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s); if (s == stripe_cnt) break; - logical_sector = ctx.first_sector + + logical_sector = ctx->first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } if (unlikely(on_wq)) remove_wait_queue(&conf->wait_for_reshape, &wait); - if (ctx.batch_last) - raid5_release_stripe(ctx.batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); if (rw == WRITE) md_write_end(mddev); + + mempool_free(ctx, conf->ctx_pool); if (res == STRIPE_WAIT_RESHAPE) { md_free_cloned_bio(bi); return false; @@ -7378,6 +7385,9 @@ static void free_conf(struct r5conf *conf) bioset_exit(&conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); + + mempool_destroy(conf->ctx_pool); + kfree(conf); } @@ -8061,6 +8071,13 @@ static int raid5_run(struct mddev *mddev) goto abort; } + conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, + sizeof(struct stripe_request_ctx)); + if (!conf->ctx_pool) { + ret = -ENOMEM; + goto abort; + } + ret = log_init(conf, journal_dev, raid5_has_ppl(conf)); if (ret) goto abort; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index eafc6e9ed6ee..6e3f07119fa4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -690,6 +690,8 @@ struct r5conf { struct list_head pending_list; int pending_data_cnt; struct r5pending_data *next_pending_data; + + mempool_t *ctx_pool; }; #if PAGE_SIZE == DEFAULT_STRIPE_SIZE From 4ffe28ed0d7ce7f2f72372cb13152ad37a43ff21 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 15 Jan 2026 01:12:34 +0800 Subject: [PATCH 089/162] md/raid5: make sure max_sectors is not less than 
io_opt Otherwise, even if user issue IO by io_opt, such IO will be split by max_sectors before they are submitted to raid5. For consequence, full stripe IO is impossible. BTW, dm-raid5 is not affected and still have such problem. Link: https://lore.kernel.org/linux-raid/20260114171241.3043364-7-yukuai@fnnas.com Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 38 ++++++++++++++++++++++++++++---------- drivers/md/raid5.h | 1 + 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2fa63bd2431a..84626ad71ffb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -777,14 +777,14 @@ struct stripe_request_ctx { /* last sector in the request */ sector_t last_sector; + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; + /* * bitmap to track stripe sectors that have been added to stripes * add one to account for unaligned requests */ - DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); - - /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ - bool do_flush; + unsigned long sectors_to_do[]; }; /* @@ -6127,7 +6127,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) bi->bi_next = NULL; ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO); - memset(ctx, 0, sizeof(*ctx)); + memset(ctx, 0, conf->ctx_size); ctx->first_sector = logical_sector; ctx->last_sector = bio_end_sector(bi); /* @@ -7743,6 +7743,25 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } +static int raid5_create_ctx_pool(struct r5conf *conf) +{ + struct stripe_request_ctx *ctx; + int size; + + if (mddev_is_dm(conf->mddev)) + size = BITS_TO_LONGS(RAID5_MAX_REQ_STRIPES); + else + size = BITS_TO_LONGS( + queue_max_hw_sectors(conf->mddev->gendisk->queue) >> + RAID5_STRIPE_SHIFT(conf)); + + conf->ctx_size = struct_size(ctx, sectors_to_do, size); + conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, + conf->ctx_size); + + return 
conf->ctx_pool ? 0 : -ENOMEM; +} + static int raid5_set_limits(struct mddev *mddev) { struct r5conf *conf = mddev->private; @@ -7799,6 +7818,8 @@ static int raid5_set_limits(struct mddev *mddev) * Limit the max sectors based on this. */ lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); + if ((lim.max_hw_sectors << 9) < lim.io_opt) + lim.max_hw_sectors = lim.io_opt >> 9; /* No restrictions on the number of segments in the request */ lim.max_segments = USHRT_MAX; @@ -8071,12 +8092,9 @@ static int raid5_run(struct mddev *mddev) goto abort; } - conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS, - sizeof(struct stripe_request_ctx)); - if (!conf->ctx_pool) { - ret = -ENOMEM; + ret = raid5_create_ctx_pool(conf); + if (ret) goto abort; - } ret = log_init(conf, journal_dev, raid5_has_ppl(conf)); if (ret) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 6e3f07119fa4..ddfe65237888 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -692,6 +692,7 @@ struct r5conf { struct r5pending_data *next_pending_data; mempool_t *ctx_pool; + int ctx_size; }; #if PAGE_SIZE == DEFAULT_STRIPE_SIZE From 090856dd8599edfc7699d1ad9bf8069b9745b313 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:49 +0800 Subject: [PATCH 090/162] md/raid1: simplify uptodate handling in end_sync_write In end_sync_write, r1bio state is always set to either R1BIO_WriteError or R1BIO_MadeGood. Consequently, put_sync_write_buf() never takes the 'else' branch that calls md_done_sync(), making the uptodate parameter have no practical effect. Pass 1 to put_sync_write_buf(). A more complete cleanup will be done in a follow-up patch. 
Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-2-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f4c7004888af..3670abf46841 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2080,13 +2080,12 @@ static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) static void end_sync_write(struct bio *bio) { - int uptodate = !bio->bi_status; struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; - if (!uptodate) { + if (bio->bi_status) { abort_sync_write(mddev, r1_bio); set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2099,7 +2098,7 @@ static void end_sync_write(struct bio *bio) set_bit(R1BIO_MadeGood, &r1_bio->state); } - put_sync_write_buf(r1_bio, uptodate); + put_sync_write_buf(r1_bio, 1); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, From 2a5d4549a28da76fa426aaeab0a8561bfc6194c3 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:50 +0800 Subject: [PATCH 091/162] md: factor error handling out of md_done_sync into helper The 'ok' parameter in md_done_sync() is redundant for most callers that always pass 'true'. Factor error handling logic into a separate helper function md_sync_error() to eliminate unnecessary parameter passing and improve code clarity. No functional changes introduced. 
Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-3-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 17 ++++++++++------- drivers/md/md.h | 3 ++- drivers/md/raid1.c | 14 +++++++------- drivers/md/raid10.c | 11 ++++++----- drivers/md/raid5.c | 14 ++++++++------ 5 files changed, 33 insertions(+), 26 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 43791d72c0c3..97b023536afc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9074,20 +9074,23 @@ static bool is_mddev_idle(struct mddev *mddev, int init) return idle; } -void md_done_sync(struct mddev *mddev, int blocks, int ok) +void md_done_sync(struct mddev *mddev, int blocks) { /* another "blocks" (512byte) blocks have been synced */ atomic_sub(blocks, &mddev->recovery_active); wake_up(&mddev->recovery_wait); - if (!ok) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_ERROR, &mddev->recovery); - md_wakeup_thread(mddev->thread); - // stop recovery, signal do_sync .... - } } EXPORT_SYMBOL(md_done_sync); +void md_sync_error(struct mddev *mddev) +{ + // stop recovery, signal do_sync .... + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + set_bit(MD_RECOVERY_ERROR, &mddev->recovery); + md_wakeup_thread(mddev->thread); +} +EXPORT_SYMBOL(md_sync_error); + /* md_write_start(mddev, bi) * If we need to update some array metadata (e.g. 
'active' flag * in superblock) before writing, schedule a superblock update diff --git a/drivers/md/md.h b/drivers/md/md.h index b8c5dec12b62..4c3dc7a6e399 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -918,7 +918,8 @@ extern const char *md_sync_action_name(enum sync_action action); extern void md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); -extern void md_done_sync(struct mddev *mddev, int blocks, int ok); +extern void md_done_sync(struct mddev *mddev, int blocks); +extern void md_sync_error(struct mddev *mddev); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3670abf46841..0c9332d557a5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2062,7 +2062,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) } while (sectors_to_go > 0); } -static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +static void put_sync_write_buf(struct r1bio *r1_bio) { if (atomic_dec_and_test(&r1_bio->remaining)) { struct mddev *mddev = r1_bio->mddev; @@ -2073,7 +2073,7 @@ static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) reschedule_retry(r1_bio); else { put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); + md_done_sync(mddev, s); } } } @@ -2098,7 +2098,7 @@ static void end_sync_write(struct bio *bio) set_bit(R1BIO_MadeGood, &r1_bio->state); } - put_sync_write_buf(r1_bio, 1); + put_sync_write_buf(r1_bio); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -2348,8 +2348,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) || !fix_sync_read_error(r1_bio)) { conf->recovery_disabled = mddev->recovery_disabled; - 
set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_done_sync(mddev, r1_bio->sectors, 0); + md_done_sync(mddev, r1_bio->sectors); + md_sync_error(mddev); put_buf(r1_bio); return; } @@ -2384,7 +2384,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) submit_bio_noacct(wbio); } - put_sync_write_buf(r1_bio, 1); + put_sync_write_buf(r1_bio); } /* @@ -2575,7 +2575,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio } } put_buf(r1_bio); - md_done_sync(conf->mddev, s, 1); + md_done_sync(conf->mddev, s); } static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 09328e032f14..5450dda8aa34 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2276,7 +2276,7 @@ static void end_sync_request(struct r10bio *r10_bio) reschedule_retry(r10_bio); else put_buf(r10_bio); - md_done_sync(mddev, s, 1); + md_done_sync(mddev, s); break; } else { struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; @@ -2452,7 +2452,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) done: if (atomic_dec_and_test(&r10_bio->remaining)) { - md_done_sync(mddev, r10_bio->sectors, 1); + md_done_sync(mddev, r10_bio->sectors); put_buf(r10_bio); } } @@ -3757,7 +3757,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* pretend they weren't skipped, it makes * no important difference in this case */ - md_done_sync(mddev, sectors_skipped, 1); + md_done_sync(mddev, sectors_skipped); return sectors_skipped + nr_sectors; giveup: @@ -4913,7 +4913,8 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) if (handle_reshape_read_error(mddev, r10_bio) < 0) { /* Reshape has been aborted */ - md_done_sync(mddev, r10_bio->sectors, 0); + md_done_sync(mddev, r10_bio->sectors); + md_sync_error(mddev); return; } @@ -5071,7 +5072,7 @@ static 
void end_reshape_request(struct r10bio *r10_bio) { if (!atomic_dec_and_test(&r10_bio->remaining)) return; - md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); + md_done_sync(r10_bio->mddev, r10_bio->sectors); bio_put(r10_bio->master_bio); put_buf(r10_bio); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 84626ad71ffb..bba3cab2cc16 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3727,11 +3727,13 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } - if (abort) - conf->recovery_disabled = - conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); + + if (abort) { + conf->recovery_disabled = conf->mddev->recovery_disabled; + md_sync_error(conf->mddev); + } } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -5161,7 +5163,7 @@ static void handle_stripe(struct stripe_head *sh) if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); @@ -5228,7 +5230,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_reshape); - md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); } if (s.expanding && s.locked == 0 && From 4870b0f59c1ad3aae05734a833fe7c3ae90bec09 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:51 +0800 Subject: [PATCH 092/162] md/raid1,raid10: support narrow_write_error when badblocks is disabled When badblocks.shift < 0 (badblocks disabled), 
narrow_write_error() return false, preventing write error handling. Since narrow_write_error() only splits IO into smaller sizes and re-submits, it can work with badblocks disabled. Adjust to use the logical block size for block_sectors when badblocks is disabled, allowing narrow_write_error() to function in this case. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-4-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 8 ++++---- drivers/md/raid10.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0c9332d557a5..c65f1bb97aa1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2503,17 +2503,17 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i) * We currently own a reference on the rdev. */ - int block_sectors; + int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9; sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; bool ok = true; if (rdev->badblocks.shift < 0) - return false; + block_sectors = lbs; + else + block_sectors = roundup(1 << rdev->badblocks.shift, lbs); - block_sectors = roundup(1 << rdev->badblocks.shift, - bdev_logical_block_size(rdev->bdev) >> 9); sector = r1_bio->sector; sectors = ((sector + block_sectors) & ~(sector_t)(block_sectors - 1)) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5450dda8aa34..deac7741c490 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2790,17 +2790,17 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i) * We currently own a reference to the rdev. 
*/ - int block_sectors; + int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9; sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; bool ok = true; if (rdev->badblocks.shift < 0) - return false; + block_sectors = lbs; + else + block_sectors = roundup(1 << rdev->badblocks.shift, lbs); - block_sectors = roundup(1 << rdev->badblocks.shift, - bdev_logical_block_size(rdev->bdev) >> 9); sector = r10_bio->sector; sectors = ((r10_bio->sector + block_sectors) & ~(sector_t)(block_sectors - 1)) From aa9d12cfa1a514de427c2641911755c11350ee09 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:52 +0800 Subject: [PATCH 093/162] md: break remaining operations on badblocks set failure in narrow_write_error Mark device faulty and exit at once when setting badblocks fails in narrow_write_error(). No need to continue processing remaining sections. With this change, narrow_write_error() no longer needs to return a value, so adjust its return type to void. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-5-linan666@huaweicloud.com Signed-off-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 24 ++++++++++++------------ drivers/md/raid10.c | 22 ++++++++++++---------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c65f1bb97aa1..ac8eff3dfb85 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2486,7 +2486,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) } } -static bool narrow_write_error(struct r1bio *r1_bio, int i) +static void narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -2507,7 +2507,6 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i) sector_t sector; int sectors; int sect_to_write = r1_bio->sectors; - bool ok = true; if (rdev->badblocks.shift < 0) block_sectors = lbs; @@ -2541,18 +2540,22 @@ static bool narrow_write_error(struct 
r1bio *r1_bio, int i) bio_trim(wbio, sector - r1_bio->sector, sectors); wbio->bi_iter.bi_sector += rdev->data_offset; - if (submit_bio_wait(wbio) < 0) - /* failure! */ - ok = rdev_set_badblocks(rdev, sector, - sectors, 0) - && ok; + if (submit_bio_wait(wbio) && + !rdev_set_badblocks(rdev, sector, sectors, 0)) { + /* + * Badblocks set failed, disk marked Faulty. + * No further operations needed. + */ + md_error(mddev, rdev); + bio_put(wbio); + break; + } bio_put(wbio); sect_to_write -= sectors; sector += sectors; sectors = block_sectors; } - return ok; } static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) @@ -2596,10 +2599,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * errors. */ fail = true; - if (!narrow_write_error(r1_bio, m)) - md_error(conf->mddev, - conf->mirrors[m].rdev); - /* an I/O failed, we can't clear the bitmap */ + narrow_write_error(r1_bio, m); rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index deac7741c490..233981dac83c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2773,7 +2773,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static bool narrow_write_error(struct r10bio *r10_bio, int i) +static void narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; struct mddev *mddev = r10_bio->mddev; @@ -2794,7 +2794,6 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i) sector_t sector; int sectors; int sect_to_write = r10_bio->sectors; - bool ok = true; if (rdev->badblocks.shift < 0) block_sectors = lbs; @@ -2820,18 +2819,22 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i) choose_data_offset(r10_bio, rdev); wbio->bi_opf = REQ_OP_WRITE; - if (submit_bio_wait(wbio) < 0) - /* Failure! 
*/ - ok = rdev_set_badblocks(rdev, wsector, - sectors, 0) - && ok; + if (submit_bio_wait(wbio) && + !rdev_set_badblocks(rdev, wsector, sectors, 0)) { + /* + * Badblocks set failed, disk marked Faulty. + * No further operations needed. + */ + md_error(mddev, rdev); + bio_put(wbio); + break; + } bio_put(wbio); sect_to_write -= sectors; sector += sectors; sectors = block_sectors; } - return ok; } static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) @@ -2936,8 +2939,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && bio->bi_status) { fail = true; - if (!narrow_write_error(r10_bio, m)) - md_error(conf->mddev, rdev); + narrow_write_error(r10_bio, m); rdev_dec_pending(rdev, conf->mddev); } bio = r10_bio->devs[m].repl_bio; From fd4d44c14ff6a0e815eefd5d87bbba2b2668b18f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:53 +0800 Subject: [PATCH 094/162] md: mark rdev Faulty when badblocks setting fails Currently when sync read fails and badblocks set fails (exceeding 512 limit), rdev isn't immediately marked Faulty. Instead 'recovery_disabled' is set and non-In_sync rdevs are removed later. This preserves array availability if bad regions aren't read, but bad sectors might be read by users before rdev removal. This occurs due to incorrect resync/recovery_offset updates that include these bad sectors. When badblocks exceed 512, keeping the disk provides little benefit while adding complexity. Prompt disk replacement is more important. Therefore when badblocks set fails, directly call md_error to mark rdev Faulty immediately, preventing potential data access issues. After this change, cleanup of offset update logic and 'recovery_disabled' handling will follow. 
Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-6-linan666@huaweicloud.com Fixes: 5e5702898e93 ("md/raid10: Handle read errors during recovery better.") Fixes: 3a9f28a5117e ("md/raid1: improve handling of read failure during recovery.") Signed-off-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/md.c | 8 +++++++- drivers/md/raid1.c | 16 +++++----------- drivers/md/raid10.c | 31 +++++++++++-------------------- drivers/md/raid5.c | 22 +++++++++------------- 4 files changed, 32 insertions(+), 45 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 97b023536afc..f1debba1b026 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -10422,8 +10422,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, else s += rdev->data_offset; - if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) + if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) { + /* + * Mark the disk as Faulty when setting badblocks fails, + * otherwise, bad sectors may be read. + */ + md_error(mddev, rdev); return false; + } /* Make sure they get written out promptly */ if (test_bit(ExternalBbl, &rdev->flags)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ac8eff3dfb85..a841c5784e24 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2115,8 +2115,7 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, rdev->mddev->recovery); } /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); + rdev_set_badblocks(rdev, sector, sectors, 0); return 0; } @@ -2441,8 +2440,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) if (!success) { /* Cannot read from anywhere - mark it bad */ struct md_rdev *rdev = conf->mirrors[read_disk].rdev; - if (!rdev_set_badblocks(rdev, sect, s, 0)) - md_error(mddev, rdev); + rdev_set_badblocks(rdev, sect, s, 0); break; } /* write it back and re-read */ @@ -2546,7 +2544,6 @@ static void 
narrow_write_error(struct r1bio *r1_bio, int i) * Badblocks set failed, disk marked Faulty. * No further operations needed. */ - md_error(mddev, rdev); bio_put(wbio); break; } @@ -2568,14 +2565,11 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio if (bio->bi_end_io == NULL) continue; if (!bio->bi_status && - test_bit(R1BIO_MadeGood, &r1_bio->state)) { + test_bit(R1BIO_MadeGood, &r1_bio->state)) rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); - } if (bio->bi_status && - test_bit(R1BIO_WriteError, &r1_bio->state)) { - if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) - md_error(conf->mddev, rdev); - } + test_bit(R1BIO_WriteError, &r1_bio->state)) + rdev_set_badblocks(rdev, r1_bio->sector, s, 0); } put_buf(r1_bio); md_done_sync(conf->mddev, s); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 233981dac83c..7fe363729a5a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2604,8 +2604,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, &rdev->mddev->recovery); } /* need to record an error - either for the block or the device */ - if (!rdev_set_badblocks(rdev, sector, sectors, 0)) - md_error(rdev->mddev, rdev); + rdev_set_badblocks(rdev, sector, sectors, 0); return 0; } @@ -2686,7 +2685,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 r10_bio->devs[slot].addr + sect, s, 0)) { - md_error(mddev, rdev); r10_bio->devs[slot].bio = IO_BLOCKED; } @@ -2825,7 +2823,6 @@ static void narrow_write_error(struct r10bio *r10_bio, int i) * Badblocks set failed, disk marked Faulty. * No further operations needed. 
*/ - md_error(mddev, rdev); bio_put(wbio); break; } @@ -2894,35 +2891,29 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (r10_bio->devs[m].bio == NULL || r10_bio->devs[m].bio->bi_end_io == NULL) continue; - if (!r10_bio->devs[m].bio->bi_status) { + if (!r10_bio->devs[m].bio->bi_status) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors, 0); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } + else + rdev_set_badblocks(rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); rdev = conf->mirrors[dev].replacement; if (r10_bio->devs[m].repl_bio == NULL || r10_bio->devs[m].repl_bio->bi_end_io == NULL) continue; - if (!r10_bio->devs[m].repl_bio->bi_status) { + if (!r10_bio->devs[m].repl_bio->bi_status) rdev_clear_badblocks( rdev, r10_bio->devs[m].addr, r10_bio->sectors, 0); - } else { - if (!rdev_set_badblocks( - rdev, - r10_bio->devs[m].addr, - r10_bio->sectors, 0)) - md_error(conf->mddev, rdev); - } + else + rdev_set_badblocks(rdev, + r10_bio->devs[m].addr, + r10_bio->sectors, 0); } put_buf(r10_bio); } else { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index bba3cab2cc16..e72e808cbdd6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2821,11 +2821,9 @@ static void raid5_end_read_request(struct bio * bi) else { clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); - if (!(set_bad - && test_bit(In_sync, &rdev->flags) - && rdev_set_badblocks( - rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) - md_error(conf->mddev, rdev); + if (!(set_bad && test_bit(In_sync, &rdev->flags))) + rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); } } rdev_dec_pending(rdev, conf->mddev); @@ -3603,11 +3601,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, else rdev = NULL; if (rdev) { - if (!rdev_set_badblocks( - rdev, - sh->sector, - RAID5_STRIPE_SECTORS(conf), 0)) - 
md_error(conf->mddev, rdev); + rdev_set_badblocks(rdev, + sh->sector, + RAID5_STRIPE_SECTORS(conf), + 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5259,9 +5256,8 @@ finish: if (test_and_clear_bit(R5_WriteError, &dev->flags)) { /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; - if (!rdev_set_badblocks(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), 0)) - md_error(conf->mddev, rdev); + rdev_set_badblocks(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { From cc0dab317acb871e11cd83225c90888397043fe8 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:54 +0800 Subject: [PATCH 095/162] md: update curr_resync_completed even when MD_RECOVERY_INTR is set An error sync IO may be done and sub 'recovery_active' while its error handling work is pending. This work sets 'recovery_disabled' and MD_RECOVERY_INTR, then later removes the bad disk without Faulty flag. If 'curr_resync_completed' is updated before the disk is removed, it could lead to reading from sync-failed regions. With the previous patch, error IO will set badblocks or mark rdev as Faulty, sync-failed regions are no longer readable. After waiting for 'recovery_active' to reach 0 (in the previous line), all sync IO has *completed*, regardless of whether MD_RECOVERY_INTR is set. Thus, the MD_RECOVERY_INTR check can be removed. 
Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-7-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f1debba1b026..96f7eadbf959 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9740,8 +9740,8 @@ update: wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && mddev->curr_resync >= MD_RESYNC_ACTIVE) { + /* All sync IO completes after recovery_active becomes 0 */ mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } From af9c40ff5aed8149572ae920f520df88356b7950 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:55 +0800 Subject: [PATCH 096/162] md: remove MD_RECOVERY_ERROR handling and simplify resync_offset update Following previous patch "md: update curr_resync_completed even when MD_RECOVERY_INTR is set", 'curr_resync_completed' always equals 'curr_resync' for resync, so MD_RECOVERY_ERROR can be removed. Also, simplify resync_offset update logic. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-8-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 21 ++++----------------- drivers/md/md.h | 2 -- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 96f7eadbf959..3ef845af552b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9086,7 +9086,6 @@ void md_sync_error(struct mddev *mddev) { // stop recovery, signal do_sync .... 
set_bit(MD_RECOVERY_INTR, &mddev->recovery); - set_bit(MD_RECOVERY_ERROR, &mddev->recovery); md_wakeup_thread(mddev->thread); } EXPORT_SYMBOL(md_sync_error); @@ -9749,24 +9748,12 @@ update: if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > MD_RESYNC_ACTIVE) { + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->resync_offset) { - pr_debug("md: checkpointing %s of %s.\n", - desc, mdname(mddev)); - if (test_bit(MD_RECOVERY_ERROR, - &mddev->recovery)) - mddev->resync_offset = - mddev->curr_resync_completed; - else - mddev->resync_offset = - mddev->curr_resync; - } - } else - mddev->resync_offset = MaxSector; + mddev->resync_offset = mddev->curr_resync; } else { - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync = MaxSector; if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { rcu_read_lock(); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4c3dc7a6e399..cda003f24e1e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -652,8 +652,6 @@ enum recovery_flags { MD_RECOVERY_FROZEN, /* waiting for pers->start() to finish */ MD_RECOVERY_WAIT, - /* interrupted because io-error */ - MD_RECOVERY_ERROR, /* flags determines sync action, see details in enum sync_action */ From 6dd3aa08e83beeadf19f9d0f4110e5cd802a65d4 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:56 +0800 Subject: [PATCH 097/162] md: factor out sync completion update into helper Repeatedly reading 'mddev->recovery' flags in md_do_sync() may introduce potential risk if this flag is modified during sync, leading to incorrect offset updates. Therefore, replace direct 'mddev->recovery' checks with 'action'. Move sync completion update logic into helper md_finish_sync(), which improves readability and maintainability. 
The reshape completion update remains safe as it only updated after successful reshape when MD_RECOVERY_INTR is not set and 'curr_resync' equals 'max_sectors'. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-9-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 82 ++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ef845af552b..7b4fe7379bb6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9438,6 +9438,51 @@ static bool sync_io_within_limit(struct mddev *mddev) (raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev); } +/* + * Update sync offset and mddev status when sync completes + */ +static void md_finish_sync(struct mddev *mddev, enum sync_action action) +{ + struct md_rdev *rdev; + + switch (action) { + case ACTION_RESYNC: + case ACTION_REPAIR: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + mddev->resync_offset = mddev->curr_resync; + break; + case ACTION_RECOVER: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) + mddev->curr_resync = MaxSector; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (mddev->delta_disks >= 0 && + rdev_needs_recovery(rdev, mddev->curr_resync)) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + break; + case ACTION_RESHAPE: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->delta_disks > 0 && + mddev->pers->finish_reshape && + mddev->pers->size && + !mddev_is_dm(mddev)) { + mddev_lock_nointr(mddev); + md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); + mddev_unlock(mddev); + if (!mddev_is_clustered(mddev)) + set_capacity_and_notify(mddev->gendisk, + mddev->array_sectors); + } + break; + /* */ + case ACTION_CHECK: + default: + break; + } +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -9453,7 +9498,6 @@ void 
md_do_sync(struct md_thread *thread) int last_mark,m; sector_t last_check; int skipped = 0; - struct md_rdev *rdev; enum sync_action action; const char *desc; struct blk_plug plug; @@ -9746,46 +9790,14 @@ update: } mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped); - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > MD_RESYNC_ACTIVE) { - if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - mddev->curr_resync = MaxSector; - - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - mddev->resync_offset = mddev->curr_resync; - } else { - if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (mddev->delta_disks >= 0 && - rdev_needs_recovery(rdev, mddev->curr_resync)) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); - } - } - } + if (mddev->curr_resync > MD_RESYNC_ACTIVE) + md_finish_sync(mddev, action); skip: /* set CHANGE_PENDING here since maybe another update is needed, * so other nodes are informed. It should be harmless for normal * raid */ set_mask_bits(&mddev->sb_flags, 0, BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->delta_disks > 0 && - mddev->pers->finish_reshape && - mddev->pers->size && - !mddev_is_dm(mddev)) { - mddev_lock_nointr(mddev); - md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); - mddev_unlock(mddev); - if (!mddev_is_clustered(mddev)) - set_capacity_and_notify(mddev->gendisk, - mddev->array_sectors); - } - spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. 
*/ From 8ff59a72478d6e9f9668e153dbdbdfad7928b123 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:57 +0800 Subject: [PATCH 098/162] md: move finish_reshape to md_finish_sync() finish_reshape implementations of raid10 and raid5 only update mddev and rdev configurations. Move these operations to md_finish_sync() as it is more appropriate. No functional changes. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-10-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/md.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7b4fe7379bb6..d7f94b50694d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9475,6 +9475,8 @@ static void md_finish_sync(struct mddev *mddev, enum sync_action action) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); } + if (mddev->pers->finish_reshape) + mddev->pers->finish_reshape(mddev); break; /* */ case ACTION_CHECK: @@ -10312,7 +10314,7 @@ void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; sector_t old_dev_sectors = mddev->dev_sectors; - bool is_reshaped = false; + bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); /* resync has finished, collect result */ md_unregister_thread(mddev, &mddev->sync_thread); @@ -10328,12 +10330,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); } } - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) { - mddev->pers->finish_reshape(mddev); - if (mddev_is_clustered(mddev)) - is_reshaped = true; - } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. @@ -10360,8 +10356,9 @@ void md_reap_sync_thread(struct mddev *mddev) * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, * so it is time to update size across cluster. 
*/ - if (mddev_is_clustered(mddev) && is_reshaped - && !test_bit(MD_CLOSING, &mddev->flags)) + if (mddev_is_clustered(mddev) && is_reshaped && + mddev->pers->finish_reshape && + !test_bit(MD_CLOSING, &mddev->flags)) mddev->cluster_ops->update_size(mddev, old_dev_sectors); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); From 99582edb3f62e8ee6c34512021368f53f9b091f2 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:58 +0800 Subject: [PATCH 099/162] md/raid10: fix any_working flag handling in raid10_sync_request In raid10_sync_request(), 'any_working' indicates if any IO will be submitted. When there's only one In_sync disk with badblocks, 'any_working' might be set to 1 but no IO is submitted. Fix it by setting 'any_working' after badblock checks. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-11-linan666@huaweicloud.com Fixes: e875ecea266a ("md/raid10 record bad blocks as needed during recovery.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7fe363729a5a..b258ed8b4e3a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3395,7 +3395,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, !test_bit(In_sync, &rdev->flags)) continue; /* This is where we read from */ - any_working = 1; sector = r10_bio->devs[j].addr; if (is_badblock(rdev, sector, max_sync, @@ -3410,6 +3409,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, continue; } } + any_working = 1; bio = r10_bio->devs[0].bio; bio->bi_next = biolist; biolist = bio; From 7435b73f05fbb40c07b087fefd3d40bfd759519c Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:02:59 +0800 Subject: [PATCH 100/162] md/raid10: cleanup skip handling in raid10_sync_request Skip a sector in raid10_sync_request() when it 
needs no syncing or no readable device exists. Current skip handling is unnecessary: - Use 'skip' label to reissue the next sector instead of return directly - Complete sync and return 'max_sectors' when multiple sectors are skipped due to badblocks The first is error-prone. For example, commit bc49694a9e8f ("md: pass in max_sectors for pers->sync_request()") removed redundant max_sector assignments. Since skip modifies max_sectors, `goto skip` leaves max_sectors equal to sector_nr after the jump, which is incorrect. The second causes sync to complete erroneously when no actual sync occurs. For recovery, recording badblocks and continuing to sync subsequent sectors is more suitable. For resync, just skip bad sectors and sync subsequent sectors. Clean up complex and unnecessary skip code. Return immediately when a sector should be skipped. Reduce code paths and lower regression risk. Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-12-linan666@huaweicloud.com Fixes: bc49694a9e8f ("md: pass in max_sectors for pers->sync_request()") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Yu Kuai --- drivers/md/raid10.c | 96 +++++++++++---------------------------------- 1 file changed, 22 insertions(+), 74 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b258ed8b4e3a..6f5a4aefb4e5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3161,11 +3161,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int i; int max_sync; sector_t sync_blocks; - sector_t sectors_skipped = 0; - int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; - int error_disk = -1; /* * Allow skipping a full rebuild for incremental assembly @@ -3186,7 +3183,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (init_resync(conf)) return 0; - skipped: if (sector_nr >= max_sector) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; @@ -3238,33 +3234,12
@@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev->bitmap_ops->close_sync(mddev); close_sync(conf); *skipped = 1; - return sectors_skipped; + return 0; } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); - if (chunks_skipped >= conf->geo.raid_disks) { - pr_err("md/raid10:%s: %s fails\n", mdname(mddev), - test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); - if (error_disk >= 0 && - !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - /* - * recovery fails, set mirrors.recovery_disabled, - * device shouldn't be added to there. - */ - conf->mirrors[error_disk].recovery_disabled = - mddev->recovery_disabled; - return 0; - } - /* - * if there has been nothing to do on any drive, - * then there is nothing to do at all. - */ - *skipped = 1; - return (max_sector - sector_nr) + sectors_skipped; - } - if (max_sector > mddev->resync_max) max_sector = mddev->resync_max; /* Don't do IO beyond here */ @@ -3347,7 +3322,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* yep, skip the sync_blocks here, but don't assume * that there will never be anything to do here */ - chunks_skipped = -1; continue; } if (mrdev) @@ -3478,29 +3452,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (mrdev && !test_bit(In_sync, - &mrdev->flags) - && !rdev_set_badblocks( - mrdev, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - if (mreplace && - !rdev_set_badblocks( - mreplace, - r10_bio->devs[k].addr, - max_sync, 0)) - any_working = 0; - } - if (!any_working) { - if (!test_and_set_bit(MD_RECOVERY_INTR, - &mddev->recovery)) - pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", - mdname(mddev)); - mirror->recovery_disabled - = mddev->recovery_disabled; - } else { - error_disk = i; + if (mrdev && + !test_bit(In_sync, 
&mrdev->flags)) + rdev_set_badblocks( + mrdev, + r10_bio->devs[k].addr, + max_sync, 0); + if (mreplace) + rdev_set_badblocks( + mreplace, + r10_bio->devs[k].addr, + max_sync, 0); + pr_warn("md/raid10:%s: cannot recovery sector %llu + %d.\n", + mdname(mddev), r10_bio->devs[k].addr, max_sync); } put_buf(r10_bio); if (rb2) @@ -3541,7 +3505,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rb2->master_bio = NULL; put_buf(rb2); } - goto giveup; + *skipped = 1; + return max_sync; } } else { /* resync. Schedule a read for every block at this virt offset */ @@ -3565,7 +3530,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, &mddev->recovery)) { /* We can skip this block */ *skipped = 1; - return sync_blocks + sectors_skipped; + return sync_blocks; } if (sync_blocks < max_sync) max_sync = sync_blocks; @@ -3657,8 +3622,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mddev); } put_buf(r10_bio); - biolist = NULL; - goto giveup; + *skipped = 1; + return max_sync; } } @@ -3678,7 +3643,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (WARN_ON(!bio_add_page(bio, page, len, 0))) { bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); - goto giveup; + *skipped = 1; + return max_sync; } } nr_sectors += len>>9; @@ -3746,25 +3712,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } } - if (sectors_skipped) - /* pretend they weren't skipped, it makes - * no important difference in this case - */ - md_done_sync(mddev, sectors_skipped); - - return sectors_skipped + nr_sectors; - giveup: - /* There is nowhere to write, so all non-sync - * drives must be failed or in resync, all drives - * have a bad block, so try the next chunk... 
- */ - if (sector_nr + max_sync < max_sector) - max_sector = sector_nr + max_sync; - - sectors_skipped += (max_sector - sector_nr); - chunks_skipped ++; - sector_nr = max_sector; - goto skipped; + return nr_sectors; } static sector_t From 5d1dd57929be2158fb5a8bc74817cc08b10b0118 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 5 Jan 2026 19:03:00 +0800 Subject: [PATCH 101/162] md: remove recovery_disabled 'recovery_disabled' logic is complex and confusing, originally intended to preserve raid in extreme scenarios. It was used in following cases: - When sync fails and setting badblocks also fails, kick out non-In_sync rdev and block spare rdev from joining to preserve raid [1] - When last backup is unavailable, prevent repeated add-remove of spares triggering recovery [2] The original issues are now resolved: - Error handlers in all raid types prevent last rdev from being kicked out - Disks with failed recovery are marked Faulty and can't re-join Therefore, remove 'recovery_disabled' as it's no longer needed. 
[1] 5389042ffa36 ("md: change managed of recovery_disabled.") [2] 4044ba58dd15 ("md: don't retry recovery of raid1 that fails due to error on source drive.") Link: https://lore.kernel.org/linux-raid/20260105110300.1442509-13-linan666@huaweicloud.com Signed-off-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/md.c | 3 --- drivers/md/md.h | 6 ------ drivers/md/raid1.c | 17 +++-------------- drivers/md/raid1.h | 5 ----- drivers/md/raid10.c | 8 -------- drivers/md/raid10.h | 5 ----- drivers/md/raid5.c | 10 +--------- drivers/md/raid5.h | 1 - 8 files changed, 4 insertions(+), 51 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index d7f94b50694d..606f616190d7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2618,9 +2618,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) list_add_rcu(&rdev->same_set, &mddev->disks); bd_link_disk_holder(rdev->bdev, mddev->gendisk); - /* May as well allow recovery to be retried once */ - mddev->recovery_disabled++; - return 0; fail: diff --git a/drivers/md/md.h b/drivers/md/md.h index cda003f24e1e..e6d3d88698ed 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -505,12 +505,6 @@ struct mddev { int ok_start_degraded; unsigned long recovery; - /* If a RAID personality determines that recovery (of a particular - * device) will fail due to a read error on the source device, it - * takes a copy of this number and does not attempt recovery again - * until this number changes. 
- */ - int recovery_disabled; int in_sync; /* know to not need resync */ /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a841c5784e24..79faec11b79e 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1760,7 +1760,6 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) set_bit(MD_BROKEN, &mddev->flags); if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) { - conf->recovery_disabled = mddev->recovery_disabled; spin_unlock_irqrestore(&conf->device_lock, flags); return; } @@ -1904,7 +1903,6 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk) /* Only remove non-faulty devices if recovery is not possible. */ if (!test_bit(Faulty, &rdev->flags) && - rdev->mddev->recovery_disabled != conf->recovery_disabled && rdev->mddev->degraded < conf->raid_disks) return false; @@ -1924,9 +1922,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; - if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; @@ -2346,7 +2341,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) || !fix_sync_read_error(r1_bio)) { - conf->recovery_disabled = mddev->recovery_disabled; md_done_sync(mddev, r1_bio->sectors); md_sync_error(mddev); put_buf(r1_bio); @@ -2948,16 +2942,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, *skipped = 1; put_buf(r1_bio); - if (!ok) { - /* Cannot record the badblocks, so need to + if (!ok) + /* Cannot record the badblocks, md_error has set INTR, * abort the resync. - * If there are multiple read targets, could just - * fail the really bad ones ??? 
*/ - conf->recovery_disabled = mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &mddev->recovery); return 0; - } else + else return min_bad; } @@ -3144,7 +3134,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) init_waitqueue_head(&conf->wait_barrier); bio_list_init(&conf->pending_bio_list); - conf->recovery_disabled = mddev->recovery_disabled - 1; err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 2ebe35aaa534..c98d43a7ae99 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -93,11 +93,6 @@ struct r1conf { */ int fullsync; - /* When the same as mddev->recovery_disabled we don't allow - * recovery to be attempted as we expect a read error. - */ - int recovery_disabled; - mempool_t *r1bio_pool; mempool_t r1buf_pool; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6f5a4aefb4e5..9debb20cf129 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2130,8 +2130,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) mirror = first; for ( ; mirror <= last ; mirror++) { p = &conf->mirrors[mirror]; - if (p->recovery_disabled == mddev->recovery_disabled) - continue; if (p->rdev) { if (test_bit(WantReplacement, &p->rdev->flags) && p->replacement == NULL && repl_slot < 0) @@ -2143,7 +2141,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err) return err; p->head_position = 0; - p->recovery_disabled = mddev->recovery_disabled - 1; rdev->raid_disk = mirror; err = 0; if (rdev->saved_raid_disk != mirror) @@ -2196,7 +2193,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * is not possible. 
*/ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != p->recovery_disabled && (!p->replacement || p->replacement == rdev) && number < conf->geo.raid_disks && enough(conf, -1)) { @@ -2535,8 +2531,6 @@ static void fix_recovery_read_error(struct r10bio *r10_bio) pr_notice("md/raid10:%s: recovery aborted due to read error\n", mdname(mddev)); - conf->mirrors[dw].recovery_disabled - = mddev->recovery_disabled; set_bit(MD_RECOVERY_INTR, &mddev->recovery); break; @@ -4075,8 +4069,6 @@ static int raid10_run(struct mddev *mddev) disk->replacement->saved_raid_disk < 0) { conf->fullsync = 1; } - - disk->recovery_disabled = mddev->recovery_disabled - 1; } if (mddev->resync_offset != MaxSector) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index da00a55f7a55..ec79d87fb92f 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -18,11 +18,6 @@ struct raid10_info { struct md_rdev *rdev, *replacement; sector_t head_position; - int recovery_disabled; /* matches - * mddev->recovery_disabled - * when we shouldn't try - * recovering this device. 
- */ }; struct r10conf { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e72e808cbdd6..6d408aaaacf3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2922,7 +2922,6 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) if (has_failed(conf)) { set_bit(MD_BROKEN, &conf->mddev->flags); - conf->recovery_disabled = mddev->recovery_disabled; pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n", mdname(mddev), mddev->degraded, conf->raid_disks); @@ -3727,10 +3726,8 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, } md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf)); - if (abort) { - conf->recovery_disabled = conf->mddev->recovery_disabled; + if (abort) md_sync_error(conf->mddev); - } } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -7548,8 +7545,6 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->bypass_threshold = BYPASS_THRESHOLD; - conf->recovery_disabled = mddev->recovery_disabled - 1; - conf->raid_disks = mddev->raid_disks; if (mddev->reshape_position == MaxSector) conf->previous_raid_disks = mddev->raid_disks; @@ -8249,7 +8244,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * isn't possible. */ if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && !has_failed(conf) && (!p->replacement || p->replacement == rdev) && number < conf->raid_disks) { @@ -8310,8 +8304,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) return 0; } - if (mddev->recovery_disabled == conf->recovery_disabled) - return -EBUSY; if (rdev->saved_raid_disk < 0 && has_failed(conf)) /* no point adding a device */ diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ddfe65237888..110b1c2d0a86 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -640,7 +640,6 @@ struct r5conf { * (fresh device added). * Cleared when a sync completes. 
*/ - int recovery_disabled; /* per cpu variables */ struct raid5_percpu __percpu *percpu; int scribble_disks; From cd1635d844d26471c56c0a432abdee12fc9ad735 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 Jan 2026 02:26:22 +0800 Subject: [PATCH 102/162] md/raid5: fix IO hang with degraded array with llbitmap When llbitmap bit state is still unwritten, any new write should force rcw, as bitmap_ops->blocks_synced() is checked in handle_stripe_dirtying(). However, later the same check is missing in need_this_block(), causing stripe to deadloop during handling because handle_stripe() will decide to go to handle_stripe_fill(), meanwhile need_this_block() always return 0 and nothing is handled. Link: https://lore.kernel.org/linux-raid/20260123182623.3718551-2-yukuai@fnnas.com Fixes: 5ab829f1971d ("md/md-llbitmap: introduce new lockless bitmap") Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/raid5.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6d408aaaacf3..8854e024f311 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3751,9 +3751,14 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + struct mddev *mddev = sh->raid_conf->mddev; + bool force_rcw = false; int i; - bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); + if (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW || + (mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced && + !mddev->bitmap_ops->blocks_synced(mddev, sh->sector))) + force_rcw = true; if (test_bit(R5_LOCKED, &dev->flags) || test_bit(R5_UPTODATE, &dev->flags)) From d119bd2e1643cc023210ff3c6f0657e4f914e71d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 24 Jan 2026 02:26:23 +0800 Subject: [PATCH 103/162] md/md-llbitmap: fix percpu_ref not resurrected on suspend timeout When 
llbitmap_suspend_timeout() times out waiting for percpu_ref to become zero, it returns -ETIMEDOUT without resurrecting the percpu_ref. The caller (md_llbitmap_daemon_fn) then continues to the next page without calling llbitmap_resume(), leaving the percpu_ref in a killed state permanently. Fix this by resurrecting the percpu_ref before returning the error, ensuring the page control structure remains usable for subsequent operations. Link: https://lore.kernel.org/linux-raid/20260123182623.3718551-3-yukuai@fnnas.com Fixes: 5ab829f1971d ("md/md-llbitmap: introduce new lockless bitmap") Signed-off-by: Yu Kuai Reviewed-by: Li Nan --- drivers/md/md-llbitmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index 9c1ade19b774..cd713a7dc270 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -712,8 +712,10 @@ static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx) percpu_ref_kill(&pctl->active); if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active), - llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) + llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) { + percpu_ref_resurrect(&pctl->active); return -ETIMEDOUT; + } return 0; } From 46ef85f854dfa9d5226b3c1c46493d79556c9589 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Tue, 20 Jan 2026 11:24:56 +0100 Subject: [PATCH 104/162] md/bitmap: fix GPF in write_page caused by resize race A General Protection Fault occurs in write_page() during array resize: RIP: 0010:write_page+0x22b/0x3c0 [md_mod] This is a use-after-free race between bitmap_daemon_work() and __bitmap_resize(). The daemon iterates over `bitmap->storage.filemap` without locking, while the resize path frees that storage via md_bitmap_file_unmap(). `quiesce()` does not stop the md thread, allowing concurrent access to freed pages. Fix by holding `mddev->bitmap_info.mutex` during the bitmap update. 
Link: https://lore.kernel.org/linux-raid/20260120102456.25169-1-jinpu.wang@ionos.com Closes: https://lore.kernel.org/linux-raid/CAMGffE=Mbfp=7xD_hYxXk1PAaCZNSEAVeQGKGy7YF9f2S4=NEA@mail.gmail.com/T/#u Cc: stable@vger.kernel.org Fixes: d60b479d177a ("md/bitmap: add bitmap_resize function to allow bitmap resizing.") Signed-off-by: Jack Wang Signed-off-by: Yu Kuai --- drivers/md/md-bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index dbe4c4b9a1da..1d4a050dab3a 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2453,6 +2453,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + mutex_lock(&bitmap->mddev->bitmap_info.mutex); spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2560,7 +2561,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); } spin_unlock_irq(&bitmap->counts.lock); - + mutex_unlock(&bitmap->mddev->bitmap_info.mutex); if (!init) { __bitmap_unplug(bitmap); bitmap->mddev->pers->quiesce(bitmap->mddev, 0); From f150e753cb8dd756085f46e86f2c35ce472e0a3c Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Sat, 17 Jan 2026 14:59:03 +0000 Subject: [PATCH 105/162] md-cluster: fix NULL pointer dereference in process_metadata_update The function process_metadata_update() blindly dereferences the 'thread' pointer (acquired via rcu_dereference_protected) within the wait_event() macro. While the code comment states "daemon thread must exist", there is a valid race condition window during the MD array startup sequence (md_run): 1. bitmap_load() is called, which invokes md_cluster_ops->join(). 2. join() starts the "cluster_recv" thread (recv_daemon). 3. At this point, recv_daemon is active and processing messages. 4. 
However, mddev->thread (the main MD thread) is not initialized until later in md_run(). If a METADATA_UPDATED message is received from a remote node during this specific window, process_metadata_update() will be called while mddev->thread is still NULL, leading to a kernel panic. To fix this, we must validate the 'thread' pointer. If it is NULL, we release the held lock (no_new_dev_lockres) and return early, safely ignoring the update request as the array is not yet fully ready to process it. Link: https://lore.kernel.org/linux-raid/20260117145903.28921-1-jiashengjiangcool@gmail.com Signed-off-by: Jiasheng Jiang Signed-off-by: Yu Kuai --- drivers/md/md-cluster.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 11f1e91d387d..896279988dfd 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -549,8 +549,13 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - /* daemaon thread must exist */ thread = rcu_dereference_protected(mddev->thread, true); + if (!thread) { + pr_warn("md-cluster: Received metadata update but MD thread is not ready\n"); + dlm_unlock_sync(cinfo->no_new_dev_lockres); + return; + } + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); From cefcb9297fbdb6d94b61787b4f8d84f55b741470 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 14 Jan 2026 18:52:21 +0100 Subject: [PATCH 106/162] md raid: fix hang when stopping arrays with metadata through dm-raid When using device-mapper's dm-raid target, stopping a RAID array can cause the system to hang under specific conditions. 
This occurs when: - A dm-raid managed device tree is suspended from top to bottom (the top-level RAID device is suspended first, followed by its underlying metadata and data devices) - The top-level RAID device is then removed Removing the top-level device triggers a hang in the following sequence: the dm-raid destructor calls md_stop(), which tries to flush the write-intent bitmap by writing to the metadata sub-devices. However, these devices are already suspended, making them unable to complete the write-intent operations and causing an indefinite block. Fix: - Prevent bitmap flushing when md_stop() is called from dm-raid destructor context and avoid a quiescing/unquiescing cycle which could also cause I/O - Still allow write-intent bitmap flushing when called from dm-raid suspend context This ensures that RAID array teardown can complete successfully even when the underlying devices are in a suspended state. This second patch uses md_is_rdwr() to distinguish between suspend and destructor paths as elaborated on above. 
Link: https://lore.kernel.org/linux-raid/CAM23VxqYrwkhKEBeQrZeZwQudbiNey2_8B_SEOLqug=pXxaFrA@mail.gmail.com Signed-off-by: Heinz Mauelshagen Signed-off-by: Yu Kuai --- drivers/md/md.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 606f616190d7..59cd303548de 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6851,13 +6851,15 @@ static void __md_stop_writes(struct mddev *mddev) { timer_delete_sync(&mddev->safemode_timer); - if (mddev->pers && mddev->pers->quiesce) { - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); - } + if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) { + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } - if (md_bitmap_enabled(mddev, true)) - mddev->bitmap_ops->flush(mddev); + if (md_bitmap_enabled(mddev, true)) + mddev->bitmap_ops->flush(mddev); + } if (md_is_rdwr(mddev) && ((!mddev->in_sync && !mddev_is_clustered(mddev)) || From 72a41750f1a35b46caa5bbd70df7b5d3ce4f4b0a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 26 Jan 2026 08:27:24 -0800 Subject: [PATCH 107/162] block: remove bio_last_bvec_all There are no more callers of this function after commit f6b2d8b134b2413 ("btrfs: track the next file offset in struct btrfs_bio_ctrl"), so remove the function. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- Documentation/block/biovecs.rst | 1 - include/linux/bio.h | 6 ------ 2 files changed, 7 deletions(-) diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index b9dc0c9dbee4..11126ed6f40f 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -135,7 +135,6 @@ Usage of helpers: bio_first_bvec_all() bio_first_page_all() bio_first_folio_all() - bio_last_bvec_all() * The following helpers iterate over single-page segment. 
The passed 'struct bio_vec' will contain a single-page IO vector during the iteration:: diff --git a/include/linux/bio.h b/include/linux/bio.h index c75a9b3672aa..d32aee2857a9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -256,12 +256,6 @@ static inline struct folio *bio_first_folio_all(struct bio *bio) return page_folio(bio_first_page_all(bio)); } -static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) -{ - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); - return &bio->bi_io_vec[bio->bi_vcnt - 1]; -} - /** * struct folio_iter - State for iterating all folios in a bio. * @folio: The current folio we're iterating. NULL after the last folio. From b2b2ce870651db659247f34636e6243b4547e8af Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Fri, 23 Jan 2026 17:19:41 +0000 Subject: [PATCH 108/162] block: rnull: remove imports available via prelude These imports are already in scope by importing `kernel::prelude::*` and does not need to be imported separately. Signed-off-by: Gary Guo Acked-by: Andreas Hindborg Signed-off-by: Jens Axboe --- drivers/block/rnull/configfs.rs | 1 - drivers/block/rnull/rnull.rs | 3 --- 2 files changed, 4 deletions(-) diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs index 2f5a7da03af5..158f38bbbb8b 100644 --- a/drivers/block/rnull/configfs.rs +++ b/drivers/block/rnull/configfs.rs @@ -13,7 +13,6 @@ use kernel::{ str::{kstrtobool_bytes, CString}, sync::Mutex, }; -use pin_init::PinInit; pub(crate) fn subsystem() -> impl PinInit, Error> { let item_type = configfs_attrs! { diff --git a/drivers/block/rnull/rnull.rs b/drivers/block/rnull/rnull.rs index a9d5e575a2c4..0ca8715febe8 100644 --- a/drivers/block/rnull/rnull.rs +++ b/drivers/block/rnull/rnull.rs @@ -14,12 +14,9 @@ use kernel::{ Operations, TagSet, }, }, - error::Result, - pr_info, prelude::*, sync::{aref::ARef, Arc}, }; -use pin_init::PinInit; module! 
{ type: NullBlkModule, From 7c746eb71fc3737340c32f44c31b111f74f5632c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 12 Jan 2026 15:19:28 -0800 Subject: [PATCH 109/162] rnbd-clt: fix refcount underflow in device unmap path During device unmapping (triggered by module unload or explicit unmap), a refcount underflow occurs causing a use-after-free warning: [14747.574913] ------------[ cut here ]------------ [14747.574916] refcount_t: underflow; use-after-free. [14747.574917] WARNING: lib/refcount.c:28 at refcount_warn_saturate+0x55/0x90, CPU#9: kworker/9:1/378 [14747.574924] Modules linked in: rnbd_client(-) rtrs_client rnbd_server rtrs_server rtrs_core ... [14747.574998] CPU: 9 UID: 0 PID: 378 Comm: kworker/9:1 Tainted: G O N 6.19.0-rc3lblk-fnext+ #42 PREEMPT(voluntary) [14747.575005] Workqueue: rnbd_clt_wq unmap_device_work [rnbd_client] [14747.575010] RIP: 0010:refcount_warn_saturate+0x55/0x90 [14747.575037] Call Trace: [14747.575038] [14747.575038] rnbd_clt_unmap_device+0x170/0x1d0 [rnbd_client] [14747.575044] process_one_work+0x211/0x600 [14747.575052] worker_thread+0x184/0x330 [14747.575055] ? __pfx_worker_thread+0x10/0x10 [14747.575058] kthread+0x10d/0x250 [14747.575062] ? __pfx_kthread+0x10/0x10 [14747.575066] ret_from_fork+0x319/0x390 [14747.575069] ? __pfx_kthread+0x10/0x10 [14747.575072] ret_from_fork_asm+0x1a/0x30 [14747.575083] [14747.575096] ---[ end trace 0000000000000000 ]--- Befor this patch :- The bug is a double kobject_put() on dev->kobj during device cleanup. Kobject Lifecycle: kobject_init_and_add() sets kobj.kref = 1 (initialization) kobject_put() sets kobj.kref = 0 (should be called once) * Before this patch: rnbd_clt_unmap_device() rnbd_destroy_sysfs() kobject_del(&dev->kobj) [remove from sysfs] kobject_put(&dev->kobj) PUT #1 (WRONG!) kref: 1 to 0 rnbd_dev_release() kfree(dev) [DEVICE FREED!] rnbd_destroy_gen_disk() [use-after-free!] 
rnbd_clt_put_dev() refcount_dec_and_test(&dev->refcount) kobject_put(&dev->kobj) PUT #2 (UNDERFLOW!) kref: 0 to -1 [WARNING!] The first kobject_put() in rnbd_destroy_sysfs() prematurely frees the device via rnbd_dev_release(), then the second kobject_put() in rnbd_clt_put_dev() causes refcount underflow. * After this patch :- Remove kobject_put() from rnbd_destroy_sysfs(). This function should only remove sysfs visibility (kobject_del), not manage object lifetime. Call Graph (FIXED): rnbd_clt_unmap_device() rnbd_destroy_sysfs() kobject_del(&dev->kobj) [remove from sysfs only] [kref unchanged: 1] rnbd_destroy_gen_disk() [device still valid] rnbd_clt_put_dev() refcount_dec_and_test(&dev->refcount) kobject_put(&dev->kobj) ONLY PUT (CORRECT!) kref: 1 to 0 [BALANCED] rnbd_dev_release() kfree(dev) [CLEAN DESTRUCTION] This follows the kernel pattern where sysfs removal (kobject_del) is separate from object destruction (kobject_put). Fixes: 581cf833cac4 ("block: rnbd: add .release to rnbd_dev_ktype") Signed-off-by: Chaitanya Kulkarni Acked-by: Jack Wang Reviewed-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 094ecc174f41..757df2896aeb 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1664,7 +1664,6 @@ static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev, /* To avoid deadlock firstly remove itself */ sysfs_remove_file_self(&dev->kobj, sysfs_self); kobject_del(&dev->kobj); - kobject_put(&dev->kobj); } } From f46ebb910989a1db244f95bd1f937907591aa2ee Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 27 Jan 2026 23:47:01 +0100 Subject: [PATCH 110/162] block: Replace snprintf with strscpy in check_partition Replace snprintf("%s", ...) with the faster and more direct strscpy(). 
Signed-off-by: Thorsten Blum Signed-off-by: Jens Axboe --- block/partitions/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 815ed33caa1b..079057ab535a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state->pp_buf[0] = '\0'; state->disk = hd; - snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); + strscpy(state->name, hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); From 0921abdcbd1cbd6605ea425e85758bd4a19b9b32 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 24 Jan 2026 12:27:16 +0800 Subject: [PATCH 111/162] ublk: document IO reference counting design Add comprehensive documentation for ublk's split reference counting model (io->ref + io->task_registered_buffers) above ublk_init_req_ref() given this model isn't very straightforward. 
Signed-off-by: Ming Lei Reviewed-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 89 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 7981decd1cee..8a5a6ba29a1d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -985,6 +985,95 @@ static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub) ublk_dev_support_auto_buf_reg(ub); } +/* + * ublk IO Reference Counting Design + * ================================== + * + * For user-copy and zero-copy modes, ublk uses a split reference model with + * two counters that together track IO lifetime: + * + * - io->ref: refcount for off-task buffer registrations and user-copy ops + * - io->task_registered_buffers: count of buffers registered on the IO task + * + * Key Invariant: + * -------------- + * When IO is dispatched to the ublk server (UBLK_IO_FLAG_OWNED_BY_SRV set), + * the sum (io->ref + io->task_registered_buffers) must equal UBLK_REFCOUNT_INIT + * when no active references exist. After IO completion, both counters become + * zero. For I/Os not currently dispatched to the ublk server, both ref and + * task_registered_buffers are 0. + * + * This invariant is checked by ublk_check_and_reset_active_ref() during daemon + * exit to determine if all references have been released. + * + * Why Split Counters: + * ------------------- + * Buffers registered on the IO daemon task can use the lightweight + * task_registered_buffers counter (simple increment/decrement) instead of + * atomic refcount operations. The ublk_io_release() callback checks if + * current == io->task to decide which counter to update. + * + * This optimization only applies before IO completion. At completion, + * ublk_sub_req_ref() collapses task_registered_buffers into the atomic ref. 
+ * After that, all subsequent buffer unregistrations must use the atomic ref + * since they may be releasing the last reference. + * + * Reference Lifecycle: + * -------------------- + * 1. ublk_init_req_ref(): Sets io->ref = UBLK_REFCOUNT_INIT at IO dispatch + * + * 2. During IO processing: + * - On-task buffer reg: task_registered_buffers++ (no ref change) + * - Off-task buffer reg: ref++ via ublk_get_req_ref() + * - Buffer unregister callback (ublk_io_release): + * * If on-task: task_registered_buffers-- + * * If off-task: ref-- via ublk_put_req_ref() + * + * 3. ublk_sub_req_ref() at IO completion: + * - Computes: sub_refs = UBLK_REFCOUNT_INIT - task_registered_buffers + * - Subtracts sub_refs from ref and zeroes task_registered_buffers + * - This effectively collapses task_registered_buffers into the atomic ref, + * accounting for the initial UBLK_REFCOUNT_INIT minus any on-task + * buffers that were already counted + * + * Example (zero-copy, register on-task, unregister off-task): + * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0 + * - Register buffer on-task: task_registered_buffers = 1 + * - Unregister off-task: ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1 + * - Completion via ublk_sub_req_ref(): + * sub_refs = UBLK_REFCOUNT_INIT - 1, + * ref = (UBLK_REFCOUNT_INIT - 1) - (UBLK_REFCOUNT_INIT - 1) = 0 + * + * Example (auto buffer registration): + * Auto buffer registration sets task_registered_buffers = 1 at dispatch. + * + * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 1 + * - Buffer unregister: task_registered_buffers-- (becomes 0) + * - Completion via ublk_sub_req_ref(): + * sub_refs = UBLK_REFCOUNT_INIT - 0, ref becomes 0 + * + * Example (zero-copy, ublk server killed): + * When daemon is killed, io_uring cleanup unregisters buffers off-task. + * ublk_check_and_reset_active_ref() waits for the invariant to hold. 
+ * + * - Dispatch: ref = UBLK_REFCOUNT_INIT, task_registered_buffers = 0 + * - Register buffer on-task: task_registered_buffers = 1 + * - Daemon killed, io_uring cleanup unregisters buffer (off-task): + * ref-- (UBLK_REFCOUNT_INIT - 1), task_registered_buffers stays 1 + * - Daemon exit check: sum = (UBLK_REFCOUNT_INIT - 1) + 1 = UBLK_REFCOUNT_INIT + * - Sum equals UBLK_REFCOUNT_INIT, then both two counters are zeroed by + * ublk_check_and_reset_active_ref(), so ublk_abort_queue() can proceed + * and abort pending requests + * + * Batch IO Special Case: + * ---------------------- + * In batch IO mode, io->task is NULL. This means ublk_io_release() always + * takes the off-task path (ublk_put_req_ref), decrementing io->ref. The + * task_registered_buffers counter still tracks registered buffers for the + * invariant check, even though the callback doesn't decrement it. + * + * Note: updating task_registered_buffers is protected by io->lock. + */ static inline void ublk_init_req_ref(const struct ublk_queue *ubq, struct ublk_io *io) { From 068f5b5ef5bf97e25568950f06ba32325bdc660b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 29 Jan 2026 16:27:14 +0900 Subject: [PATCH 112/162] block: cleanup queue limit features definition Unwrap the definition of BLK_FEAT_ATOMIC_WRITES and renumber this feature to be sequential with BLK_FEAT_SKIP_TAGSET_QUIESCE. Signed-off-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251e0f538c4c..4536211ff33c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t; /* skip this queue in blk_mq_(un)quiesce_tagset */ #define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13)) +/* atomic writes enabled */ +#define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14)) + /* undocumented magic for bcache */ #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) -/* atomic writes enabled */ -#define BLK_FEAT_ATOMIC_WRITES \ - ((__force blk_features_t)(1u << 16)) - /* * Flags automatically inherited when stacking limits. */ From 2719bd1ee1a1cd0535bc62e89b52822f2bbd14eb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 29 Jan 2026 16:27:15 +0900 Subject: [PATCH 113/162] block: introduce blk_queue_rot() To check if a request queue is for a rotational device, a double negation is needed with the pattern "!blk_queue_nonrot(q)". Simplify this with the introduction of the helper blk_queue_rot() which tests if a requests queue limit has the BLK_FEAT_ROTATIONAL feature set. All call sites of blk_queue_nonrot() are modified to use blk_queue_rot() and blk_queue_nonrot() definition removed. No functional changes. Signed-off-by: Damien Le Moal Reviewed-by: Nitesh Shetty Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 20 ++++++++++---------- block/blk-iocost.c | 2 +- block/blk-iolatency.c | 5 +---- block/blk-wbt.c | 5 ++--- include/linux/blkdev.h | 4 ++-- 5 files changed, 16 insertions(+), 20 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 6e54b1d3d8bc..3ebdec40e758 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -231,7 +231,7 @@ static struct kmem_cache *bfq_pool; #define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ (get_sdist(last_pos, rq) > \ BFQQ_SEEK_THR && \ - (!blk_queue_nonrot(bfqd->queue) || \ + (blk_queue_rot(bfqd->queue) || \ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) @@ -4165,7 +4165,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* don't use too short time intervals */ if (delta_usecs < 1000) { - if (blk_queue_nonrot(bfqd->queue)) + if (!blk_queue_rot(bfqd->queue)) /* * give same worst-case guarantees as idling * for seeky @@ -4487,7 +4487,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq) { bool rot_without_queueing = - !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + blk_queue_rot(bfqd->queue) && !bfqd->hw_tag, bfqq_sequential_and_IO_bound, idling_boosts_thr; @@ -4521,7 +4521,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, * flash-based device. */ idling_boosts_thr = rot_without_queueing || - ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + ((blk_queue_rot(bfqd->queue) || !bfqd->hw_tag) && bfqq_sequential_and_IO_bound); /* @@ -4722,7 +4722,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * there is only one in-flight large request * at a time. 
*/ - if (blk_queue_nonrot(bfqd->queue) && + if (!blk_queue_rot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= BFQQ_SECT_THR_NONROT && bfqd->tot_rq_in_driver >= 1) @@ -6340,7 +6340,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) bfqd->hw_tag_samples = 0; bfqd->nonrot_with_queueing = - blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag; + !blk_queue_rot(bfqd->queue) && bfqd->hw_tag; } static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) @@ -7293,7 +7293,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) INIT_HLIST_HEAD(&bfqd->burst_list); bfqd->hw_tag = -1; - bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue); + bfqd->nonrot_with_queueing = !blk_queue_rot(bfqd->queue); bfqd->bfq_max_budget = bfq_default_max_budget; @@ -7328,9 +7328,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) * Begin by assuming, optimistically, that the device peak * rate is equal to 2/3 of the highest reference rate. */ - bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * - ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; - bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->rate_dur_prod = ref_rate[!blk_queue_rot(bfqd->queue)] * + ref_wr_duration[!blk_queue_rot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[!blk_queue_rot(bfqd->queue)] * 2 / 3; /* see comments on the definition of next field inside bfq_data */ bfqd->actuator_load_threshold = 4; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index a0416927d33d..ef543d163d46 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -812,7 +812,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) u64 now_ns; /* rotational? 
*/ - if (!blk_queue_nonrot(disk->queue)) + if (blk_queue_rot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 45bd18f68541..f7434278cd29 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -988,10 +988,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) u64 now = blk_time_get_ns(); int cpu; - if (blk_queue_nonrot(blkg->q)) - iolat->ssd = true; - else - iolat->ssd = false; + iolat->ssd = !blk_queue_rot(blkg->q); for_each_possible_cpu(cpu) { struct latency_stat *stat; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0974875f77bd..8e025834f2fb 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -747,10 +747,9 @@ u64 wbt_default_latency_nsec(struct request_queue *q) * We default to 2msec for non-rotational storage, and 75msec * for rotational storage. */ - if (blk_queue_nonrot(q)) - return 2000000ULL; - else + if (blk_queue_rot(q)) return 75000000ULL; + return 2000000ULL; } static int wbt_data_dir(const struct request *rq) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4536211ff33c..1e5b5547929f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -680,7 +680,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) -#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL)) +#define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL) #define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT) #define blk_queue_passthrough_stat(q) \ ((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH) @@ -1463,7 +1463,7 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev) static inline bool bdev_nonrot(struct block_device *bdev) { - return blk_queue_nonrot(bdev_get_queue(bdev)); + return 
!blk_queue_rot(bdev_get_queue(bdev)); } static inline bool bdev_synchronous(struct block_device *bdev) From ad5f2e2908c9b79a86529281a48e94d644d43dc7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Wed, 28 Jan 2026 13:56:34 -0700 Subject: [PATCH 114/162] ublk: restore auto buf unregister refcount optimization Commit 1ceeedb59749 ("ublk: optimize UBLK_IO_UNREGISTER_IO_BUF on daemon task") optimized ublk request buffer unregistration to use a non-atomic reference count decrement when performed on the ublk_io's daemon task. The optimization applied to auto buffer unregistration, which happens as part of handling UBLK_IO_COMMIT_AND_FETCH_REQ on the daemon task. However, commit b749965edda8 ("ublk: remove ublk_commit_and_fetch()") reordered the ublk_sub_req_ref() for the completed request before the io_buffer_unregister_bvec() call. As a result, task_registered_buffers is already 0 when io_buffer_unregister_bvec() calls ublk_io_release() and the non-atomic refcount optimization doesn't apply. Move the io_buffer_unregister_bvec() call back to before ublk_need_complete_req() to restore the reference counting optimization. 
Signed-off-by: Caleb Sander Mateos Fixes: b749965edda8 ("ublk: remove ublk_commit_and_fetch()") Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8a5a6ba29a1d..5efaf53261ce 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3334,11 +3334,11 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, io->res = result; req = ublk_fill_io_cmd(io, cmd); ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx); + if (buf_idx != UBLK_INVALID_BUF_IDX) + io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); compl = ublk_need_complete_req(ub, io); /* can't touch 'ublk_io' any more */ - if (buf_idx != UBLK_INVALID_BUF_IDX) - io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) From da562d92e6755c00cd67845a8dbfb908dac51a9c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 30 Jan 2026 15:28:45 +0900 Subject: [PATCH 115/162] block: introduce bdev_rot() Introduce the helper function bdev_rot() to test if a block device is a rotational one. The existing function bdev_nonrot() which tests for the opposite condition is redefined using this new helper. This avoids the double negation (operator and name) that appears when testing if a block device is a rotational device, thus making the code a little easier to read. Call sites of bdev_nonrot() in the block layer are updated to use this new helper. Remaining users in other subsystems are left unchanged for now. 
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- drivers/block/loop.c | 2 +- drivers/nvme/target/admin-cmd.c | 4 ++-- include/linux/blkdev.h | 7 ++++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 344478348a54..fd48f82f9f03 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -692,7 +692,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, !bdev_nonrot(bdev)); + return put_ushort(argp, bdev_rot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd59c0e9508b..ae3039584045 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) lim->features |= BLK_FEAT_WRITE_CACHE; - if (backing_bdev && !bdev_nonrot(backing_bdev)) + if (backing_bdev && bdev_rot(backing_bdev)) lim->features |= BLK_FEAT_ROTATIONAL; lim->max_hw_discard_sectors = max_discard_sectors; lim->max_write_zeroes_sectors = max_discard_sectors; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 3da31bb1183e..5e366502fb75 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -298,7 +298,7 @@ static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req) if (status) goto out; - if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) { + if (!req->ns->bdev || !bdev_rot(req->ns->bdev)) { status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; goto out; } @@ -1084,7 +1084,7 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req) id->nmic = NVME_NS_NMIC_SHARED; if (req->ns->readonly) 
id->nsattr |= NVME_NS_ATTR_RO; - if (req->ns->bdev && !bdev_nonrot(req->ns->bdev)) + if (req->ns->bdev && bdev_rot(req->ns->bdev)) id->nsfeat |= NVME_NS_ROTATIONAL; /* * We need flush command to flush the file's metadata, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1e5b5547929f..2ae4c45e4959 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1461,9 +1461,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev) return bdev_limits(bdev)->max_wzeroes_unmap_sectors; } +static inline bool bdev_rot(struct block_device *bdev) +{ + return blk_queue_rot(bdev_get_queue(bdev)); +} + static inline bool bdev_nonrot(struct block_device *bdev) { - return !blk_queue_rot(bdev_get_queue(bdev)); + return !bdev_rot(bdev); } static inline bool bdev_synchronous(struct block_device *bdev) From da7e4b75e50c087d2031a92f6646eb90f7045a67 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Fri, 30 Jan 2026 10:14:12 -0700 Subject: [PATCH 116/162] ublk: Validate SQE128 flag before accessing the cmd ublk_ctrl_cmd_dump() accesses (header *)sqe->cmd before IO_URING_F_SQE128 flag check. This could cause an out-of-bounds memory access. Move the SQE128 flag check earlier in ublk_ctrl_uring_cmd() to return -EINVAL immediately if the flag is not set. 
Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Govindarajulu Varadarajan Reviewed-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 5efaf53261ce..01088194c8d3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -5221,10 +5221,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ublk_ctrl_cmd_dump(cmd); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; + return -EINVAL; + + ublk_ctrl_cmd_dump(cmd); ret = ublk_check_cmd_op(cmd_op); if (ret) From ed9f54cc1e335096733aed03c2a46de3d58922ed Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 30 Jan 2026 10:14:13 -0700 Subject: [PATCH 117/162] ublk: use READ_ONCE() to read struct ublksrv_ctrl_cmd struct ublksrv_ctrl_cmd is part of the io_uring_sqe, which may lie in userspace-mapped memory. It's racy to access its fields with normal loads, as userspace may write to them concurrently. Use READ_ONCE() to copy the ublksrv_ctrl_cmd from the io_uring_sqe to the stack. Use the local copy in place of the one in the io_uring_sqe. 
Fixes: 87213b0d847c ("ublk: allow non-blocking ctrl cmds in IO_URING_F_NONBLOCK issue") Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 56 ++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 01088194c8d3..8122b012a7ae 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -4731,12 +4731,11 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) return 0; } -static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) +static inline void ublk_ctrl_cmd_dump(u32 cmd_op, + const struct ublksrv_ctrl_cmd *header) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); - pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", - __func__, cmd->cmd_op, header->dev_id, header->queue_id, + __func__, cmd_op, header->dev_id, header->queue_id, header->data[0], header->addr, header->len); } @@ -5119,9 +5118,8 @@ exit: } static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, - struct io_uring_cmd *cmd) + u32 cmd_op, struct ublksrv_ctrl_cmd *header) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; void __user *argp = (void __user *)(unsigned long)header->addr; char *dev_path = NULL; @@ -5137,7 +5135,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, * know if the specified device is created as unprivileged * mode. 
*/ - if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) + if (_IOC_NR(cmd_op) != UBLK_CMD_GET_DEV_INFO2) return 0; } @@ -5158,7 +5156,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, return PTR_ERR(dev_path); ret = -EINVAL; - switch (_IOC_NR(cmd->cmd_op)) { + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: @@ -5188,7 +5186,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, header->addr += header->dev_path_len; } pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", - __func__, ub->ub_number, cmd->cmd_op, + __func__, ub->ub_number, cmd_op, ub->dev_info.owner_uid, ub->dev_info.owner_gid, dev_path, ret); exit: @@ -5212,7 +5210,9 @@ static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op) static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + /* May point to userspace-mapped memory */ + const struct ublksrv_ctrl_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + struct ublksrv_ctrl_cmd header; struct ublk_device *ub = NULL; u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; @@ -5224,31 +5224,37 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!(issue_flags & IO_URING_F_SQE128)) return -EINVAL; - ublk_ctrl_cmd_dump(cmd); + header.dev_id = READ_ONCE(ub_src->dev_id); + header.queue_id = READ_ONCE(ub_src->queue_id); + header.len = READ_ONCE(ub_src->len); + header.addr = READ_ONCE(ub_src->addr); + header.data[0] = READ_ONCE(ub_src->data[0]); + header.dev_path_len = READ_ONCE(ub_src->dev_path_len); + ublk_ctrl_cmd_dump(cmd_op, &header); ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; if (cmd_op == UBLK_U_CMD_GET_FEATURES) { - ret = ublk_ctrl_get_features(header); + ret = ublk_ctrl_get_features(&header); goto out; } if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; - ub = ublk_get_device_from_id(header->dev_id); + ub = 
ublk_get_device_from_id(header.dev_id); if (!ub) goto out; - ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + ret = ublk_ctrl_uring_cmd_permission(ub, cmd_op, &header); if (ret) goto put_dev; } switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(ub, header); + ret = ublk_ctrl_start_dev(ub, &header); break; case UBLK_CMD_STOP_DEV: ublk_ctrl_stop_dev(ub); @@ -5256,10 +5262,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, break; case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: - ret = ublk_ctrl_get_dev_info(ub, header); + ret = ublk_ctrl_get_dev_info(ub, &header); break; case UBLK_CMD_ADD_DEV: - ret = ublk_ctrl_add_dev(header); + ret = ublk_ctrl_add_dev(&header); break; case UBLK_CMD_DEL_DEV: ret = ublk_ctrl_del_dev(&ub, true); @@ -5268,26 +5274,26 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(ub, header); + ret = ublk_ctrl_get_queue_affinity(ub, &header); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(ub, header); + ret = ublk_ctrl_get_params(ub, &header); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(ub, header); + ret = ublk_ctrl_set_params(ub, &header); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(ub, header); + ret = ublk_ctrl_start_recovery(ub, &header); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(ub, header); + ret = ublk_ctrl_end_recovery(ub, &header); break; case UBLK_CMD_UPDATE_SIZE: - ublk_ctrl_set_size(ub, header); + ublk_ctrl_set_size(ub, &header); ret = 0; break; case UBLK_CMD_QUIESCE_DEV: - ret = ublk_ctrl_quiesce_dev(ub, header); + ret = ublk_ctrl_quiesce_dev(ub, &header); break; case UBLK_CMD_TRY_STOP_DEV: ret = ublk_ctrl_try_stop_dev(ub); @@ -5302,7 +5308,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ublk_put_device(ub); out: pr_devel("%s: cmd done ret %d cmd_op %x, dev id 
%d qid %d\n", - __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); + __func__, ret, cmd_op, header.dev_id, header.queue_id); return ret; } From 373df2c0255da77f0842368708afce771e1330ca Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 30 Jan 2026 10:14:14 -0700 Subject: [PATCH 118/162] ublk: drop ublk_ctrl_start_recovery() header argument ublk_ctrl_start_recovery() only uses its const struct ublksrv_ctrl_cmd * header argument to log the dev_id. But this value is already available in struct ublk_device's ub_number field. So log ub_number instead and drop the unused header argument. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8122b012a7ae..60d07480a24c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -4872,8 +4872,7 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, return ret; } -static int ublk_ctrl_start_recovery(struct ublk_device *ub, - const struct ublksrv_ctrl_cmd *header) +static int ublk_ctrl_start_recovery(struct ublk_device *ub) { int ret = -EINVAL; @@ -4902,7 +4901,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, ret = -EBUSY; goto out_unlock; } - pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id); + pr_devel("%s: start recovery for dev id %d\n", __func__, ub->ub_number); init_completion(&ub->completion); ret = 0; out_unlock: @@ -5283,7 +5282,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_set_params(ub, &header); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(ub, &header); + ret = ublk_ctrl_start_recovery(ub); break; case UBLK_CMD_END_USER_RECOVERY: ret = ublk_ctrl_end_recovery(ub, &header); From 66d3af8d5d678d221776a1886baec8d78293592c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:50 
+0800 Subject: [PATCH 119/162] ublk: check list membership before cancelling batch fetch command Add !list_empty(&fcmd->node) check in ublk_batch_cancel_cmd() to ensure the fcmd hasn't already been removed from the list. Once an fcmd is removed from the list, it's considered claimed by whoever removed it and will be freed by that path. Meanwhile, switch to list_del_init() when deleting it from the list. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 60d07480a24c..92bd2351e3ad 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -738,7 +738,7 @@ static void ublk_batch_deinit_fetch_buf(struct ublk_queue *ubq, int res) { spin_lock(&ubq->evts_lock); - list_del(&fcmd->node); + list_del_init(&fcmd->node); WARN_ON_ONCE(fcmd != ubq->active_fcmd); __ublk_release_fcmd(ubq); spin_unlock(&ubq->evts_lock); @@ -2693,6 +2693,16 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); } +/* + * Cancel a batch fetch command if it hasn't been claimed by another path. + * + * An fcmd can only be cancelled if: + * 1. It's not the active_fcmd (which is currently being processed) + * 2. It's still on the list (!list_empty check) - once removed from the list, + * the fcmd is considered claimed and will be freed by whoever removed it + * + * Use list_del_init() so subsequent list_empty() checks work correctly. 
+ */ static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, struct ublk_batch_fetch_cmd *fcmd, unsigned int issue_flags) @@ -2700,9 +2710,9 @@ static void ublk_batch_cancel_cmd(struct ublk_queue *ubq, bool done; spin_lock(&ubq->evts_lock); - done = (READ_ONCE(ubq->active_fcmd) != fcmd); + done = (READ_ONCE(ubq->active_fcmd) != fcmd) && !list_empty(&fcmd->node); if (done) - list_del(&fcmd->node); + list_del_init(&fcmd->node); spin_unlock(&ubq->evts_lock); if (done) { From 8443e2087e7002fa25984faad6bbf5f63b280645 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:51 +0800 Subject: [PATCH 120/162] ublk: add UBLK_F_NO_AUTO_PART_SCAN feature flag Add a new feature flag UBLK_F_NO_AUTO_PART_SCAN to allow users to suppress automatic partition scanning when starting a ublk device. This is useful for some cases in which users don't want to scan partitions. Users can still manually trigger partition scanning later when appropriate using standard tools (e.g., partprobe, blockdev --rereadpt). Reported-by: Yoav Cohen Link: https://lore.kernel.org/linux-block/DM4PR12MB63280C5637917C071C2F0D65A9A8A@DM4PR12MB6328.namprd12.prod.outlook.com/ Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 ++++++++++---- include/uapi/linux/ublk_cmd.h | 3 +++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 92bd2351e3ad..4fe754e7d1e8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -80,7 +80,8 @@ | UBLK_F_BUF_REG_OFF_DAEMON \ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? 
UBLK_F_INTEGRITY : 0) \ | UBLK_F_SAFE_STOP_DEV \ - | UBLK_F_BATCH_IO) + | UBLK_F_BATCH_IO \ + | UBLK_F_NO_AUTO_PART_SCAN) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -4430,9 +4431,14 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, set_bit(UB_STATE_USED, &ub->state); - /* Schedule async partition scan for trusted daemons */ - if (!ub->unprivileged_daemons) - schedule_work(&ub->partition_scan_work); + /* Skip partition scan if disabled by user */ + if (ub->dev_info.flags & UBLK_F_NO_AUTO_PART_SCAN) { + clear_bit(GD_SUPPRESS_PART_SCAN, &disk->state); + } else { + /* Schedule async partition scan for trusted daemons */ + if (!ub->unprivileged_daemons) + schedule_work(&ub->partition_scan_work); + } out_put_cdev: if (ret) { diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 743d31491387..a88876756805 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -367,6 +367,9 @@ */ #define UBLK_F_SAFE_STOP_DEV (1ULL << 17) +/* Disable automatic partition scanning when device is started */ +#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 From 3a4d8bed0b47543b2dfce0b1d714b40d68ff2f7e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:52 +0800 Subject: [PATCH 121/162] selftests: ublk: derive TID automatically from script name Add automatic TID derivation in test_common.sh based on the script filename. The TID is extracted by stripping the "test_" prefix and ".sh" suffix from the script name (e.g., test_loop_01.sh -> loop_01). This removes the need for each test script to manually define TID, reducing boilerplate and preventing potential mismatches between the script name and TID. Scripts can still override TID after sourcing test_common.sh if needed. 
Reviewed-by: Caleb Sander Mateos Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_batch_01.sh | 1 - tools/testing/selftests/ublk/test_batch_02.sh | 1 - tools/testing/selftests/ublk/test_batch_03.sh | 1 - tools/testing/selftests/ublk/test_common.sh | 5 +++++ tools/testing/selftests/ublk/test_generic_01.sh | 1 - tools/testing/selftests/ublk/test_generic_02.sh | 1 - tools/testing/selftests/ublk/test_generic_03.sh | 1 - tools/testing/selftests/ublk/test_generic_04.sh | 1 - tools/testing/selftests/ublk/test_generic_05.sh | 1 - tools/testing/selftests/ublk/test_generic_06.sh | 1 - tools/testing/selftests/ublk/test_generic_07.sh | 1 - tools/testing/selftests/ublk/test_generic_08.sh | 1 - tools/testing/selftests/ublk/test_generic_09.sh | 1 - tools/testing/selftests/ublk/test_generic_10.sh | 1 - tools/testing/selftests/ublk/test_generic_11.sh | 1 - tools/testing/selftests/ublk/test_generic_12.sh | 1 - tools/testing/selftests/ublk/test_generic_13.sh | 1 - tools/testing/selftests/ublk/test_generic_14.sh | 1 - tools/testing/selftests/ublk/test_generic_15.sh | 1 - tools/testing/selftests/ublk/test_generic_16.sh | 1 - tools/testing/selftests/ublk/test_loop_01.sh | 1 - tools/testing/selftests/ublk/test_loop_02.sh | 1 - tools/testing/selftests/ublk/test_loop_03.sh | 1 - tools/testing/selftests/ublk/test_loop_04.sh | 1 - tools/testing/selftests/ublk/test_loop_05.sh | 1 - tools/testing/selftests/ublk/test_loop_06.sh | 1 - tools/testing/selftests/ublk/test_loop_07.sh | 1 - tools/testing/selftests/ublk/test_loop_08.sh | 1 - tools/testing/selftests/ublk/test_null_01.sh | 1 - tools/testing/selftests/ublk/test_null_02.sh | 1 - tools/testing/selftests/ublk/test_null_03.sh | 1 - tools/testing/selftests/ublk/test_null_04.sh | 1 - tools/testing/selftests/ublk/test_stress_01.sh | 1 - tools/testing/selftests/ublk/test_stress_02.sh | 1 - tools/testing/selftests/ublk/test_stress_03.sh | 1 - tools/testing/selftests/ublk/test_stress_04.sh | 1 - 
tools/testing/selftests/ublk/test_stress_05.sh | 1 - tools/testing/selftests/ublk/test_stress_06.sh | 1 - tools/testing/selftests/ublk/test_stress_07.sh | 1 - tools/testing/selftests/ublk/test_stress_08.sh | 1 - tools/testing/selftests/ublk/test_stress_09.sh | 1 - tools/testing/selftests/ublk/test_stripe_01.sh | 1 - tools/testing/selftests/ublk/test_stripe_02.sh | 1 - tools/testing/selftests/ublk/test_stripe_03.sh | 1 - tools/testing/selftests/ublk/test_stripe_04.sh | 1 - tools/testing/selftests/ublk/test_stripe_05.sh | 1 - tools/testing/selftests/ublk/test_stripe_06.sh | 1 - 47 files changed, 5 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/ublk/test_batch_01.sh b/tools/testing/selftests/ublk/test_batch_01.sh index 9fa9fff5c62f..a18fb39af8be 100755 --- a/tools/testing/selftests/ublk/test_batch_01.sh +++ b/tools/testing/selftests/ublk/test_batch_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_01" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_02.sh b/tools/testing/selftests/ublk/test_batch_02.sh index b477f91359e1..7ca384d11987 100755 --- a/tools/testing/selftests/ublk/test_batch_02.sh +++ b/tools/testing/selftests/ublk/test_batch_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_02" ERR_CODE=0 if ! _have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_batch_03.sh b/tools/testing/selftests/ublk/test_batch_03.sh index 13a2b3d3a1b9..aca9cf144b55 100755 --- a/tools/testing/selftests/ublk/test_batch_03.sh +++ b/tools/testing/selftests/ublk/test_batch_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="batch_03" ERR_CODE=0 if ! 
_have_feature "BATCH_IO"; then diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 7ff6ce79d62c..bbe031c94a29 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -1,6 +1,11 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +# Derive TID from script name: test__.sh -> _ +# Can be overridden in test script after sourcing this file +TID=$(basename "$0" .sh) +TID=${TID#test_} + UBLK_SKIP_CODE=4 _have_program() { diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh index 21a31cd5491a..26cf3c7ceeb5 100755 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_01" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 12920768b1a0..1d4b1d6e059c 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_02" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_03.sh b/tools/testing/selftests/ublk/test_generic_03.sh index b551aa76cb0d..8934ea926762 100755 --- a/tools/testing/selftests/ublk/test_generic_03.sh +++ b/tools/testing/selftests/ublk/test_generic_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_03" ERR_CODE=0 _prep_test "null" "check dma & segment limits for zero copy" diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_generic_04.sh index be2292822bbe..2672f9c40fa8 100755 --- a/tools/testing/selftests/ublk/test_generic_04.sh +++ b/tools/testing/selftests/ublk/test_generic_04.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 9b7f71c16d82..bda5064bc31f 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_05" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_06.sh b/tools/testing/selftests/ublk/test_generic_06.sh index fd42062b7b76..14a05054fcd8 100755 --- a/tools/testing/selftests/ublk/test_generic_06.sh +++ b/tools/testing/selftests/ublk/test_generic_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_06" ERR_CODE=0 _prep_test "fault_inject" "fast cleanup when all I/Os of one hctx are in server" diff --git a/tools/testing/selftests/ublk/test_generic_07.sh b/tools/testing/selftests/ublk/test_generic_07.sh index cba86451fa5e..8dcfd8978f50 100755 --- a/tools/testing/selftests/ublk/test_generic_07.sh +++ b/tools/testing/selftests/ublk/test_generic_07.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_07" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_generic_08.sh b/tools/testing/selftests/ublk/test_generic_08.sh index b222f3a77e12..ce88c31d6b9c 100755 --- a/tools/testing/selftests/ublk/test_generic_08.sh +++ b/tools/testing/selftests/ublk/test_generic_08.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_08" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_09.sh b/tools/testing/selftests/ublk/test_generic_09.sh index bb6f77ca5522..744d0cdaa242 100755 --- a/tools/testing/selftests/ublk/test_generic_09.sh +++ b/tools/testing/selftests/ublk/test_generic_09.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_09" ERR_CODE=0 if ! _have_feature "AUTO_BUF_REG"; then diff --git a/tools/testing/selftests/ublk/test_generic_10.sh b/tools/testing/selftests/ublk/test_generic_10.sh index abc11c3d416b..4b4293b9081f 100755 --- a/tools/testing/selftests/ublk/test_generic_10.sh +++ b/tools/testing/selftests/ublk/test_generic_10.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_10" ERR_CODE=0 if ! _have_feature "UPDATE_SIZE"; then diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_generic_11.sh index d1f973c8c645..e0dc0b8fe5d6 100755 --- a/tools/testing/selftests/ublk/test_generic_11.sh +++ b/tools/testing/selftests/ublk/test_generic_11.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_11" ERR_CODE=0 ublk_run_quiesce_recover() diff --git a/tools/testing/selftests/ublk/test_generic_12.sh b/tools/testing/selftests/ublk/test_generic_12.sh index b4046201b4d9..54b81ddfe9f9 100755 --- a/tools/testing/selftests/ublk/test_generic_12.sh +++ b/tools/testing/selftests/ublk/test_generic_12.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_12" ERR_CODE=0 if ! _have_program bpftrace; then diff --git a/tools/testing/selftests/ublk/test_generic_13.sh b/tools/testing/selftests/ublk/test_generic_13.sh index b7aa90b1cb74..922115aa14f4 100755 --- a/tools/testing/selftests/ublk/test_generic_13.sh +++ b/tools/testing/selftests/ublk/test_generic_13.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_13" ERR_CODE=0 _prep_test "null" "check that feature list is complete" diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_generic_14.sh index cd9b44b97c24..178443394ca5 100755 --- a/tools/testing/selftests/ublk/test_generic_14.sh +++ b/tools/testing/selftests/ublk/test_generic_14.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_14" ERR_CODE=0 ublk_run_recover_test() diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_generic_15.sh index 76379362e0a2..727d0f4610d6 100755 --- a/tools/testing/selftests/ublk/test_generic_15.sh +++ b/tools/testing/selftests/ublk/test_generic_15.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_15" ERR_CODE=0 _test_partition_scan_no_hang() diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index e08af7b685c9..42e8d2e16ec9 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_16" ERR_CODE=0 _prep_test "null" "stop --safe command" diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh index 833fa0dbc700..338a235fd82a 100755 --- a/tools/testing/selftests/ublk/test_loop_01.sh +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh index 874568b3646b..04c52454e2ec 100755 --- a/tools/testing/selftests/ublk/test_loop_02.sh +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_02" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh index c30f797c6429..6e8f649fe93d 100755 --- a/tools/testing/selftests/ublk/test_loop_03.sh +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_03" ERR_CODE=0 if ! 
_have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh index b01d75b3214d..9f6774ec0de6 100755 --- a/tools/testing/selftests/ublk/test_loop_04.sh +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_04" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with zero copy" diff --git a/tools/testing/selftests/ublk/test_loop_05.sh b/tools/testing/selftests/ublk/test_loop_05.sh index de2141533074..2b8d99e007be 100755 --- a/tools/testing/selftests/ublk/test_loop_05.sh +++ b/tools/testing/selftests/ublk/test_loop_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_06.sh b/tools/testing/selftests/ublk/test_loop_06.sh index 1d1a8a725502..e73f6f4844db 100755 --- a/tools/testing/selftests/ublk/test_loop_06.sh +++ b/tools/testing/selftests/ublk/test_loop_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_06" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_loop_07.sh b/tools/testing/selftests/ublk/test_loop_07.sh index 493f3fb611a5..264d20e7c530 100755 --- a/tools/testing/selftests/ublk/test_loop_07.sh +++ b/tools/testing/selftests/ublk/test_loop_07.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="loop_07" ERR_CODE=0 _prep_test "loop" "mkfs & mount & umount with user copy" diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index ca289cfb2ad4..2caa7ba748fb 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,7 +13,6 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi -TID=loop_08 _prep_test "loop" "end-to-end integrity" diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh index c2cb8f7a09fe..eebce8076530 100755 --- a/tools/testing/selftests/ublk/test_null_01.sh +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh index 8accd35beb55..654bdff39664 100755 --- a/tools/testing/selftests/ublk/test_null_02.sh +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_03.sh b/tools/testing/selftests/ublk/test_null_03.sh index 0051067b4686..29cd09f06672 100755 --- a/tools/testing/selftests/ublk/test_null_03.sh +++ b/tools/testing/selftests/ublk/test_null_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="null_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 0b0719ea33a3..7491b8c17f00 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID=null_04 _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh index 7d3150f057d4..a9322ce496e9 100755 --- a/tools/testing/selftests/ublk/test_stress_01.sh +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_01" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh index 4bdd921081e5..6c114194f9c9 100755 --- a/tools/testing/selftests/ublk/test_stress_02.sh +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_02" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_03.sh b/tools/testing/selftests/ublk/test_stress_03.sh index 3ed4c9b2d8c0..4e81ca0db758 100755 --- a/tools/testing/selftests/ublk/test_stress_03.sh +++ b/tools/testing/selftests/ublk/test_stress_03.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_03" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_04.sh b/tools/testing/selftests/ublk/test_stress_04.sh index efa8dc33234b..6c6f44b172bc 100755 --- a/tools/testing/selftests/ublk/test_stress_04.sh +++ b/tools/testing/selftests/ublk/test_stress_04.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_04" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_05.sh b/tools/testing/selftests/ublk/test_stress_05.sh index 68a194144302..7e9324de2030 100755 --- a/tools/testing/selftests/ublk/test_stress_05.sh +++ b/tools/testing/selftests/ublk/test_stress_05.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stress_06.sh b/tools/testing/selftests/ublk/test_stress_06.sh index 37188ec2e1f7..c72e5d0b14be 100755 --- a/tools/testing/selftests/ublk/test_stress_06.sh +++ b/tools/testing/selftests/ublk/test_stress_06.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_06" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_07.sh b/tools/testing/selftests/ublk/test_stress_07.sh index fb061fc26d36..04c2764d5238 100755 --- a/tools/testing/selftests/ublk/test_stress_07.sh +++ b/tools/testing/selftests/ublk/test_stress_07.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_07" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stress_08.sh b/tools/testing/selftests/ublk/test_stress_08.sh index 9abb50ee3d00..37f7d204879a 100755 --- a/tools/testing/selftests/ublk/test_stress_08.sh +++ b/tools/testing/selftests/ublk/test_stress_08.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_08" ERR_CODE=0 ublk_io_and_remove() diff --git a/tools/testing/selftests/ublk/test_stress_09.sh b/tools/testing/selftests/ublk/test_stress_09.sh index 87b92b0a2410..53c1e3b2ab30 100755 --- a/tools/testing/selftests/ublk/test_stress_09.sh +++ b/tools/testing/selftests/ublk/test_stress_09.sh @@ -2,7 +2,6 @@ # SPDX-License-Identifier: GPL-2.0 . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stress_09" ERR_CODE=0 ublk_io_and_kill_daemon() diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh index 4e4f0fdf3c9b..3bc821aadad8 100755 --- a/tools/testing/selftests/ublk/test_stripe_01.sh +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_01" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh index 5820ab2efba4..4a7d2b21a6bf 100755 --- a/tools/testing/selftests/ublk/test_stripe_02.sh +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_02" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount" diff --git a/tools/testing/selftests/ublk/test_stripe_03.sh b/tools/testing/selftests/ublk/test_stripe_03.sh index 20b977e27814..a1c159d54e53 100755 --- a/tools/testing/selftests/ublk/test_stripe_03.sh +++ b/tools/testing/selftests/ublk/test_stripe_03.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_03" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_04.sh b/tools/testing/selftests/ublk/test_stripe_04.sh index 1b51ed2f1d84..0c30bd6c2b3b 100755 --- a/tools/testing/selftests/ublk/test_stripe_04.sh +++ b/tools/testing/selftests/ublk/test_stripe_04.sh @@ -3,7 +3,6 @@ . 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_04" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on zero copy" diff --git a/tools/testing/selftests/ublk/test_stripe_05.sh b/tools/testing/selftests/ublk/test_stripe_05.sh index 05d71951d710..6ddfa88ad226 100755 --- a/tools/testing/selftests/ublk/test_stripe_05.sh +++ b/tools/testing/selftests/ublk/test_stripe_05.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_05" ERR_CODE=0 if ! _have_program fio; then diff --git a/tools/testing/selftests/ublk/test_stripe_06.sh b/tools/testing/selftests/ublk/test_stripe_06.sh index d06cac7626e2..a2c7bf4cc613 100755 --- a/tools/testing/selftests/ublk/test_stripe_06.sh +++ b/tools/testing/selftests/ublk/test_stripe_06.sh @@ -3,7 +3,6 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="stripe_06" ERR_CODE=0 _prep_test "stripe" "mkfs & mount & umount on user copy" From e07a2039b6d4ae3acf8ae39b86be449b7fa18d4a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:53 +0800 Subject: [PATCH 122/162] selftests: ublk: add selftest for UBLK_F_NO_AUTO_PART_SCAN Add test_part_01.sh to test the UBLK_F_NO_AUTO_PART_SCAN feature flag which allows suppressing automatic partition scanning during device startup while still allowing manual partition probing. The test verifies: - Normal behavior: partitions are auto-detected without the flag - With flag: partitions are not auto-detected during START_DEV - Manual scan: blockdev --rereadpt works with the flag Also update kublk tool to support --no_auto_part_scan option and recognize the feature flag. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + tools/testing/selftests/ublk/kublk.c | 6 +- tools/testing/selftests/ublk/kublk.h | 3 +- tools/testing/selftests/ublk/test_part_01.sh | 104 +++++++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_part_01.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index e39a6f871fcc..bc5bd7d1381d 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -48,6 +48,8 @@ TEST_PROGS += test_stripe_04.sh TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh +TEST_PROGS += test_part_01.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 2da37557e1a9..e8279c4acc40 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1615,6 +1615,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_INTEGRITY), FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), + FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), }; struct ublk_dev *dev; __u64 features = 0; @@ -1712,7 +1713,7 @@ static void __cmd_create_help(char *exe, bool recovery) printf("\t[--nthreads threads] [--per_io_tasks]\n"); printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] " "[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n"); - printf("\t[--batch|-b]\n"); + printf("\t[--batch|-b] [--no_auto_part_scan]\n"); printf("\t[target options] [backfile1] [backfile2] ...\n"); printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n"); printf("\tdefault: nthreads=nr_queues"); @@ -1786,6 +1787,7 @@ int main(int argc, char *argv[]) { "tag_size", 1, NULL, 0 }, { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, + { "no_auto_part_scan", 
0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1898,6 +1900,8 @@ int main(int argc, char *argv[]) ctx.tag_size = strtoul(optarg, NULL, 0); if (!strcmp(longopts[option_idx].name, "safe")) ctx.safe_stop = 1; + if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) + ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index ca97deb5e208..1faeccaaecae 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -78,12 +78,13 @@ struct dev_ctx { unsigned int auto_zc_fallback:1; unsigned int per_io_tasks:1; unsigned int no_ublk_fixed_fd:1; + unsigned int safe_stop:1; + unsigned int no_auto_part_scan:1; __u32 integrity_flags; __u8 metadata_size; __u8 pi_offset; __u8 csum_type; __u8 tag_size; - unsigned int safe_stop:1; int _evtfd; int _shmid; diff --git a/tools/testing/selftests/ublk/test_part_01.sh b/tools/testing/selftests/ublk/test_part_01.sh new file mode 100755 index 000000000000..8028f6e4b3a5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_part_01.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +format_backing_file() +{ + local backing_file=$1 + + # Create ublk device to write partition table + local tmp_dev=$(_add_ublk_dev -t loop "${backing_file}") + [ $? -ne 0 ] && return 1 + + # Write partition table with sfdisk + sfdisk /dev/ublkb"${tmp_dev}" > /dev/null 2>&1 < /dev/null 2>&1 + udevadm settle + + if [ ! -e /dev/ublkb"${dev_id}"p1 ] || [ ! -e /dev/ublkb"${dev_id}"p2 ]; then + "${UBLK_PROG}" del -n "${dev_id}" + return 1 + fi + + "${UBLK_PROG}" del -n "${dev_id}" + return 0 +} + +if ! _have_program sfdisk || ! _have_program blockdev; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "generic" "test UBLK_F_NO_AUTO_PART_SCAN" + +if ! 
_have_feature "UBLK_F_NO_AUTO_PART_SCAN"; then + _cleanup_test "generic" + exit "$UBLK_SKIP_CODE" +fi + + +# Create and format backing file with partition table +_create_backfile 0 256M +format_backing_file "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test normal auto partition scan +[ "$ERR_CODE" -eq 0 ] && test_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +# Test no auto partition scan with manual scan +[ "$ERR_CODE" -eq 0 ] && test_no_auto_part_scan "${UBLK_BACKFILES[0]}" +[ $? -ne 0 ] && ERR_CODE=255 + +_cleanup_test "generic" +_show_result $TID $ERR_CODE From 7a30d3dfea4a455d1109d5258fe332f2157071ba Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:54 +0800 Subject: [PATCH 123/162] selftests: ublk: rename test_generic_15 to test_part_02 This test exercises partition scanning behavior, so move it to the test_part_* group for consistency. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 +- .../selftests/ublk/{test_generic_15.sh => test_part_02.sh} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tools/testing/selftests/ublk/{test_generic_15.sh => test_part_02.sh} (100%) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index bc5bd7d1381d..ca8588ed962c 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -22,7 +22,6 @@ TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_14.sh -TEST_PROGS += test_generic_15.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -49,6 +48,7 @@ TEST_PROGS += test_stripe_05.sh TEST_PROGS += test_stripe_06.sh TEST_PROGS += test_part_01.sh +TEST_PROGS += test_part_02.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_15.sh b/tools/testing/selftests/ublk/test_part_02.sh similarity 
index 100% rename from tools/testing/selftests/ublk/test_generic_15.sh rename to tools/testing/selftests/ublk/test_part_02.sh From 130975353b1548d76aa9790a4ac7e74bd2a37221 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:55 +0800 Subject: [PATCH 124/162] selftests: ublk: refactor test_null_04 into separate functions Encapsulate each test case in its own function that creates the device, runs checks, and deletes only that device. This avoids calling _cleanup_test multiple times. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 250 +++++++------------ 1 file changed, 95 insertions(+), 155 deletions(-) diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 7491b8c17f00..22328e0f3925 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -3,163 +3,103 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +ERR_CODE=0 + +_check_value() { + local name=$1 + local actual=$2 + local expected=$3 + + if [ "$actual" != "$expected" ]; then + echo "$name $actual != $expected" + ERR_CODE=255 + return 1 + fi + return 0 +} + +_test_metadata_only() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + _check_add_dev "$TID" $? 
+ + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 0 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" nop && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_capable_ip() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 56 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 1 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE3-IP && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_integrity_reftag_t10dif() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + _check_add_dev "$TID" $? 
+ + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 8 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" T10-DIF-TYPE1-CRC && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 + + ${UBLK_PROG} del -n "${dev_id}" +} + +_test_nvme_csum() { + local dev_id + + dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + _check_add_dev "$TID" $? + + _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && + _check_value "pi_offset" "$(_get_metadata_size "$dev_id" pi_offset)" 0 && + _check_value "pi_tuple_size" "$(_get_metadata_size "$dev_id" pi_tuple_size)" 16 && + _check_value "device_is_integrity_capable" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable")" 0 && + _check_value "format" "$(cat "/sys/block/ublkb$dev_id/integrity/format")" EXT-DIF-TYPE3-CRC64 && + _check_value "protection_interval_bytes" \ + "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && + _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 + + ${UBLK_PROG} del -n "${dev_id}" +} _prep_test "null" "integrity params" -dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) -_check_add_dev $TID $? 
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 0 ]; then - echo "pi_tuple_size $pi_tuple_size != 0" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != nop ]; then - echo "format $format != nop" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test +_test_metadata_only +_test_integrity_capable_ip +_test_integrity_reftag_t10dif +_test_nvme_csum -dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) -_check_add_dev $TID $? 
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 64 ]; then - echo "metadata_size $metadata_size != 64" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 56 ]; then - echo "pi_offset $pi_offset != 56" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 1 ]; then - echo "device_is_integrity_capable $capable != 1" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE3-IP ]; then - echo "format $format != T10-DIF-TYPE3-IP" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi _cleanup_test - -dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) -_check_add_dev $TID $? 
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 8 ]; then - echo "metadata_size $metadata_size != 8" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 8 ]; then - echo "pi_tuple_size $pi_tuple_size != 8" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != T10-DIF-TYPE1-CRC ]; then - echo "format $format != T10-DIF-TYPE1-CRC" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 0 ]; then - echo "tag_size $tag_size != 0" - _show_result $TID 255 -fi -_cleanup_test - -dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) -_check_add_dev $TID $? 
-metadata_size=$(_get_metadata_size "$dev_id" metadata_size) -if [ "$metadata_size" != 16 ]; then - echo "metadata_size $metadata_size != 16" - _show_result $TID 255 -fi -pi_offset=$(_get_metadata_size "$dev_id" pi_offset) -if [ "$pi_offset" != 0 ]; then - echo "pi_offset $pi_offset != 0" - _show_result $TID 255 -fi -pi_tuple_size=$(_get_metadata_size "$dev_id" pi_tuple_size) -if [ "$pi_tuple_size" != 16 ]; then - echo "pi_tuple_size $pi_tuple_size != 16" - _show_result $TID 255 -fi -capable=$(cat "/sys/block/ublkb$dev_id/integrity/device_is_integrity_capable") -if [ "$capable" != 0 ]; then - echo "device_is_integrity_capable $capable != 0" - _show_result $TID 255 -fi -format=$(cat "/sys/block/ublkb$dev_id/integrity/format") -if [ "$format" != EXT-DIF-TYPE3-CRC64 ]; then - echo "format $format != EXT-DIF-TYPE3-CRC64" - _show_result $TID 255 -fi -protection_interval_bytes=$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes") -if [ "$protection_interval_bytes" != 512 ]; then - echo "protection_interval_bytes $protection_interval_bytes != 512" - _show_result $TID 255 -fi -tag_size=$(cat "/sys/block/ublkb$dev_id/integrity/tag_size") -if [ "$tag_size" != 8 ]; then - echo "tag_size $tag_size != 8" - _show_result $TID 255 -fi -_cleanup_test - -_show_result $TID 0 +_show_result "$TID" $ERR_CODE From 76334de7da404c385e18efb3640ed60ca77a899f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 30 Jan 2026 00:19:56 +0800 Subject: [PATCH 125/162] selftests: ublk: disable partition scan for integrity tests The null target doesn't handle IO, so disable partition scan to avoid IO failures caused by integrity verification during the kernel's partition table read. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index 22328e0f3925..a5599d38583a 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -21,7 +21,7 @@ _check_value() { _test_metadata_only() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 8) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -40,7 +40,7 @@ _test_metadata_only() { _test_integrity_capable_ip() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_capable --metadata_size 64 --pi_offset 56 --csum_type ip) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 64 && @@ -59,7 +59,7 @@ _test_integrity_capable_ip() { _test_integrity_reftag_t10dif() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --integrity_reftag --metadata_size 8 --csum_type t10dif) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --integrity_reftag --metadata_size 8 --csum_type t10dif) _check_add_dev "$TID" $? _check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 8 && @@ -78,7 +78,7 @@ _test_integrity_reftag_t10dif() { _test_nvme_csum() { local dev_id - dev_id=$(_add_ublk_dev -t null -u --metadata_size 16 --csum_type nvme --tag_size 8) + dev_id=$(_add_ublk_dev -t null -u --no_auto_part_scan --metadata_size 16 --csum_type nvme --tag_size 8) _check_add_dev "$TID" $? 
_check_value "metadata_size" "$(_get_metadata_size "$dev_id" metadata_size)" 16 && From 4e0d293af9e37c735aec574c1e69ed71f81f94b2 Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:57 +0800 Subject: [PATCH 126/162] selftests: ublk: mark each test start and end time in dmesg Log test start and end time in dmesg, so generated log messages during the test run can be linked to a specific test from the test suite. (switch to `date +%F %T`) Signed-off-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bbe031c94a29..dd4eff97610a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -126,6 +126,7 @@ _prep_test() { modprobe ublk_drv > /dev/null 2>&1 UBLK_TMP=$(mktemp ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" + echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } _remove_test_files() @@ -170,6 +171,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } _have_feature From 2feca79ef8df5505b87c00812b9ba263b92c64ed Mon Sep 17 00:00:00 2001 From: Alexander Atanasov Date: Fri, 30 Jan 2026 00:19:58 +0800 Subject: [PATCH 127/162] selftests: ublk: move test temp files into a sub directory Create and use a temporary directory for the files created during test runs. If the TMPDIR environment variable is set, use it as a base for the temporary directory path.
TMPDIR=/mnt/scratch make run_tests and TMPDIR=/mnt/scratch ./test_generic_01.sh will place test directory under /mnt/scratch Signed-off-by: Alexander Atanasov Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index dd4eff97610a..21ba51fcc7d7 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -48,7 +48,7 @@ _create_backfile() { old_file="${UBLK_BACKFILES[$index]}" [ -f "$old_file" ] && rm -f "$old_file" - new_file=$(mktemp ublk_file_"${new_size}"_XXXXX) + new_file=$(mktemp ${UBLK_TEST_DIR}/ublk_file_"${new_size}"_XXXXX) truncate -s "${new_size}" "${new_file}" UBLK_BACKFILES["$index"]="$new_file" } @@ -65,7 +65,7 @@ _remove_files() { _create_tmp_dir() { local my_file; - my_file=$(mktemp -d ublk_dir_XXXXX) + my_file=$(mktemp -d ${UBLK_TEST_DIR}/ublk_dir_XXXXX) echo "$my_file" } @@ -124,7 +124,9 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TMP=$(mktemp ublk_test_XXXXX) + TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) + export UBLK_TEST_DIR=${TDIR} + UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg } @@ -171,6 +173,7 @@ _cleanup_test() { "${UBLK_PROG}" del -a _remove_files + rmdir ${UBLK_TEST_DIR} echo "ublk selftest: $TID done at $(date '+%F %T')" | tee /dev/kmsg } @@ -405,6 +408,8 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() +UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT +export UBLK_TEST_DIR From 491af20b3c6d5baedb96357d4b12232ae490cbe7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 30 Jan 2026 08:25:30 -0700 Subject: [PATCH 128/162] 
ublk: remove "can't touch 'ublk_io' any more" comments The struct ublk_io is in fact accessed in __ublk_complete_rq() after the comment. But it's not racy to access the ublk_io between clearing its UBLK_IO_FLAG_OWNED_BY_SRV flag and completing the request, as no other thread can use the ublk_io in the meantime. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4fe754e7d1e8..3c918db4905c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3349,7 +3349,6 @@ static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd, io_buffer_unregister_bvec(cmd, buf_idx, issue_flags); compl = ublk_need_complete_req(ub, io); - /* can't touch 'ublk_io' any more */ if (req_op(req) == REQ_OP_ZONE_APPEND) req->__sector = addr; if (compl) @@ -3681,7 +3680,6 @@ static int ublk_batch_commit_io(struct ublk_queue *ubq, return ret; } - /* can't touch 'ublk_io' any more */ if (buf_idx != UBLK_INVALID_BUF_IDX) io_buffer_unregister_bvec(data->cmd, buf_idx, data->issue_flags); if (req_op(req) == REQ_OP_ZONE_APPEND) From 5af302a15a1d628a025a78892001fe8afea90c60 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:32 +0800 Subject: [PATCH 129/162] selftests: ublk: simplify UBLK_TEST_DIR handling Remove intermediate TDIR variable and set UBLK_TEST_DIR directly in _prep_test(). Remove default initialization since the directory is created dynamically when tests run. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 21ba51fcc7d7..8d298a7ee7b1 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -124,8 +124,7 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - TDIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) - export UBLK_TEST_DIR=${TDIR} + UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg @@ -408,8 +407,6 @@ UBLK_PROG=$(_ublk_test_top_dir)/kublk UBLK_TEST_QUIET=1 UBLK_TEST_SHOW_RESULT=1 UBLK_BACKFILES=() -UBLK_TEST_DIR=${TMPDIR:-.} export UBLK_PROG export UBLK_TEST_QUIET export UBLK_TEST_SHOW_RESULT -export UBLK_TEST_DIR From 842b6520e579b8bd7d6ea09937e1fb7729cce1c5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:33 +0800 Subject: [PATCH 130/162] selftests: ublk: refactor test_loop_08 into separate functions Encapsulate each test case in its own function for better organization and maintainability: - _setup_device(): device and backfile initialization - _test_fill_and_verify(): initial data population - _test_corrupted_reftag(): reftag corruption detection test - _test_corrupted_data(): data corruption detection test - _test_bad_apptag(): apptag mismatch detection test Also fix temp file creation to use ${UBLK_TEST_DIR}/fio_err_XXXXX instead of creating in current directory. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_loop_08.sh | 205 +++++++++++-------- 1 file changed, 118 insertions(+), 87 deletions(-) diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_loop_08.sh index 2caa7ba748fb..aaf1f52da559 100755 --- a/tools/testing/selftests/ublk/test_loop_08.sh +++ b/tools/testing/selftests/ublk/test_loop_08.sh @@ -13,98 +13,129 @@ if [[ "$fio_version" =~ fio-[0-9]+\.[0-9]+$ ]]; then exit $UBLK_SKIP_CODE fi +ERR_CODE=0 + +# Global variables set during device setup +dev_id="" +fio_args="" +fio_err="" + +_setup_device() { + _create_backfile 0 256M + _create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) + + local integrity_params="--integrity_capable --integrity_reftag + --metadata_size 64 --pi_offset 56 --csum_type t10dif" + dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") + _check_add_dev "$TID" $? + + # 1M * (64 integrity bytes / 512 data bytes) = 128K + fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 + --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG + --filename /dev/ublkb$dev_id" + + fio_err=$(mktemp "${UBLK_TEST_DIR}"/fio_err_XXXXX) +} + +_test_fill_and_verify() { + fio --name fill --rw randwrite $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio fill failed" + ERR_CODE=255 + return 1 + fi + + fio --name verify --rw randread $fio_args > /dev/null + if [ $? != 0 ]; then + echo "fio verify failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_reftag() { + local dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" + local expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" + + # Overwrite 4-byte reftag at offset 56 + 4 = 60 + dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? 
!= 0 ]; then + echo "dd corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_reftag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_reftag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi + + # Reset to 0 + dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args + if [ $? != 0 ]; then + echo "dd restore corrupted_reftag failed" + ERR_CODE=255 + return 1 + fi +} + +_test_corrupted_data() { + local dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" + local expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" + + dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args + if [ $? != 0 ]; then + echo "dd corrupted_data failed" + ERR_CODE=255 + return 1 + fi + + if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then + echo "fio corrupted_data unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! grep -q "$expected_err" "$fio_err"; then + echo "fio corrupted_data message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} + +_test_bad_apptag() { + local expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" + + if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then + echo "fio bad_apptag unexpectedly succeeded" + ERR_CODE=255 + return 1 + fi + + if ! 
grep -q "$expected_err" "$fio_err"; then + echo "fio bad_apptag message not found: $expected_err" + ERR_CODE=255 + return 1 + fi +} _prep_test "loop" "end-to-end integrity" -_create_backfile 0 256M -_create_backfile 1 32M # 256M * (64 integrity bytes / 512 data bytes) -integrity_params="--integrity_capable --integrity_reftag - --metadata_size 64 --pi_offset 56 --csum_type t10dif" -dev_id=$(_add_ublk_dev -t loop -u $integrity_params "${UBLK_BACKFILES[@]}") -_check_add_dev $TID $? +_setup_device -# 1M * (64 integrity bytes / 512 data bytes) = 128K -fio_args="--ioengine io_uring --direct 1 --bsrange 512-1M --iodepth 32 - --md_per_io_size 128K --pi_act 0 --pi_chk GUARD,REFTAG,APPTAG - --filename /dev/ublkb$dev_id" -fio --name fill --rw randwrite $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio fill failed" - _show_result $TID $err -fi - -fio --name verify --rw randread $fio_args > /dev/null -err=$? -if [ $err != 0 ]; then - echo "fio verify failed" - _show_result $TID $err -fi - -fio_err=$(mktemp fio_err_XXXXX) - -# Overwrite 4-byte reftag at offset 56 + 4 = 60 -dd_reftag_args="bs=1 seek=60 count=4 oflag=dsync conv=notrunc status=none" -dd if=/dev/urandom "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_reftag --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_reftag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="REFTAG compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_reftag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi -# Reset to 0 -dd if=/dev/zero "of=${UBLK_BACKFILES[1]}" $dd_reftag_args -err=$? 
-if [ $err != 0 ]; then - echo "dd restore corrupted_reftag failed" - rm -f "$fio_err" - _show_result $TID $err -fi - -dd_data_args="bs=512 count=1 oflag=direct,dsync conv=notrunc status=none" -dd if=/dev/zero "of=${UBLK_BACKFILES[0]}" $dd_data_args -err=$? -if [ $err != 0 ]; then - echo "dd corrupted_data failed" - rm -f "$fio_err" - _show_result $TID $err -fi -if fio --name corrupted_data --rw randread $fio_args > /dev/null 2> "$fio_err"; then - echo "fio corrupted_data unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="Guard compare error: LBA: 0 Expected=0, Actual=" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio corrupted_data message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi - -if fio --name bad_apptag --rw randread $fio_args --apptag 0x4321 > /dev/null 2> "$fio_err"; then - echo "fio bad_apptag unexpectedly succeeded" - rm -f "$fio_err" - _show_result $TID 255 -fi -expected_err="APPTAG compare error: LBA: [0-9]* Expected=4321, Actual=1234" -if ! grep -q "$expected_err" "$fio_err"; then - echo "fio bad_apptag message not found: $expected_err" - rm -f "$fio_err" - _show_result $TID 255 -fi +_test_fill_and_verify && \ +_test_corrupted_reftag && \ +_test_corrupted_data && \ +_test_bad_apptag rm -f "$fio_err" _cleanup_test -_show_result $TID 0 +_show_result "$TID" $ERR_CODE From 92734a4f3a7a5449b0c7d0160ba658a2b665c31b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:34 +0800 Subject: [PATCH 131/162] selftests: ublk: add _ublk_del_dev helper function Add _ublk_del_dev() to delete a specific ublk device by ID and use it in all test scripts instead of calling UBLK_PROG directly. Also remove unused _remove_ublk_devices() function. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 +++++++------ tools/testing/selftests/ublk/test_generic_16.sh | 4 ++-- tools/testing/selftests/ublk/test_null_04.sh | 8 ++++---- tools/testing/selftests/ublk/test_part_02.sh | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 8d298a7ee7b1..0f1fdb0892b4 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -106,11 +106,6 @@ _check_root() { fi } -_remove_ublk_devices() { - ${UBLK_PROG} del -a - modprobe -r ublk_drv > /dev/null 2>&1 -} - _get_ublk_dev_state() { ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' } @@ -277,10 +272,16 @@ __ublk_kill_daemon() echo "$state" } -__remove_ublk_dev_return() { +_ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + _ublk_del_dev "${dev_id}" local res=$? udevadm settle return ${res} diff --git a/tools/testing/selftests/ublk/test_generic_16.sh b/tools/testing/selftests/ublk/test_generic_16.sh index 42e8d2e16ec9..3ef367836ac5 100755 --- a/tools/testing/selftests/ublk/test_generic_16.sh +++ b/tools/testing/selftests/ublk/test_generic_16.sh @@ -24,7 +24,7 @@ if ! 
${UBLK_PROG} stop -n "${dev_id}" --safe; then fi # Clean up device -${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 +_ublk_del_dev "${dev_id}" > /dev/null 2>&1 udevadm settle # Test 2: stop --safe on device with active opener should fail @@ -49,7 +49,7 @@ kill $dd_pid 2>/dev/null wait $dd_pid 2>/dev/null # Now device should be idle, regular delete should work -${UBLK_PROG} del -n "${dev_id}" +_ublk_del_dev "${dev_id}" udevadm settle _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_null_04.sh index a5599d38583a..6713b280a6ff 100755 --- a/tools/testing/selftests/ublk/test_null_04.sh +++ b/tools/testing/selftests/ublk/test_null_04.sh @@ -34,7 +34,7 @@ _test_metadata_only() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_capable_ip() { @@ -53,7 +53,7 @@ _test_integrity_capable_ip() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_integrity_reftag_t10dif() { @@ -72,7 +72,7 @@ _test_integrity_reftag_t10dif() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 0 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _test_nvme_csum() { @@ -91,7 +91,7 @@ _test_nvme_csum() { "$(cat "/sys/block/ublkb$dev_id/integrity/protection_interval_bytes")" 512 && _check_value "tag_size" "$(cat "/sys/block/ublkb$dev_id/integrity/tag_size")" 8 - ${UBLK_PROG} del -n "${dev_id}" + _ublk_del_dev "${dev_id}" } _prep_test "null" "integrity params" diff --git a/tools/testing/selftests/ublk/test_part_02.sh 
b/tools/testing/selftests/ublk/test_part_02.sh index 727d0f4610d6..acd098deda3a 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -46,13 +46,13 @@ _test_partition_scan_no_hang() if [ "$state" != "${expected_state}" ]; then echo "FAIL: Device state is $state, expected ${expected_state}" ERR_CODE=255 - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 return fi echo "PASS: Device transitioned to ${expected_state} in ${elapsed}s without hanging" # Clean up the device - ${UBLK_PROG} del -n "${dev_id}" > /dev/null 2>&1 + _ublk_del_dev "${dev_id}" > /dev/null 2>&1 } _prep_test "partition_scan" "verify async partition scan prevents IO hang" From 2021e6109de3e97adfce262c40a657ff206ef495 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:35 +0800 Subject: [PATCH 132/162] selftests: ublk: track created devices for per-test cleanup Track device IDs in UBLK_DEVS array when created. Update _cleanup_test() to only delete devices created by this test instead of using 'del -a' which removes all devices. This prepares for running tests concurrently where each test should only clean up its own devices. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 0f1fdb0892b4..422882c32490 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -164,7 +164,12 @@ _check_add_dev() } _cleanup_test() { - "${UBLK_PROG}" del -a + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + while read -r dev_id; do + ${UBLK_PROG} del -n "${dev_id}" + done < "${UBLK_TEST_DIR}/.ublk_devs" + rm -f "${UBLK_TEST_DIR}/.ublk_devs" + fi _remove_files rmdir ${UBLK_TEST_DIR} @@ -205,6 +210,7 @@ _create_ublk_dev() { fi if [[ "$dev_id" =~ ^[0-9]+$ ]]; then + echo "$dev_id" >> "${UBLK_TEST_DIR}/.ublk_devs" echo "${dev_id}" else return 255 @@ -276,6 +282,11 @@ _ublk_del_dev() { local dev_id=$1 ${UBLK_PROG} del -n "${dev_id}" + + # Remove from tracking file + if [ -f "${UBLK_TEST_DIR}/.ublk_devs" ]; then + sed -i "/^${dev_id}$/d" "${UBLK_TEST_DIR}/.ublk_devs" + fi } __remove_ublk_dev_return() { From b6bbc3bec19efd557f888d78865b627b80b37a32 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:36 +0800 Subject: [PATCH 133/162] selftests: ublk: add group-based test targets Add convenient Makefile targets for running specific test groups: - run_generic, run_batch, run_null, run_loop, run_stripe, run_stress, etc. - run_all for running all tests Test groups are auto-detected from TEST_PROGS using pattern matching (test__.sh -> group), and targets are generated dynamically using define/eval templates. 
Supports parallel execution via JOBS variable: - JOBS=1 (default): sequential with kselftest TAP output - JOBS>1: parallel execution with xargs -P Usage examples: make run_null # Sequential execution make run_stress JOBS=4 # Parallel with 4 jobs make run_all JOBS=8 # Run all tests with 8 parallel jobs With JOBS=8, running time of `make run_all` is reduced to 2m2s from 6m5s in my test VM. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 36 +++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index ca8588ed962c..37e012d3a8a7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -72,3 +72,39 @@ $(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c)) check: shellcheck -x -f gcc *.sh + +# Test groups for running subsets of tests +# JOBS=1 (default): sequential with kselftest TAP output +# JOBS>1: parallel execution with xargs -P +# Usage: make run_null JOBS=4 +JOBS ?= 1 + +# Auto-detect test groups from TEST_PROGS (test__.sh -> group) +TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ + sed 's/test_\([^_]*\)_.*/\1/' | sort -u) + +# Template for group test targets +# $(1) = group name (e.g., null, generic, stress) +define RUN_GROUP +run_$(1): all + @if [ $$(JOBS) -gt 1 ]; then \ + echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \ + xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \ + else \ + $$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \ + fi +.PHONY: run_$(1) +endef + +# Generate targets for each discovered test group +$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group)))) + +# Run all tests (parallel when JOBS>1) +run_all: all + @if [ $(JOBS) -gt 1 ]; then \ + echo $(TEST_PROGS) | tr ' ' '\n' | \ + xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \ + else \ + $(call RUN_TESTS, $(TEST_PROGS)); \ + fi +.PHONY: run_all From 
64406dd2f69fe27921c7bf06088871c002cf6186 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:37 +0800 Subject: [PATCH 134/162] selftests: ublk: add _ublk_sleep helper for parallel execution Add _ublk_sleep() helper function that uses different sleep times depending on whether tests run in parallel or sequential mode. Usage: _ublk_sleep Export JOBS variable from Makefile so test scripts can detect parallel execution, and use _ublk_sleep in test_part_02.sh to handle the partition scan delay (1s normal, 5s parallel). Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 10 ++++++++++ tools/testing/selftests/ublk/test_part_02.sh | 2 +- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 37e012d3a8a7..1ceae611acb7 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -78,6 +78,7 @@ check: # JOBS>1: parallel execution with xargs -P # Usage: make run_null JOBS=4 JOBS ?= 1 +export JOBS # Auto-detect test groups from TEST_PROGS (test__.sh -> group) TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \ diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 422882c32490..bd27a6875c1a 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -15,6 +15,16 @@ _have_program() { return 1 } +# Sleep with awareness of parallel execution. 
+# Usage: _ublk_sleep +_ublk_sleep() { + if [ "${JOBS:-1}" -gt 1 ]; then + sleep "$2" + else + sleep "$1" + fi +} + _get_disk_dev_t() { local dev_id=$1 local dev diff --git a/tools/testing/selftests/ublk/test_part_02.sh b/tools/testing/selftests/ublk/test_part_02.sh index acd098deda3a..7d42ab4d6e83 100755 --- a/tools/testing/selftests/ublk/test_part_02.sh +++ b/tools/testing/selftests/ublk/test_part_02.sh @@ -33,7 +33,7 @@ _test_partition_scan_no_hang() # The add command should return quickly because partition scan is async. # Now sleep briefly to let the async partition scan work start and hit # the delay in the fault_inject handler. - sleep 1 + _ublk_sleep 1 5 # Kill the ublk daemon while partition scan is potentially blocked # And check state transitions properly From 56a08b87f9f2a763cb5546f83b78ebe1e96260af Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:38 +0800 Subject: [PATCH 135/162] selftests: ublk: increase timeouts for parallel test execution When running tests in parallel with high JOBS count (e.g., JOBS=64), the existing timeouts can be insufficient due to system load: - Increase state wait loops from 20/50 to 100 iterations in _recover_ublk_dev(), __ublk_quiesce_dev(), and __ublk_kill_daemon() to handle slower state transitions under heavy load - Add --timeout=20 to udevadm settle calls to prevent indefinite hangs when udev event queue is overwhelmed by rapid device creation/deletion Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index bd27a6875c1a..c3afd00783a2 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -216,7 +216,7 @@ _create_ublk_dev() { fi if [ "$settle" = "yes" ]; then - udevadm settle + udevadm settle --timeout=20 fi if [[ "$dev_id" =~ ^[0-9]+$ 
]]; then @@ -240,7 +240,7 @@ _recover_ublk_dev() { local state dev_id=$(_create_ublk_dev "recover" "yes" "$@") - for ((j=0;j<20;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "LIVE" ] && break sleep 1 @@ -260,7 +260,7 @@ __ublk_quiesce_dev() return "$state" fi - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do state=$(_get_ublk_dev_state "${dev_id}") [ "$state" == "$exp_state" ] && break sleep 1 @@ -279,7 +279,7 @@ __ublk_kill_daemon() daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") state=$(_get_ublk_dev_state "${dev_id}") - for ((j=0;j<50;j++)); do + for ((j=0;j<100;j++)); do [ "$state" == "$exp_state" ] && break kill -9 "$daemon_pid" > /dev/null 2>&1 sleep 1 @@ -304,7 +304,7 @@ __remove_ublk_dev_return() { _ublk_del_dev "${dev_id}" local res=$? - udevadm settle + udevadm settle --timeout=20 return ${res} } From d9a36ab302b1c90d8f03a3b13538b8676eb6ed3b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:39 +0800 Subject: [PATCH 136/162] selftests: ublk: reorganize tests into integrity and recover groups Move integrity-focused tests into new 'integrity' group: - test_null_04.sh -> test_integrity_01.sh - test_loop_08.sh -> test_integrity_02.sh Move recovery-focused tests into new 'recover' group: - test_generic_04.sh -> test_recover_01.sh - test_generic_05.sh -> test_recover_02.sh - test_generic_11.sh -> test_recover_03.sh - test_generic_14.sh -> test_recover_04.sh Update Makefile to reflect the reorganization. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 14 ++++++++------ .../ublk/{test_null_04.sh => test_integrity_01.sh} | 0 .../ublk/{test_loop_08.sh => test_integrity_02.sh} | 0 .../{test_generic_04.sh => test_recover_01.sh} | 0 .../{test_generic_05.sh => test_recover_02.sh} | 0 .../{test_generic_11.sh => test_recover_03.sh} | 0 .../{test_generic_14.sh => test_recover_04.sh} | 0 7 files changed, 8 insertions(+), 6 deletions(-) rename tools/testing/selftests/ublk/{test_null_04.sh => test_integrity_01.sh} (100%) rename tools/testing/selftests/ublk/{test_loop_08.sh => test_integrity_02.sh} (100%) rename tools/testing/selftests/ublk/{test_generic_04.sh => test_recover_01.sh} (100%) rename tools/testing/selftests/ublk/{test_generic_05.sh => test_recover_02.sh} (100%) rename tools/testing/selftests/ublk/{test_generic_11.sh => test_recover_03.sh} (100%) rename tools/testing/selftests/ublk/{test_generic_14.sh => test_recover_04.sh} (100%) diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 1ceae611acb7..a62a06e13006 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -10,18 +10,14 @@ LDLIBS += -lpthread -lm -luring TEST_PROGS := test_generic_01.sh TEST_PROGS += test_generic_02.sh TEST_PROGS += test_generic_03.sh -TEST_PROGS += test_generic_04.sh -TEST_PROGS += test_generic_05.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh TEST_PROGS += test_generic_08.sh TEST_PROGS += test_generic_09.sh TEST_PROGS += test_generic_10.sh -TEST_PROGS += test_generic_11.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh -TEST_PROGS += test_generic_14.sh TEST_PROGS += test_generic_16.sh TEST_PROGS += test_batch_01.sh @@ -31,7 +27,6 @@ TEST_PROGS += test_batch_03.sh TEST_PROGS += test_null_01.sh TEST_PROGS += test_null_02.sh TEST_PROGS += test_null_03.sh -TEST_PROGS += test_null_04.sh TEST_PROGS += test_loop_01.sh 
TEST_PROGS += test_loop_02.sh TEST_PROGS += test_loop_03.sh @@ -39,7 +34,14 @@ TEST_PROGS += test_loop_04.sh TEST_PROGS += test_loop_05.sh TEST_PROGS += test_loop_06.sh TEST_PROGS += test_loop_07.sh -TEST_PROGS += test_loop_08.sh + +TEST_PROGS += test_integrity_01.sh +TEST_PROGS += test_integrity_02.sh + +TEST_PROGS += test_recover_01.sh +TEST_PROGS += test_recover_02.sh +TEST_PROGS += test_recover_03.sh +TEST_PROGS += test_recover_04.sh TEST_PROGS += test_stripe_01.sh TEST_PROGS += test_stripe_02.sh TEST_PROGS += test_stripe_03.sh diff --git a/tools/testing/selftests/ublk/test_null_04.sh b/tools/testing/selftests/ublk/test_integrity_01.sh similarity index 100% rename from tools/testing/selftests/ublk/test_null_04.sh rename to tools/testing/selftests/ublk/test_integrity_01.sh diff --git a/tools/testing/selftests/ublk/test_loop_08.sh b/tools/testing/selftests/ublk/test_integrity_02.sh similarity index 100% rename from tools/testing/selftests/ublk/test_loop_08.sh rename to tools/testing/selftests/ublk/test_integrity_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_04.sh b/tools/testing/selftests/ublk/test_recover_01.sh similarity index 100% rename from tools/testing/selftests/ublk/test_generic_04.sh rename to tools/testing/selftests/ublk/test_recover_01.sh diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_recover_02.sh similarity index 100% rename from tools/testing/selftests/ublk/test_generic_05.sh rename to tools/testing/selftests/ublk/test_recover_02.sh diff --git a/tools/testing/selftests/ublk/test_generic_11.sh b/tools/testing/selftests/ublk/test_recover_03.sh similarity index 100% rename from tools/testing/selftests/ublk/test_generic_11.sh rename to tools/testing/selftests/ublk/test_recover_03.sh diff --git a/tools/testing/selftests/ublk/test_generic_14.sh b/tools/testing/selftests/ublk/test_recover_04.sh similarity index 100% rename from tools/testing/selftests/ublk/test_generic_14.sh rename to 
tools/testing/selftests/ublk/test_recover_04.sh From 5314d25afbc44d0449fa2519d2c9d7f3c319f74c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 00:23:40 +0800 Subject: [PATCH 137/162] selftests: ublk: improve I/O ordering test with bpftrace Remove test_generic_01.sh since block layer may reorder I/O, making the test prone to false positives. Apply the improvements to test_generic_02.sh instead, which supposes for covering ublk dispatch io order. Rework test_generic_02 to verify that ublk dispatch doesn't reorder I/O by comparing request start order with completion order using bpftrace. The bpftrace script now: - Tracks each request's start sequence number in a map keyed by sector - On completion, verifies the request's start order matches expected completion order - Reports any out-of-order completions detected The test script: - Wait bpftrace BEGIN code block is run - Pins fio to CPU 0 for deterministic behavior - Uses block_io_start and block_rq_complete tracepoints - Checks bpftrace output for reordering errors Reported-and-tested-by: Alexander Atanasov Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 3 +- .../testing/selftests/ublk/test_generic_01.sh | 47 ------------------ .../testing/selftests/ublk/test_generic_02.sh | 22 ++++++--- tools/testing/selftests/ublk/trace/seq_io.bt | 49 +++++++++++++++---- 4 files changed, 55 insertions(+), 66 deletions(-) delete mode 100755 tools/testing/selftests/ublk/test_generic_01.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index a62a06e13006..8ac2d4a682a1 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -7,8 +7,7 @@ endif LDLIBS += -lpthread -lm -luring -TEST_PROGS := test_generic_01.sh -TEST_PROGS += test_generic_02.sh +TEST_PROGS := test_generic_02.sh TEST_PROGS += test_generic_03.sh TEST_PROGS += test_generic_06.sh TEST_PROGS += test_generic_07.sh diff --git 
a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh deleted file mode 100755 index 26cf3c7ceeb5..000000000000 --- a/tools/testing/selftests/ublk/test_generic_01.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh - -ERR_CODE=0 - -if ! _have_program bpftrace; then - exit "$UBLK_SKIP_CODE" -fi - -if ! _have_program fio; then - exit "$UBLK_SKIP_CODE" -fi - -_prep_test "null" "sequential io order" - -dev_id=$(_add_ublk_dev -t null) -_check_add_dev $TID $? - -dev_t=$(_get_disk_dev_t "$dev_id") -bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & -btrace_pid=$! -sleep 2 - -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then - _cleanup_test "null" - exit "$UBLK_SKIP_CODE" -fi - -# run fio over this ublk disk -fio --name=write_seq \ - --filename=/dev/ublkb"${dev_id}" \ - --ioengine=libaio --iodepth=16 \ - --rw=write \ - --size=512M \ - --direct=1 \ - --bs=4k > /dev/null 2>&1 -ERR_CODE=$? -kill "$btrace_pid" -wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" - ERR_CODE=255 -fi -_cleanup_test "null" -_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_generic_02.sh b/tools/testing/selftests/ublk/test_generic_02.sh index 1d4b1d6e059c..46b657143fd6 100755 --- a/tools/testing/selftests/ublk/test_generic_02.sh +++ b/tools/testing/selftests/ublk/test_generic_02.sh @@ -13,7 +13,7 @@ if ! _have_program fio; then exit "$UBLK_SKIP_CODE" fi -_prep_test "null" "sequential io order for MQ" +_prep_test "null" "ublk dispatch won't reorder IO for MQ" dev_id=$(_add_ublk_dev -t null -q 2) _check_add_dev $TID $? @@ -21,15 +21,20 @@ _check_add_dev $TID $? dev_t=$(_get_disk_dev_t "$dev_id") bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & btrace_pid=$! -sleep 2 -if ! 
kill -0 "$btrace_pid" > /dev/null 2>&1; then +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) +for _ in $(seq 100); do + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break + sleep 0.1 +done + +if ! kill -0 "$btrace_pid" 2>/dev/null; then _cleanup_test "null" exit "$UBLK_SKIP_CODE" fi -# run fio over this ublk disk -fio --name=write_seq \ +# run fio over this ublk disk (pinned to CPU 0) +taskset -c 0 fio --name=write_seq \ --filename=/dev/ublkb"${dev_id}" \ --ioengine=libaio --iodepth=16 \ --rw=write \ @@ -39,8 +44,11 @@ fio --name=write_seq \ ERR_CODE=$? kill "$btrace_pid" wait -if grep -q "io_out_of_order" "$UBLK_TMP"; then - cat "$UBLK_TMP" + +# Check for out-of-order completions detected by bpftrace +if grep -q "^out_of_order:" "$UBLK_TMP"; then + echo "I/O reordering detected:" + grep "^out_of_order:" "$UBLK_TMP" ERR_CODE=255 fi _cleanup_test "null" diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt index b2f60a92b118..9d36ba35468f 100644 --- a/tools/testing/selftests/ublk/trace/seq_io.bt +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -2,23 +2,52 @@ $1: dev_t $2: RWBS $3: strlen($2) + + Track request order between block_io_start and block_rq_complete. + Sequence starts at 1 so 0 means "never seen". On first valid + completion, sync complete_seq to handle probe attachment races. + block_rq_complete listed first to reduce missed completion window. 
*/ + BEGIN { - @last_rw[$1, str($2)] = (uint64)0; + @start_seq = (uint64)1; + @complete_seq = (uint64)0; + @out_of_order = (uint64)0; + @start_order[0] = (uint64)0; + delete(@start_order[0]); + printf("BPFTRACE_READY\n"); } + tracepoint:block:block_rq_complete +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ { - $dev = $1; - if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { - $last = @last_rw[$dev, str($2)]; - if ((uint64)args.sector != $last) { - printf("io_out_of_order: exp %llu actual %llu\n", - args.sector, $last); + $expected = @start_order[args.sector]; + if ($expected > 0) { + if (@complete_seq == 0) { + @complete_seq = $expected; } - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + if ($expected != @complete_seq) { + printf("out_of_order: sector %llu started at seq %llu but completed at seq %llu\n", + args.sector, $expected, @complete_seq); + @out_of_order = @out_of_order + 1; + } + delete(@start_order[args.sector]); + @complete_seq = @complete_seq + 1; } } -END { - clear(@last_rw); +tracepoint:block:block_io_start +/(int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)/ +{ + @start_order[args.sector] = @start_seq; + @start_seq = @start_seq + 1; +} + +END { + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", + @start_seq - 1, @complete_seq, @out_of_order); + clear(@start_order); + clear(@start_seq); + clear(@complete_seq); + clear(@out_of_order); } From 6abc7d5dcf0ee0f85e16e41c87fbd06231f28753 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Mon, 26 Jan 2026 07:15:33 +0000 Subject: [PATCH 138/162] md/raid1: fix memory leak in raid1_run() raid1_run() calls setup_conf() which registers a thread via md_register_thread(). If raid1_set_limits() fails, the previously registered thread is not unregistered, resulting in a memory leak of the md_thread structure and the thread resource itself. 
Add md_unregister_thread() to the error path to properly cleanup the thread, which aligns with the error handling logic of other paths in this function. Compile tested only. Issue found using a prototype static analysis tool and code review. Link: https://lore.kernel.org/linux-raid/20260126071533.606263-1-zilin@seu.edu.cn Fixes: 97894f7d3c29 ("md/raid1: use the atomic queue limit update APIs") Signed-off-by: Zilin Guan Reviewed-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 79faec11b79e..867db18bc3ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3236,6 +3236,7 @@ static int raid1_run(struct mddev *mddev) if (!mddev_is_dm(mddev)) { ret = raid1_set_limits(mddev); if (ret) { + md_unregister_thread(mddev, &conf->thread); if (!mddev->private) raid1_free(mddev, conf); return ret; From 05c8de4f09b08e97c6ecb190dcec0e68b167cb03 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Tue, 27 Jan 2026 15:39:27 +0800 Subject: [PATCH 139/162] md: fix return value of mddev_trylock A return value of 0 is treated as successful lock acquisition. In fact, a return value of 1 means getting the lock successfully.
Link: https://lore.kernel.org/linux-raid/20260127073951.17248-1-xni@redhat.com Fixes: 9e59d609763f ("md: call del_gendisk in control path") Reported-by: Bart Van Assche Closes: https://lore.kernel.org/linux-raid/20250611073108.25463-1-xni@redhat.com/T/#mfa369ef5faa4aa58e13e6d9fdb88aecd862b8f2f Signed-off-by: Xiao Ni Reviewed-by: Bart Van Assche Reviewed-by: Li Nan Signed-off-by: Yu Kuai --- drivers/md/md.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index e6d3d88698ed..ac84289664cd 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -735,8 +735,8 @@ static inline int mddev_trylock(struct mddev *mddev) int ret; ret = mutex_trylock(&mddev->reconfig_mutex); - if (!ret && test_bit(MD_DELETED, &mddev->flags)) { - ret = -ENODEV; + if (ret && test_bit(MD_DELETED, &mddev->flags)) { + ret = 0; mutex_unlock(&mddev->reconfig_mutex); } return ret; From b36844f7d11e1f322d1e48a828c5bfb4a0ecabd1 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 2 Feb 2026 16:32:03 +0800 Subject: [PATCH 140/162] MAINTAINERS: Add Li Nan as md/raid reviewer I've long contributed to and reviewed the md/raid subsystem. I've fixed many bugs and done code refactors, with dozens of patches merged. I now volunteer to work as a reviewer for this subsystem. 
Link: https://lore.kernel.org/linux-raid/20260202083203.3017096-1-linan666@huaweicloud.com Signed-off-by: Li Nan Signed-off-by: Yu Kuai --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 12f49de7fe03..d10045d16b64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24239,6 +24239,7 @@ F: include/linux/property.h SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu M: Yu Kuai +R: Li Nan L: linux-raid@vger.kernel.org S: Supported Q: https://patchwork.kernel.org/project/linux-raid/list/ From 06564bae93d024e346c49304dfb4e2aaa68cf620 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 30 Jan 2026 17:33:52 +0100 Subject: [PATCH 141/162] sed-opal: ignore locking ranges array when not enabling SUM. The locking ranges count and the array items are always ignored unless Single User Mode (SUM) is requested in the activate method. It is useless to enforce limits of an unused array in the non-SUM case.
Reviewed-by: Ming Lei Reviewed-by: Nilay Shroff Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 39 ++---------------------------------- block/blk-wbt.c | 50 ++++++++++++++++++++++++++++++++++++++++++++--- block/blk-wbt.h | 7 ++----- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e0a70d26972b..a580688c3ad5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -636,11 +636,8 @@ out: static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, size_t count) { - struct request_queue *q = disk->queue; - struct rq_qos *rqos; ssize_t ret; s64 val; - unsigned int memflags; ret = queue_var_store64(&val, page); if (ret < 0) @@ -648,40 +645,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page, if (val < -1) return -EINVAL; - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. - */ - memflags = blk_mq_freeze_queue(q); - - rqos = wbt_rq_qos(q); - if (!rqos) { - ret = wbt_init(disk); - if (ret) - goto out; - } - - ret = count; - if (val == -1) - val = wbt_default_latency_nsec(q); - else if (val >= 0) - val *= 1000ULL; - - if (wbt_get_min_lat(q) == val) - goto out; - - blk_mq_quiesce_queue(q); - - mutex_lock(&disk->rqos_state_mutex); - wbt_set_min_lat(q, val); - mutex_unlock(&disk->rqos_state_mutex); - - blk_mq_unquiesce_queue(q); -out: - blk_mq_unfreeze_queue(q, memflags); - - return ret; + ret = wbt_set_lat(disk, val); + return ret ? 
ret : count; } QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8e025834f2fb..0a37d97bda75 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -93,6 +93,8 @@ struct rq_wb { struct rq_depth rq_depth; }; +static int wbt_init(struct gendisk *disk); + static inline struct rq_wb *RQWB(struct rq_qos *rqos) { return container_of(rqos, struct rq_wb, rqos); @@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q) return RQWB(rqos)->min_lat_nsec; } -void wbt_set_min_lat(struct request_queue *q, u64 val) +static void wbt_set_min_lat(struct request_queue *q, u64 val) { struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) @@ -741,7 +743,7 @@ void wbt_init_enable_default(struct gendisk *disk) WARN_ON_ONCE(wbt_init(disk)); } -u64 wbt_default_latency_nsec(struct request_queue *q) +static u64 wbt_default_latency_nsec(struct request_queue *q) { /* * We default to 2msec for non-rotational storage, and 75msec @@ -901,7 +903,7 @@ static const struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct gendisk *disk) +static int wbt_init(struct gendisk *disk) { struct request_queue *q = disk->queue; struct rq_wb *rwb; @@ -948,3 +950,45 @@ err_free: return ret; } + +int wbt_set_lat(struct gendisk *disk, s64 val) +{ + struct request_queue *q = disk->queue; + unsigned int memflags; + struct rq_qos *rqos; + int ret = 0; + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. 
+ */ + memflags = blk_mq_freeze_queue(q); + + rqos = wbt_rq_qos(q); + if (!rqos) { + ret = wbt_init(disk); + if (ret) + goto out; + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + goto out; + + blk_mq_quiesce_queue(q); + + mutex_lock(&disk->rqos_state_mutex); + wbt_set_min_lat(q, val); + mutex_unlock(&disk->rqos_state_mutex); + + blk_mq_unquiesce_queue(q); +out: + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 925f22475738..6e39da17218b 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -4,16 +4,13 @@ #ifdef CONFIG_BLK_WBT -int wbt_init(struct gendisk *disk); void wbt_init_enable_default(struct gendisk *disk); void wbt_disable_default(struct gendisk *disk); void wbt_enable_default(struct gendisk *disk); u64 wbt_get_min_lat(struct request_queue *q); -void wbt_set_min_lat(struct request_queue *q, u64 val); -bool wbt_disabled(struct request_queue *); - -u64 wbt_default_latency_nsec(struct request_queue *); +bool wbt_disabled(struct request_queue *q); +int wbt_set_lat(struct gendisk *disk, s64 val); #else From 41afaeeda5099d9cd07eaa7dc6c3d20c6f1dd9e9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:17 +0800 Subject: [PATCH 143/162] blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter If wbt is disabled by default and user configures wbt by sysfs, queue will be frozen first and then pcpu_alloc_mutex will be held in blk_stat_alloc_callback(). Fix this problem by allocating memory first before queue frozen. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-wbt.c | 108 ++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 45 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0a37d97bda75..665760274e60 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -93,7 +93,7 @@ struct rq_wb { struct rq_depth rq_depth; }; -static int wbt_init(struct gendisk *disk); +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb); static inline struct rq_wb *RQWB(struct rq_qos *rqos) { @@ -698,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } +static int wbt_data_dir(const struct request *rq) +{ + const enum req_op op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op_is_write(op)) + return WRITE; + + /* don't account */ + return -1; +} + +static struct rq_wb *wbt_alloc(void) +{ + struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + + if (!rwb) + return NULL; + + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return NULL; + } + + return rwb; +} + +static void wbt_free(struct rq_wb *rwb) +{ + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + /* * Enable wbt if defaults are configured that way */ @@ -739,8 +774,17 @@ EXPORT_SYMBOL_GPL(wbt_enable_default); void wbt_init_enable_default(struct gendisk *disk) { - if (__wbt_enable_default(disk)) - WARN_ON_ONCE(wbt_init(disk)); + struct rq_wb *rwb; + + if (!__wbt_enable_default(disk)) + return; + + rwb = wbt_alloc(); + if (WARN_ON_ONCE(!rwb)) + return; + + if (WARN_ON_ONCE(wbt_init(disk, rwb))) + wbt_free(rwb); } static u64 wbt_default_latency_nsec(struct request_queue *q) @@ -754,19 +798,6 @@ static u64 wbt_default_latency_nsec(struct request_queue *q) return 2000000ULL; } -static int wbt_data_dir(const struct request *rq) -{ - const enum req_op op = req_op(rq); - - if (op == REQ_OP_READ) - return READ; - else if 
(op_is_write(op)) - return WRITE; - - /* don't account */ - return -1; -} - static void wbt_queue_depth_changed(struct rq_qos *rqos) { RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); @@ -778,8 +809,7 @@ static void wbt_exit(struct rq_qos *rqos) struct rq_wb *rwb = RQWB(rqos); blk_stat_remove_callback(rqos->disk->queue, rwb->cb); - blk_stat_free_callback(rwb->cb); - kfree(rwb); + wbt_free(rwb); } /* @@ -903,22 +933,11 @@ static const struct rq_qos_ops wbt_rqos_ops = { #endif }; -static int wbt_init(struct gendisk *disk) +static int wbt_init(struct gendisk *disk, struct rq_wb *rwb) { struct request_queue *q = disk->queue; - struct rq_wb *rwb; - int i; int ret; - - rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); - if (!rwb) - return -ENOMEM; - - rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); - if (!rwb->cb) { - kfree(rwb); - return -ENOMEM; - } + int i; for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); @@ -938,38 +957,38 @@ static int wbt_init(struct gendisk *disk) ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); mutex_unlock(&q->rq_qos_mutex); if (ret) - goto err_free; + return ret; blk_stat_add_callback(q, rwb->cb); - return 0; - -err_free: - blk_stat_free_callback(rwb->cb); - kfree(rwb); - return ret; - } int wbt_set_lat(struct gendisk *disk, s64 val) { struct request_queue *q = disk->queue; + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb = NULL; unsigned int memflags; - struct rq_qos *rqos; int ret = 0; + if (!rqos) { + rwb = wbt_alloc(); + if (!rwb) + return -ENOMEM; + } + /* * Ensure that the queue is idled, in case the latency update * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. 
*/ memflags = blk_mq_freeze_queue(q); - - rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(disk); - if (ret) + ret = wbt_init(disk, rwb); + if (ret) { + wbt_free(rwb); goto out; + } } if (val == -1) @@ -989,6 +1008,5 @@ int wbt_set_lat(struct gendisk *disk, s64 val) blk_mq_unquiesce_queue(q); out: blk_mq_unfreeze_queue(q, memflags); - return ret; } From 3f0bea9f3b9e7d9bdc3794103575168ef007d309 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:18 +0800 Subject: [PATCH 144/162] blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos There is already a helper blk_mq_debugfs_register_rqos() to register one rqos, however this helper is called synchronously when the rqos is created with queue frozen. Prepare to fix possible deadlock to create blk-mq debugfs entries while queue is still frozen. Reviewed-by: Ming Lei Reviewed-by: Nilay Shroff Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 23 +++++++++++++++-------- block/blk-mq-debugfs.h | 5 +++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4896525b1c05..4fe164b6d648 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -631,14 +631,7 @@ void blk_mq_debugfs_register(struct request_queue *q) blk_mq_debugfs_register_hctx(q, hctx); } - if (q->rq_qos) { - struct rq_qos *rqos = q->rq_qos; - - while (rqos) { - blk_mq_debugfs_register_rqos(rqos); - rqos = rqos->next; - } - } + blk_mq_debugfs_register_rq_qos(q); } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, @@ -769,6 +762,20 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } +void blk_mq_debugfs_register_rq_qos(struct request_queue *q) +{ + lockdep_assert_held(&q->debugfs_mutex); + + if (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + + while (rqos) { + blk_mq_debugfs_register_rqos(rqos); + rqos 
= rqos->next; + } + } +} + void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index c80e453e3014..54948a266889 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -33,6 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_register_rq_qos(struct request_queue *q); void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); #else @@ -78,6 +79,10 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { } +static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q) +{ +} + static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { } From 3c17a346ffc613615f48c6f1ed30cdf328bab805 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:19 +0800 Subject: [PATCH 145/162] blk-rq-qos: fix possible debugfs_mutex deadlock Currently rq-qos debugfs entries are created from rq_qos_add(), while rq_qos_add() can be called while queue is still frozen. This can deadlock because creating new entries can trigger fs reclaim. Fix this problem by delaying creating rq-qos debugfs entries after queue is unfrozen. - For wbt, 1) it can be initialized by default, fix it by calling new helper after wbt_init() from wbt_init_enable_default(); 2) it can be initialized by sysfs, fix it by calling new helper after queue is unfrozen from wbt_set_lat(). - For iocost and iolatency, they can only be initialized by blkcg configuration, however, they don't have debugfs entries for now, hence they are not handled yet. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 7 ------- block/blk-wbt.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 654478dfbc20..d7ce99ce2e80 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); - - if (rqos->ops->debugfs_attrs) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); - } - return 0; ebusy: blk_mq_unfreeze_queue(q, memflags); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 665760274e60..1415f2bf8611 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -774,6 +774,7 @@ EXPORT_SYMBOL_GPL(wbt_enable_default); void wbt_init_enable_default(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_wb *rwb; if (!__wbt_enable_default(disk)) @@ -783,8 +784,14 @@ void wbt_init_enable_default(struct gendisk *disk) if (WARN_ON_ONCE(!rwb)) return; - if (WARN_ON_ONCE(wbt_init(disk, rwb))) + if (WARN_ON_ONCE(wbt_init(disk, rwb))) { wbt_free(rwb); + return; + } + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rq_qos(q); + mutex_unlock(&q->debugfs_mutex); } static u64 wbt_default_latency_nsec(struct request_queue *q) @@ -1008,5 +1015,9 @@ int wbt_set_lat(struct gendisk *disk, s64 val) blk_mq_unquiesce_queue(q); out: blk_mq_unfreeze_queue(q, memflags); + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rq_qos(q); + mutex_unlock(&q->debugfs_mutex); + return ret; } From 70bafa5e31ff979c4c38ac9838cc960a32c04f49 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:20 +0800 Subject: [PATCH 146/162] blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static Because it's only used inside 
blk-mq-debugfs.c now. Reviewed-by: Nilay Shroff Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq-debugfs.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4fe164b6d648..11f00a868541 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -744,7 +744,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) rqos->debugfs_dir = NULL; } -void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 54948a266889..d94daa66556b 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_register_rq_qos(struct request_queue *q); -void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); #else static inline void blk_mq_debugfs_register(struct request_queue *q) @@ -75,10 +74,6 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc { } -static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) -{ -} - static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q) { } From 5ae4b12ee6422a816efca4ede8411e4d5503b5ac Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:21 +0800 Subject: [PATCH 147/162] blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos() Because this helper is only used by iocost and iolatency, while they don't have debugfs entries. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 10 ---------- block/blk-mq-debugfs.h | 4 ---- block/blk-rq-qos.c | 4 ---- 3 files changed, 18 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 11f00a868541..22c182b40bc3 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -734,16 +734,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "unknown"; } -void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) -{ - lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); - - if (!rqos->disk->queue->debugfs_dir) - return; - debugfs_remove_recursive(rqos->debugfs_dir); - rqos->debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { struct request_queue *q = rqos->disk->queue; diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index d94daa66556b..49bb1aaa83dc 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -34,7 +34,6 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_register_rq_qos(struct request_queue *q); -void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); #else static inline void blk_mq_debugfs_register(struct request_queue *q) { @@ -78,9 +77,6 @@ static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) -{ -} #endif #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d7ce99ce2e80..85cf74402a09 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -371,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos) if (!q->rq_qos) blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q); blk_mq_unfreeze_queue(q, memflags); - - mutex_lock(&q->debugfs_mutex); - 
blk_mq_debugfs_unregister_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); } From 9d20fd6ce1ba9733cd5ac96fcab32faa9fc404dd Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:22 +0800 Subject: [PATCH 148/162] blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs() In blk_mq_update_nr_hw_queues(), debugfs_mutex is not held while creating debugfs entries for hctxs. Hence add debugfs_mutex there, it's safe because queue is not frozen. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 22c182b40bc3..5c7cadf51a88 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -679,8 +679,10 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; + mutex_lock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) From 65d466b6298470ce21ab21ebfdb51309d515737d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 2 Feb 2026 16:05:23 +0800 Subject: [PATCH 149/162] blk-mq-debugfs: warn about possible deadlock Creating new debugfs entries can trigger fs reclaim, hence we can't do this with queue frozen, meanwhile, other locks that can be held while queue is frozen should not be held as well. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5c7cadf51a88..faeaa1fc86a7 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -608,9 +608,23 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {}, }; -static void debugfs_create_files(struct dentry *parent, void *data, +static void debugfs_create_files(struct request_queue *q, struct dentry *parent, + void *data, const struct blk_mq_debugfs_attr *attr) { + lockdep_assert_held(&q->debugfs_mutex); + /* + * Creating new debugfs entries with queue freezed has the risk of + * deadlock. + */ + WARN_ON_ONCE(q->mq_freeze_depth != 0); + /* + * debugfs_mutex should not be nested under other locks that can be + * grabbed while queue is frozen. + */ + lockdep_assert_not_held(&q->elevator_lock); + lockdep_assert_not_held(&q->rq_qos_mutex); + if (IS_ERR_OR_NULL(parent)) return; @@ -624,7 +638,7 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned long i; - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); + debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir) @@ -643,7 +657,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, snprintf(name, sizeof(name), "cpu%u", ctx->cpu); ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); - debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); + debugfs_create_files(hctx->queue, ctx_dir, ctx, + blk_mq_debugfs_ctx_attrs); } void blk_mq_debugfs_register_hctx(struct request_queue *q, @@ -659,7 +674,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = 
debugfs_create_dir(name, q->debugfs_dir); - debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); + debugfs_create_files(q, hctx->debugfs_dir, hctx, + blk_mq_debugfs_hctx_attrs); hctx_for_each_ctx(hctx, ctx, i) blk_mq_debugfs_register_ctx(hctx, ctx); @@ -712,7 +728,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); - debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs); + debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs); } void blk_mq_debugfs_unregister_sched(struct request_queue *q) @@ -751,7 +767,8 @@ static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) q->debugfs_dir); rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); - debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); + debugfs_create_files(q, rqos->debugfs_dir, rqos, + rqos->ops->debugfs_attrs); } void blk_mq_debugfs_register_rq_qos(struct request_queue *q) @@ -788,7 +805,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, hctx->sched_debugfs_dir = debugfs_create_dir("sched", hctx->debugfs_dir); - debugfs_create_files(hctx->sched_debugfs_dir, hctx, + debugfs_create_files(q, hctx->sched_debugfs_dir, hctx, e->hctx_debugfs_attrs); } From ee4784a83fb21a2d16ebfdf8877fa6f6a1129150 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 3 Feb 2026 12:35:28 +0100 Subject: [PATCH 150/162] block: don't use strcpy to copy blockdev name 0-day bot flagged the use of strcpy() in blk_trace_setup(), because the source buffer can theoretically be bigger than the destination buffer. While none of the current callers pass a string bigger than BLKTRACE_BDEV_SIZE, use strscpy() to prevent eventual future misuse and silence the checker warnings. 
Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202602020718.GUEIRyG9-lkp@intel.com/ Fixes: 113cbd62824a ("blktrace: pass blk_user_trace2 to setup functions") Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d031c8d80be4..c4db5c2e7103 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -793,7 +793,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return PTR_ERR(bt); } blk_trace_setup_finalize(q, name, 1, bt, &buts2); - strcpy(buts.name, buts2.name); + strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE); mutex_unlock(&q->debugfs_mutex); if (copy_to_user(arg, &buts, sizeof(buts))) { From 9fc7900b14727d39457bd3724f26e6e3faca3efd Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:42 +0800 Subject: [PATCH 151/162] block: convert nr_requests to unsigned int This value represents the number of requests for elevator tags, or drivers tags if elevator is none. The max value for elevator tags is 2048, and in drivers at most 16 bits is used for tag. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2ae4c45e4959..67d8d9e03abc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -550,7 +550,7 @@ struct request_queue { /* * queue settings */ - unsigned long nr_requests; /* Max # of requests */ + unsigned int nr_requests; /* Max # of requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; From 1db61b0afdd7e8aa9289c423fdff002603b520b5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:43 +0800 Subject: [PATCH 152/162] blk-mq-sched: unify elevators checking for async requests bfq and mq-deadline consider sync writes as async requests and only reserve tags for sync reads by async_depth, however, kyber doesn't consider sync writes as async requests for now. Consider the case there are lots of dirty pages, and users use fsync to flush dirty pages. In this case sched_tags can be exhausted by sync writes and sync reads can get stuck waiting for a tag. Hence let kyber follow what mq-deadline and bfq did, and unify async requests checking for all elevators. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- block/blk-mq-sched.h | 5 +++++ block/kyber-iosched.c | 2 +- block/mq-deadline.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3ebdec40e758..44746f4c0b89 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -697,7 +697,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) unsigned int limit, act_idx; /* Sync reads have full depth available */ - if (op_is_sync(opf) && !op_is_write(opf)) + if (blk_mq_is_sync_read(opf)) limit = data->q->nr_requests; else limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 02c40a72e959..5678e15bd33c 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -137,4 +137,9 @@ static inline void blk_mq_set_min_shallow_depth(struct request_queue *q, depth); } +static inline bool blk_mq_is_sync_read(blk_opf_t opf) +{ + return op_is_sync(opf) && !op_is_write(opf); +} + #endif diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index c1b36ffd19ce..2b3f5b8959af 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -556,7 +556,7 @@ static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * We use the scheduler tags as per-hardware queue queueing tokens. * Async requests can be limited at this stage. 
*/ - if (!op_is_sync(opf)) { + if (!blk_mq_is_sync_read(opf)) { struct kyber_queue_data *kqd = data->q->elevator->elevator_data; data->shallow_depth = kqd->async_depth; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 3e3719093aec..29d00221fbea 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -495,7 +495,7 @@ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) struct deadline_data *dd = data->q->elevator->elevator_data; /* Do not throttle synchronous reads. */ - if (op_is_sync(opf) && !op_is_write(opf)) + if (blk_mq_is_sync_read(opf)) return; /* From cf02d7d41b064af3e2c3a3a1ea9042a5b565b0d8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:44 +0800 Subject: [PATCH 153/162] blk-mq: factor out a helper blk_mq_limit_depth() There are no functional changes, just make code cleaner. Signed-off-by: Yu Kuai Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq.c | 62 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index cf1daedbb39f..b7b272e856b8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -498,6 +498,42 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) return rq_list_pop(data->cached_rqs); } +static void blk_mq_limit_depth(struct blk_mq_alloc_data *data) +{ + struct elevator_mq_ops *ops; + + /* If no I/O scheduler has been configured, don't limit requests */ + if (!data->q->elevator) { + blk_mq_tag_busy(data->hctx); + return; + } + + /* + * All requests use scheduler tags when an I/O scheduler is + * enabled for the queue. + */ + data->rq_flags |= RQF_SCHED_TAGS; + + /* + * Flush/passthrough requests are special and go directly to the + * dispatch list, they are not subject to the async_depth limit. 
+ */ + if ((data->cmd_flags & REQ_OP_MASK) == REQ_OP_FLUSH || + blk_op_is_passthrough(data->cmd_flags)) + return; + + WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); + data->rq_flags |= RQF_USE_SCHED; + + /* + * By default, sync requests have no limit, and async requests are + * limited to async_depth. + */ + ops = &data->q->elevator->type->ops; + if (ops->limit_depth) + ops->limit_depth(data->cmd_flags, data); +} + static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; @@ -516,31 +552,7 @@ retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); - if (q->elevator) { - /* - * All requests use scheduler tags when an I/O scheduler is - * enabled for the queue. - */ - data->rq_flags |= RQF_SCHED_TAGS; - - /* - * Flush/passthrough requests are special and go directly to the - * dispatch list. - */ - if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && - !blk_op_is_passthrough(data->cmd_flags)) { - struct elevator_mq_ops *ops = &q->elevator->type->ops; - - WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); - - data->rq_flags |= RQF_USE_SCHED; - if (ops->limit_depth) - ops->limit_depth(data->cmd_flags, data); - } - } else { - blk_mq_tag_busy(data->hctx); - } - + blk_mq_limit_depth(data); if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; From f98afe4f31bb8b07fea318606c08030c2049587e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:45 +0800 Subject: [PATCH 154/162] blk-mq: add a new queue sysfs attribute async_depth Add a new field async_depth to request_queue and related APIs, this is currently not used, following patches will convert elevators to use this instead of internal async_depth. 
Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-mq.c | 6 ++++++ block/blk-sysfs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ block/elevator.c | 1 + include/linux/blkdev.h | 1 + 5 files changed, 51 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index d6732dc69dd9..474700ffaa1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -463,6 +463,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) fs_reclaim_release(GFP_KERNEL); q->nr_requests = BLKDEV_DEFAULT_RQ; + q->async_depth = BLKDEV_DEFAULT_RQ; return q; diff --git a/block/blk-mq.c b/block/blk-mq.c index b7b272e856b8..0ad3dd3329db 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4662,6 +4662,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; + q->async_depth = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_map_swqueue(q); @@ -5028,6 +5029,11 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q, q->elevator->et = et; } + /* + * Preserve relative value, both nr and async_depth are at most 16 bit + * value, no need to worry about overflow. 
+ */ + q->async_depth = max(q->async_depth * nr / q->nr_requests, 1); q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a580688c3ad5..003aa684e854 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -127,6 +127,46 @@ unlock: return ret; } +static ssize_t queue_async_depth_show(struct gendisk *disk, char *page) +{ + guard(mutex)(&disk->queue->elevator_lock); + + return queue_var_show(disk->queue->async_depth, page); +} + +static ssize_t +queue_async_depth_store(struct gendisk *disk, const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned int memflags; + unsigned long nr; + int ret; + + if (!queue_is_mq(q)) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + + if (nr == 0) + return -EINVAL; + + memflags = blk_mq_freeze_queue(q); + scoped_guard(mutex, &q->elevator_lock) { + if (q->elevator) { + q->async_depth = min(q->nr_requests, nr); + if (q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(q); + } else { + ret = -EINVAL; + } + } + blk_mq_unfreeze_queue(q, memflags); + + return ret; +} + static ssize_t queue_ra_show(struct gendisk *disk, char *page) { ssize_t ret; @@ -532,6 +572,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \ } QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_async_depth, "async_depth"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); @@ -719,6 +760,7 @@ static struct attribute *blk_mq_queue_attrs[] = { */ &elv_iosched_entry.attr, &queue_requests_entry.attr, + &queue_async_depth_entry.attr, #ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, #endif diff --git a/block/elevator.c b/block/elevator.c index a2f8b2251dc6..ebe2a1fcf011 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ 
-589,6 +589,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx) blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; + q->async_depth = q->tag_set->queue_depth; } blk_add_trace_msg(q, "elv switch: %s", ctx->name); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 67d8d9e03abc..99ef8cd7673c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -551,6 +551,7 @@ struct request_queue { * queue settings */ unsigned int nr_requests; /* Max # of requests */ + unsigned int async_depth; /* Max # of async requests */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; From 8cbe62f4d8c37b74947569c7b874848f39f09a22 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:46 +0800 Subject: [PATCH 155/162] kyber: convert to use request_queue->async_depth Instead of the internal async_depth, remove kqd->async_depth and related helpers. Note that elevator attribute async_depth is now removed, queue attribute with the same name is used instead. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 33 +++++----------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 2b3f5b8959af..b84163d1f851 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -47,9 +47,8 @@ enum { * asynchronous requests, we reserve 25% of requests for synchronous * operations. */ - KYBER_ASYNC_PERCENT = 75, + KYBER_DEFAULT_ASYNC_PERCENT = 75, }; - /* * Maximum device-wide depth for each scheduling domain. * @@ -157,9 +156,6 @@ struct kyber_queue_data { */ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; - /* Number of allowed async requests. */ - unsigned int async_depth; - struct kyber_cpu_latency __percpu *cpu_latency; /* Timer for stats aggregation and adjusting domain tokens. 
*/ @@ -401,10 +397,7 @@ err: static void kyber_depth_updated(struct request_queue *q) { - struct kyber_queue_data *kqd = q->elevator->elevator_data; - - kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U; - blk_mq_set_min_shallow_depth(q, kqd->async_depth); + blk_mq_set_min_shallow_depth(q, q->async_depth); } static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) @@ -414,6 +407,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + q->async_depth = q->nr_requests * KYBER_DEFAULT_ASYNC_PERCENT / 100; kyber_depth_updated(q); return 0; @@ -552,15 +546,8 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { - /* - * We use the scheduler tags as per-hardware queue queueing tokens. - * Async requests can be limited at this stage. - */ - if (!blk_mq_is_sync_read(opf)) { - struct kyber_queue_data *kqd = data->q->elevator->elevator_data; - - data->shallow_depth = kqd->async_depth; - } + if (!blk_mq_is_sync_read(opf)) + data->shallow_depth = data->q->async_depth; } static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, @@ -956,15 +943,6 @@ KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) #undef KYBER_DEBUGFS_DOMAIN_ATTRS -static int kyber_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct kyber_queue_data *kqd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", kqd->async_depth); - return 0; -} - static int kyber_cur_domain_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -990,7 +968,6 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { KYBER_QUEUE_DOMAIN_ATTRS(write), KYBER_QUEUE_DOMAIN_ATTRS(discard), KYBER_QUEUE_DOMAIN_ATTRS(other), - {"async_depth", 0400, kyber_async_depth_show}, {}, }; 
 #undef KYBER_QUEUE_DOMAIN_ATTRS

From 988bb1b9ededab9aed83df8c1f5be0232b71ded3 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Tue, 3 Feb 2026 16:19:47 +0800
Subject: [PATCH 156/162] mq-deadline: convert to use request_queue->async_depth

In downstream kernel, we test with mq-deadline with many fio workloads,
and we found a performance regression after commit 39823b47bbd4
("block/mq-deadline: Fix the tag reservation code") with following test:

[global]
rw=randread
direct=1
ramp_time=1
ioengine=libaio
iodepth=1024
numjobs=24
bs=1024k
group_reporting=1
runtime=60

[job1]
filename=/dev/sda

Root cause is that mq-deadline now supports configuring async_depth;
although the default value is nr_request, the minimal value is 1, hence
min_shallow_depth is set to 1, causing wake_batch to be 1. As a
consequence, sbitmap_queue will be woken up after each IO instead of
after 8 IOs.

In this test case, sda is HDD and max_sectors is 128k, hence each
submitted 1M io will be split into 8 sequential 128k requests; however,
because there are 24 jobs and total tags are exhausted, the 8 requests
are unlikely to be dispatched sequentially, and changing wake_batch to 1
makes this much worse. Accounting blktrace D stage, the percentage of
sequential io is decreased from 8% to 0.8%.

Fix this problem by converting to request_queue->async_depth, where
min_shallow_depth is set each time async_depth is updated.

Note that the elevator attribute async_depth is now removed; the queue
attribute with the same name is used instead.
Fixes: 39823b47bbd4 ("block/mq-deadline: Fix the tag reservation code") Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/mq-deadline.c | 39 +++++---------------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 29d00221fbea..95917a88976f 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -98,7 +98,6 @@ struct deadline_data { int fifo_batch; int writes_starved; int front_merges; - u32 async_depth; int prio_aging_expire; spinlock_t lock; @@ -486,32 +485,16 @@ unlock: return rq; } -/* - * Called by __blk_mq_alloc_request(). The shallow_depth value set by this - * function is used by __blk_mq_get_tag(). - */ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { - struct deadline_data *dd = data->q->elevator->elevator_data; - - /* Do not throttle synchronous reads. */ - if (blk_mq_is_sync_read(opf)) - return; - - /* - * Throttle asynchronous requests and writes such that these requests - * do not block the allocation of synchronous requests. - */ - data->shallow_depth = dd->async_depth; + if (!blk_mq_is_sync_read(opf)) + data->shallow_depth = data->q->async_depth; } -/* Called by blk_mq_update_nr_requests(). */ +/* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). 
*/ static void dd_depth_updated(struct request_queue *q) { - struct deadline_data *dd = q->elevator->elevator_data; - - dd->async_depth = q->nr_requests; - blk_mq_set_min_shallow_depth(q, 1); + blk_mq_set_min_shallow_depth(q, q->async_depth); } static void dd_exit_sched(struct elevator_queue *e) @@ -576,6 +559,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; + q->async_depth = q->nr_requests; dd_depth_updated(q); return 0; } @@ -763,7 +747,6 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -793,7 +776,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT @@ -807,7 +789,6 @@ static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), - DD_ATTR(async_depth), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL @@ -894,15 +875,6 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } -static int dd_async_depth_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - struct deadline_data *dd = q->elevator->elevator_data; - - seq_printf(m, "%u\n", dd->async_depth); - return 0; -} - 
static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; @@ -1002,7 +974,6 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, - {"async_depth", 0400, dd_async_depth_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, From 2110858c5178176d0d535b7762b20cb9c0d03146 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:48 +0800 Subject: [PATCH 157/162] block, bfq: convert to use request_queue->async_depth The default limits is unchanged, and user can configure async_depth now. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 44746f4c0b89..b180ce583951 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7112,39 +7112,29 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) static void bfq_depth_updated(struct request_queue *q) { struct bfq_data *bfqd = q->elevator->elevator_data; - unsigned int nr_requests = q->nr_requests; + unsigned int async_depth = q->async_depth; /* - * In-word depths if no bfq_queue is being weight-raised: - * leaving 25% of tags only for sync reads. 
+	 * By default:
+	 * - sync reads are not limited
+	 * If bfqq is not being weight-raised:
+	 * - sync writes are limited to 75%(async depth default value)
+	 * - async IO are limited to 50%
+	 * If bfqq is being weight-raised:
+	 * - sync writes are limited to ~37%
+	 * - async IO are limited to ~18%
	 *
-	 * In next formulas, right-shift the value
-	 * (1U<<bt->sb.shift), instead of computing directly
-	 * (1U<<(bt->sb.shift - something)), to be robust against
-	 * any possible value of bt->sb.shift, without having to
-	 * limit 'something'.
+	 * If request_queue->async_depth is updated by user, all limits are
+	 * updated relatively.
	 */
-	/* no more than 50% of tags for async I/O */
-	bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
-	/*
-	 * no more than 75% of tags for sync writes (25% extra tags
-	 * w.r.t. async I/O, to prevent async I/O from starving sync
-	 * writes)
-	 */
-	bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
+	bfqd->async_depths[0][1] = async_depth;
+	bfqd->async_depths[0][0] = max(async_depth * 2 / 3, 1U);
+	bfqd->async_depths[1][1] = max(async_depth >> 1, 1U);
+	bfqd->async_depths[1][0] = max(async_depth >> 2, 1U);
	/*
-	 * In-word depths in case some bfq_queue is being weight-
-	 * raised: leaving ~63% of tags for sync reads. This is the
-	 * highest percentage for which, in our tests, application
-	 * start-up times didn't suffer from any regression due to tag
-	 * shortage.
+ * Due to cgroup qos, the allowed request for bfqq might be 1 */ - /* no more than ~18% of tags for async I/O */ - bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U); - /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U); - blk_mq_set_min_shallow_depth(q, 1); } @@ -7365,6 +7355,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q); wbt_disable_default(q->disk); blk_stat_enable_accounting(q); + q->async_depth = (q->nr_requests * 3) >> 2; return 0; From 2c04718edcd5e1ac8fed9a0f8d0620e8bc94014d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 3 Feb 2026 16:19:49 +0800 Subject: [PATCH 158/162] blk-mq: add documentation for new queue attribute async_dpeth Explain the attribute and the default value in different case. Signed-off-by: Yu Kuai Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 34 ++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 0ed10aeff86b..aa1e94169666 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -609,6 +609,40 @@ Description: enabled, and whether tags are shared. +What: /sys/block//queue/async_depth +Date: August 2025 +Contact: linux-block@vger.kernel.org +Description: + [RW] Controls how many asynchronous requests may be allocated in the + block layer. The value is always capped at nr_requests. + + When no elevator is active (none): + - async_depth is always equal to nr_requests. + + For bfq scheduler: + - By default, async_depth is set to 75% of nr_requests. + Internal limits are then derived from this value: + * Sync writes: limited to async_depth (≈75% of nr_requests). + * Async I/O: limited to ~2/3 of async_depth (≈50% of nr_requests). 
+ + If a bfq_queue is weight-raised: + * Sync writes: limited to ~1/2 of async_depth (≈37% of nr_requests). + * Async I/O: limited to ~1/4 of async_depth (≈18% of nr_requests). + + - If the user writes a custom value to async_depth, BFQ will recompute + these limits proportionally based on the new value. + + For Kyber: + - By default async_depth is set to 75% of nr_requests. + - If the user writes a custom value to async_depth, then it override the + default and directly control the limit for writes and async I/O. + + For mq-deadline: + - By default async_depth is set to nr_requests. + - If the user writes a custom value to async_depth, then it override the + default and directly control the limit for writes and async I/O. + + What: /sys/block//queue/nr_zones Date: November 2018 Contact: Damien Le Moal From d4d78dd43cecaeaadb24e78927a4d6fda7d1f1d9 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 6 Jan 2026 10:42:57 +0800 Subject: [PATCH 159/162] block: remove redundant kill_bdev() call in set_blocksize() The second kill_bdev() call in set_blocksize() is redundant as the first call already clears all buffers and pagecache, and locks prevent new pagecache creation between the calls. 
Signed-off-by: Yang Xiuwei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bdev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index b8fbb9576110..ed022f8c48c7 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -208,7 +208,6 @@ int set_blocksize(struct file *file, int size) inode->i_blkbits = blksize_bits(size); mapping_set_folio_min_order(inode->i_mapping, get_order(size)); - kill_bdev(bdev); filemap_invalidate_unlock(inode->i_mapping); inode_unlock(inode); } From ee81212f74a57c5d2b56cf504f40d528dac6faaf Mon Sep 17 00:00:00 2001 From: Luke Wang Date: Wed, 4 Feb 2026 11:40:02 +0800 Subject: [PATCH 160/162] block: decouple secure erase size limit from discard size limit Secure erase should use max_secure_erase_sectors instead of being limited by max_discard_sectors. Separate the handling of REQ_OP_SECURE_ERASE from REQ_OP_DISCARD to allow each operation to use its own size limit. Signed-off-by: Luke Wang Reviewed-by: Ulf Hansson Signed-off-by: Jens Axboe --- block/blk-merge.c | 21 +++++++++++++++++---- block/blk.h | 6 +++++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b82c6d304658..c18bc440d647 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -158,8 +158,9 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors) return bio; } -struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, - unsigned *nsegs) +static struct bio *__bio_split_discard(struct bio *bio, + const struct queue_limits *lim, unsigned *nsegs, + unsigned int max_sectors) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -169,8 +170,7 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, granularity = max(lim->discard_granularity >> 9, 1U); - max_discard_sectors = - min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); + max_discard_sectors = min(max_sectors, bio_allowed_max_sectors(lim)); 
max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) return bio; @@ -194,6 +194,19 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, return bio_submit_split(bio, split_sectors); } +struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, + unsigned *nsegs) +{ + unsigned int max_sectors; + + if (bio_op(bio) == REQ_OP_SECURE_ERASE) + max_sectors = lim->max_secure_erase_sectors; + else + max_sectors = lim->max_discard_sectors; + + return __bio_split_discard(bio, lim, nsegs, max_sectors); +} + static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, bool is_atomic) { diff --git a/block/blk.h b/block/blk.h index 980eef1f5690..401d19ed08a6 100644 --- a/block/blk.h +++ b/block/blk.h @@ -208,10 +208,14 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq) struct request_queue *q = rq->q; enum req_op op = req_op(rq); - if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) + if (unlikely(op == REQ_OP_DISCARD)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); + if (unlikely(op == REQ_OP_SECURE_ERASE)) + return min(q->limits.max_secure_erase_sectors, + UINT_MAX >> SECTOR_SHIFT); + if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; From 5d3ae80b4dc43d1c49f5ab6e9835ae5fc9ac5d37 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 1 Feb 2026 11:10:44 +0800 Subject: [PATCH 161/162] selftests: ublk: organize test directories by test ID Set UBLK_TEST_DIR to ${TMPDIR:-./ublktest-dir}/${TID}.XXXXXX to create per-test subdirectories organized by test ID. This makes it easier to identify and debug specific test runs. 
Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index c3afd00783a2..163a40007910 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -129,7 +129,9 @@ _prep_test() { local type=$1 shift 1 modprobe ublk_drv > /dev/null 2>&1 - UBLK_TEST_DIR=$(mktemp -d ${TMPDIR:-.}/ublktest-dir.XXXXXX) + local base_dir=${TMPDIR:-./ublktest-dir} + mkdir -p "$base_dir" + UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg From 72f4d6fca699a1e35b39c5e5dacac2926d254135 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 4 Feb 2026 23:29:03 -0800 Subject: [PATCH 162/162] blk-mq: ABI/sysfs-block: fix docs build warnings Clean up the async_depth documentation: - insert blank lines before and after lists where needed - convert verb tense in a few places - make lines fit within 80 characters Documentation/ABI/stable/sysfs-block:612: ERROR: Unexpected indentation. [docutils] Documentation/ABI/stable/sysfs-block:612: ERROR: Unexpected indentation. [docutils] Documentation/ABI/stable/sysfs-block:612: ERROR: Unexpected indentation. 
[docutils] Fixes: 2c04718edcd5 ("blk-mq: add documentation for new queue attribute async_dpeth") Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 33 ++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index aa1e94169666..09a9d4aca0fd 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -613,34 +613,45 @@ What: /sys/block//queue/async_depth Date: August 2025 Contact: linux-block@vger.kernel.org Description: - [RW] Controls how many asynchronous requests may be allocated in the - block layer. The value is always capped at nr_requests. + [RW] Controls how many asynchronous requests may be allocated + in the block layer. The value is always capped at nr_requests. When no elevator is active (none): + - async_depth is always equal to nr_requests. For bfq scheduler: + - By default, async_depth is set to 75% of nr_requests. Internal limits are then derived from this value: + * Sync writes: limited to async_depth (≈75% of nr_requests). - * Async I/O: limited to ~2/3 of async_depth (≈50% of nr_requests). + * Async I/O: limited to ~2/3 of async_depth (≈50% of + nr_requests). If a bfq_queue is weight-raised: - * Sync writes: limited to ~1/2 of async_depth (≈37% of nr_requests). - * Async I/O: limited to ~1/4 of async_depth (≈18% of nr_requests). - - If the user writes a custom value to async_depth, BFQ will recompute - these limits proportionally based on the new value. + * Sync writes: limited to ~1/2 of async_depth (≈37% of + nr_requests). + * Async I/O: limited to ~1/4 of async_depth (≈18% of + nr_requests). + + - If the user writes a custom value to async_depth, BFQ will + recompute these limits proportionally based on the new value. For Kyber: + - By default async_depth is set to 75% of nr_requests. 
- - If the user writes a custom value to async_depth, then it override the - default and directly control the limit for writes and async I/O. + - If the user writes a custom value to async_depth, then it + overrides the default and directly controls the limit for + writes and async I/O. For mq-deadline: + - By default async_depth is set to nr_requests. - - If the user writes a custom value to async_depth, then it override the - default and directly control the limit for writes and async I/O. + - If the user writes a custom value to async_depth, then it + overrides the default and directly controls the limit for + writes and async I/O. What: /sys/block//queue/nr_zones