mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 01:24:47 +01:00
for-7.0/block-20260206
-----BEGIN PGP SIGNATURE-----
iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmmGLwcQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpv+TD/48S2HTnMhmW6AtFYWErQ+sEKXpHrxbYe7S
+qR8/g/T+QSfhfqPwZEuagndFKtIP3LJfaXGSP1Lk1RfP9NLQy91v33Ibe4DjHkp
etWSfnMHA9MUAoWKmg8EvncB2G+ZQFiYCpjazj5tKHD9S2+psGMuL8kq6qzMJE83
uhpb8WutUl4aSIXbMSfyGlwBhI1MjjRbbWlIBmg4yC8BWt1sH8Qn2L2GNVylEIcX
U8At3KLgPGn0axSg4yGMAwTqtGhL/jwdDyeczbmRlXuAr4iVL9UX/yADCYkazt6U
ttQ2/H+cxCwfES84COx9EteAatlbZxo6wjGvZ3xOMiMJVTjYe1x6Gkcckq+LrZX6
tjofi2KK78qkrMXk1mZMkZjpyUWgRtCswhDllbQyqFs0SwzQtno2//Rk8HU9dhbt
pkpryDbGFki9X3upcNyEYp5TYflpW6YhAzShYgmE6KXim2fV8SeFLviy0erKOAl+
fwjTE6KQ5QoQv0s3WxkWa4lREm34O6IHrCUmbiPm5CruJnQDhqAN2QZIDgYC4WAf
0gu9cR/O4Vxu7TQXrumPs5q+gCyDU0u0B8C3mG2s+rIo+PI5cVZKs2OIZ8HiPo0F
x73kR/pX3DMe35ZQkQX22ymMuowV+aQouDLY9DTwakP5acdcg7h7GZKABk6VLB06
gUIsnxURiQ==
=jNzW
-----END PGP SIGNATURE-----
Merge tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull block updates from Jens Axboe:
- Support for batch request processing for ublk, improving the
efficiency of the kernel/ublk server communication. This can yield
nice 7-12% performance improvements
- Support for integrity data for ublk
- Various other ublk improvements and additions, including a ton of
selftests additions and updates
- Move the handling of blk-crypto software fallback from below the
block layer to above it. This reduces the complexity of dealing with
bio splitting
- Series fixing a number of potential deadlocks in blk-mq related to
the queue usage counter and writeback throttling and rq-qos debugfs
handling
- Add an async_depth queue attribute, to resolve a performance
regression that's been around for a while related to the scheduler
depth handling
- Only use task_work for IOPOLL completions on NVMe, if it is necessary
to do so. An earlier fix for an issue resulted in all these
completions being punted to task_work, to guarantee that completions
were only run for a given io_uring ring when it was local to that
ring. With the new changes, we can detect if it's necessary to use
task_work or not, and avoid it if possible.
- rnbd fixes:
- Fix refcount underflow in device unmap path
- Handle PREFLUSH and NOUNMAP flags properly in protocol
- Fix server-side bi_size for special IOs
- Zero response buffer before use
- Fix trace format for flags
- Add .release to rnbd_dev_ktype
- MD pull requests via Yu Kuai
- Fix raid5_run() to return error when log_init() fails
- Fix IO hang with degraded array with llbitmap
- Fix percpu_ref not resurrected on suspend timeout in llbitmap
- Fix GPF in write_page caused by resize race
- Fix NULL pointer dereference in process_metadata_update
- Fix hang when stopping arrays with metadata through dm-raid
- Fix any_working flag handling in raid10_sync_request
- Refactor sync/recovery code path, improve error handling for
badblocks, and remove unused recovery_disabled field
- Consolidate mddev boolean fields into mddev_flags
- Use mempool to allocate stripe_request_ctx and make sure
max_sectors is not less than io_opt in raid5
- Fix return value of mddev_trylock
- Fix memory leak in raid1_run()
- Add Li Nan as mdraid reviewer
- Move phys_vec definitions to the kernel types, mostly in preparation
for some VFIO and RDMA changes
- Improve the speed for secure erase for some devices
- Various little rust updates
- Various other minor fixes, improvements, and cleanups
* tag 'for-7.0/block-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (162 commits)
blk-mq: ABI/sysfs-block: fix docs build warnings
selftests: ublk: organize test directories by test ID
block: decouple secure erase size limit from discard size limit
block: remove redundant kill_bdev() call in set_blocksize()
blk-mq: add documentation for new queue attribute async_dpeth
block, bfq: convert to use request_queue->async_depth
mq-deadline: covert to use request_queue->async_depth
kyber: covert to use request_queue->async_depth
blk-mq: add a new queue sysfs attribute async_depth
blk-mq: factor out a helper blk_mq_limit_depth()
blk-mq-sched: unify elevators checking for async requests
block: convert nr_requests to unsigned int
block: don't use strcpy to copy blockdev name
blk-mq-debugfs: warn about possible deadlock
blk-mq-debugfs: add missing debugfs_mutex in blk_mq_debugfs_register_hctxs()
blk-mq-debugfs: remove blk_mq_debugfs_unregister_rqos()
blk-mq-debugfs: make blk_mq_debugfs_register_rqos() static
blk-rq-qos: fix possible debugfs_mutex deadlock
blk-mq-debugfs: factor out a helper to register debugfs for all rq_qos
blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter
...
This commit is contained in:
commit
0c00ed308d
151 changed files with 5217 additions and 1583 deletions
|
|
@ -609,6 +609,51 @@ Description:
|
|||
enabled, and whether tags are shared.
|
||||
|
||||
|
||||
What: /sys/block/<disk>/queue/async_depth
|
||||
Date: August 2025
|
||||
Contact: linux-block@vger.kernel.org
|
||||
Description:
|
||||
[RW] Controls how many asynchronous requests may be allocated
|
||||
in the block layer. The value is always capped at nr_requests.
|
||||
|
||||
When no elevator is active (none):
|
||||
|
||||
- async_depth is always equal to nr_requests.
|
||||
|
||||
For bfq scheduler:
|
||||
|
||||
- By default, async_depth is set to 75% of nr_requests.
|
||||
Internal limits are then derived from this value:
|
||||
|
||||
* Sync writes: limited to async_depth (≈75% of nr_requests).
|
||||
* Async I/O: limited to ~2/3 of async_depth (≈50% of
|
||||
nr_requests).
|
||||
|
||||
If a bfq_queue is weight-raised:
|
||||
|
||||
* Sync writes: limited to ~1/2 of async_depth (≈37% of
|
||||
nr_requests).
|
||||
* Async I/O: limited to ~1/4 of async_depth (≈18% of
|
||||
nr_requests).
|
||||
|
||||
- If the user writes a custom value to async_depth, BFQ will
|
||||
recompute these limits proportionally based on the new value.
|
||||
|
||||
For Kyber:
|
||||
|
||||
- By default async_depth is set to 75% of nr_requests.
|
||||
- If the user writes a custom value to async_depth, then it
|
||||
overrides the default and directly controls the limit for
|
||||
writes and async I/O.
|
||||
|
||||
For mq-deadline:
|
||||
|
||||
- By default async_depth is set to nr_requests.
|
||||
- If the user writes a custom value to async_depth, then it
|
||||
overrides the default and directly controls the limit for
|
||||
writes and async I/O.
|
||||
|
||||
|
||||
What: /sys/block/<disk>/queue/nr_zones
|
||||
Date: November 2018
|
||||
Contact: Damien Le Moal <damien.lemoal@wdc.com>
|
||||
|
|
|
|||
|
|
@ -135,7 +135,6 @@ Usage of helpers:
|
|||
bio_first_bvec_all()
|
||||
bio_first_page_all()
|
||||
bio_first_folio_all()
|
||||
bio_last_bvec_all()
|
||||
|
||||
* The following helpers iterate over single-page segment. The passed 'struct
|
||||
bio_vec' will contain a single-page IO vector during the iteration::
|
||||
|
|
|
|||
|
|
@ -206,6 +206,12 @@ it to a bio, given the blk_crypto_key and the data unit number that will be used
|
|||
for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
|
||||
later, as that happens automatically when the bio is freed or reset.
|
||||
|
||||
To submit a bio that uses inline encryption, users must call
|
||||
``blk_crypto_submit_bio()`` instead of the usual ``submit_bio()``. This will
|
||||
submit the bio to the underlying driver if it supports inline crypto, or else
|
||||
call the blk-crypto fallback routines before submitting normal bios to the
|
||||
underlying drivers.
|
||||
|
||||
Finally, when done using inline encryption with a blk_crypto_key on a
|
||||
block_device, users must call ``blk_crypto_evict_key()``. This ensures that
|
||||
the key is evicted from all keyslots it may be programmed into and unlinked from
|
||||
|
|
|
|||
|
|
@ -260,9 +260,12 @@ The following IO commands are communicated via io_uring passthrough command,
|
|||
and each command is only for forwarding the IO and committing the result
|
||||
with specified IO tag in the command data:
|
||||
|
||||
- ``UBLK_IO_FETCH_REQ``
|
||||
Traditional Per-I/O Commands
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Sent from the server IO pthread for fetching future incoming IO requests
|
||||
- ``UBLK_U_IO_FETCH_REQ``
|
||||
|
||||
Sent from the server I/O pthread for fetching future incoming I/O requests
|
||||
destined to ``/dev/ublkb*``. This command is sent only once from the server
|
||||
IO pthread for ublk driver to setup IO forward environment.
|
||||
|
||||
|
|
@ -278,7 +281,7 @@ with specified IO tag in the command data:
|
|||
supported by the driver, daemons must be per-queue instead - i.e. all I/Os
|
||||
associated to a single qid must be handled by the same task.
|
||||
|
||||
- ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
- ``UBLK_U_IO_COMMIT_AND_FETCH_REQ``
|
||||
|
||||
When an IO request is destined to ``/dev/ublkb*``, the driver stores
|
||||
the IO's ``ublksrv_io_desc`` to the specified mapped area; then the
|
||||
|
|
@ -293,7 +296,7 @@ with specified IO tag in the command data:
|
|||
requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
is reused for both fetching request and committing back IO result.
|
||||
|
||||
- ``UBLK_IO_NEED_GET_DATA``
|
||||
- ``UBLK_U_IO_NEED_GET_DATA``
|
||||
|
||||
With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly
|
||||
issued to ublk server without data copy. Then, IO backend of ublk server
|
||||
|
|
@ -322,6 +325,59 @@ with specified IO tag in the command data:
|
|||
``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy
|
||||
the server buffer (pages) read to the IO request pages.
|
||||
|
||||
Batch I/O Commands (UBLK_F_BATCH_IO)
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``UBLK_F_BATCH_IO`` feature provides an alternative high-performance
|
||||
I/O handling model that replaces the traditional per-I/O commands with
|
||||
per-queue batch commands. This significantly reduces communication overhead
|
||||
and enables better load balancing across multiple server tasks.
|
||||
|
||||
Key differences from traditional mode:
|
||||
|
||||
- **Per-queue vs Per-I/O**: Commands operate on queues rather than individual I/Os
|
||||
- **Batch processing**: Multiple I/Os are handled in single operations
|
||||
- **Multishot commands**: Use io_uring multishot for reduced submission overhead
|
||||
- **Flexible task assignment**: Any task can handle any I/O (no per-I/O daemons)
|
||||
- **Better load balancing**: Tasks can adjust their workload dynamically
|
||||
|
||||
Batch I/O Commands:
|
||||
|
||||
- ``UBLK_U_IO_PREP_IO_CMDS``
|
||||
|
||||
Prepares multiple I/O commands in batch. The server provides a buffer
|
||||
containing multiple I/O descriptors that will be processed together.
|
||||
This reduces the number of individual command submissions required.
|
||||
|
||||
- ``UBLK_U_IO_COMMIT_IO_CMDS``
|
||||
|
||||
Commits results for multiple I/O operations in batch, and prepares the
|
||||
I/O descriptors to accept new requests. The server provides a buffer
|
||||
containing the results of multiple completed I/Os, allowing efficient
|
||||
bulk completion of requests.
|
||||
|
||||
- ``UBLK_U_IO_FETCH_IO_CMDS``
|
||||
|
||||
**Multishot command** for fetching I/O commands in batch. This is the key
|
||||
command that enables high-performance batch processing:
|
||||
|
||||
* Uses io_uring multishot capability for reduced submission overhead
|
||||
* Single command can fetch multiple I/O requests over time
|
||||
* Buffer size determines maximum batch size per operation
|
||||
* Multiple fetch commands can be submitted for load balancing
|
||||
* Only one fetch command is active at any time per queue
|
||||
* Supports dynamic load balancing across multiple server tasks
|
||||
|
||||
It is one typical multishot io_uring request with provided buffer, and it
|
||||
won't be completed until any failure is triggered.
|
||||
|
||||
Each task can submit ``UBLK_U_IO_FETCH_IO_CMDS`` with different buffer
|
||||
sizes to control how much work it handles. This enables sophisticated
|
||||
load balancing strategies in multi-threaded servers.
|
||||
|
||||
Migration: Applications using traditional commands (``UBLK_U_IO_FETCH_REQ``,
|
||||
``UBLK_U_IO_COMMIT_AND_FETCH_REQ``) cannot use batch mode simultaneously.
|
||||
|
||||
Zero copy
|
||||
---------
|
||||
|
||||
|
|
|
|||
|
|
@ -24276,6 +24276,7 @@ F: include/linux/property.h
|
|||
SOFTWARE RAID (Multiple Disks) SUPPORT
|
||||
M: Song Liu <song@kernel.org>
|
||||
M: Yu Kuai <yukuai@fnnas.com>
|
||||
R: Li Nan <linan122@huawei.com>
|
||||
L: linux-raid@vger.kernel.org
|
||||
S: Supported
|
||||
Q: https://patchwork.kernel.org/project/linux-raid/list/
|
||||
|
|
|
|||
|
|
@ -208,7 +208,6 @@ int set_blocksize(struct file *file, int size)
|
|||
|
||||
inode->i_blkbits = blksize_bits(size);
|
||||
mapping_set_folio_min_order(inode->i_mapping, get_order(size));
|
||||
kill_bdev(bdev);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
inode_unlock(inode);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -231,7 +231,7 @@ static struct kmem_cache *bfq_pool;
|
|||
#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \
|
||||
(get_sdist(last_pos, rq) > \
|
||||
BFQQ_SEEK_THR && \
|
||||
(!blk_queue_nonrot(bfqd->queue) || \
|
||||
(blk_queue_rot(bfqd->queue) || \
|
||||
blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT))
|
||||
#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
|
||||
#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19)
|
||||
|
|
@ -697,7 +697,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
|||
unsigned int limit, act_idx;
|
||||
|
||||
/* Sync reads have full depth available */
|
||||
if (op_is_sync(opf) && !op_is_write(opf))
|
||||
if (blk_mq_is_sync_read(opf))
|
||||
limit = data->q->nr_requests;
|
||||
else
|
||||
limit = bfqd->async_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
|
||||
|
|
@ -4165,7 +4165,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
|||
|
||||
/* don't use too short time intervals */
|
||||
if (delta_usecs < 1000) {
|
||||
if (blk_queue_nonrot(bfqd->queue))
|
||||
if (!blk_queue_rot(bfqd->queue))
|
||||
/*
|
||||
* give same worst-case guarantees as idling
|
||||
* for seeky
|
||||
|
|
@ -4487,7 +4487,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
|
|||
struct bfq_queue *bfqq)
|
||||
{
|
||||
bool rot_without_queueing =
|
||||
!blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag,
|
||||
blk_queue_rot(bfqd->queue) && !bfqd->hw_tag,
|
||||
bfqq_sequential_and_IO_bound,
|
||||
idling_boosts_thr;
|
||||
|
||||
|
|
@ -4521,7 +4521,7 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
|
|||
* flash-based device.
|
||||
*/
|
||||
idling_boosts_thr = rot_without_queueing ||
|
||||
((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) &&
|
||||
((blk_queue_rot(bfqd->queue) || !bfqd->hw_tag) &&
|
||||
bfqq_sequential_and_IO_bound);
|
||||
|
||||
/*
|
||||
|
|
@ -4722,7 +4722,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
|
|||
* there is only one in-flight large request
|
||||
* at a time.
|
||||
*/
|
||||
if (blk_queue_nonrot(bfqd->queue) &&
|
||||
if (!blk_queue_rot(bfqd->queue) &&
|
||||
blk_rq_sectors(bfqq->next_rq) >=
|
||||
BFQQ_SECT_THR_NONROT &&
|
||||
bfqd->tot_rq_in_driver >= 1)
|
||||
|
|
@ -6340,7 +6340,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
|
|||
bfqd->hw_tag_samples = 0;
|
||||
|
||||
bfqd->nonrot_with_queueing =
|
||||
blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag;
|
||||
!blk_queue_rot(bfqd->queue) && bfqd->hw_tag;
|
||||
}
|
||||
|
||||
static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
||||
|
|
@ -7112,39 +7112,29 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
|
|||
static void bfq_depth_updated(struct request_queue *q)
|
||||
{
|
||||
struct bfq_data *bfqd = q->elevator->elevator_data;
|
||||
unsigned int nr_requests = q->nr_requests;
|
||||
unsigned int async_depth = q->async_depth;
|
||||
|
||||
/*
|
||||
* In-word depths if no bfq_queue is being weight-raised:
|
||||
* leaving 25% of tags only for sync reads.
|
||||
* By default:
|
||||
* - sync reads are not limited
|
||||
* If bfqq is not being weight-raised:
|
||||
* - sync writes are limited to 75%(async depth default value)
|
||||
* - async IO are limited to 50%
|
||||
* If bfqq is being weight-raised:
|
||||
* - sync writes are limited to ~37%
|
||||
* - async IO are limited to ~18
|
||||
*
|
||||
* In next formulas, right-shift the value
|
||||
* (1U<<bt->sb.shift), instead of computing directly
|
||||
* (1U<<(bt->sb.shift - something)), to be robust against
|
||||
* any possible value of bt->sb.shift, without having to
|
||||
* limit 'something'.
|
||||
* If request_queue->async_depth is updated by user, all limit are
|
||||
* updated relatively.
|
||||
*/
|
||||
/* no more than 50% of tags for async I/O */
|
||||
bfqd->async_depths[0][0] = max(nr_requests >> 1, 1U);
|
||||
/*
|
||||
* no more than 75% of tags for sync writes (25% extra tags
|
||||
* w.r.t. async I/O, to prevent async I/O from starving sync
|
||||
* writes)
|
||||
*/
|
||||
bfqd->async_depths[0][1] = max((nr_requests * 3) >> 2, 1U);
|
||||
bfqd->async_depths[0][1] = async_depth;
|
||||
bfqd->async_depths[0][0] = max(async_depth * 2 / 3, 1U);
|
||||
bfqd->async_depths[1][1] = max(async_depth >> 1, 1U);
|
||||
bfqd->async_depths[1][0] = max(async_depth >> 2, 1U);
|
||||
|
||||
/*
|
||||
* In-word depths in case some bfq_queue is being weight-
|
||||
* raised: leaving ~63% of tags for sync reads. This is the
|
||||
* highest percentage for which, in our tests, application
|
||||
* start-up times didn't suffer from any regression due to tag
|
||||
* shortage.
|
||||
* Due to cgroup qos, the allowed request for bfqq might be 1
|
||||
*/
|
||||
/* no more than ~18% of tags for async I/O */
|
||||
bfqd->async_depths[1][0] = max((nr_requests * 3) >> 4, 1U);
|
||||
/* no more than ~37% of tags for sync writes (~20% extra tags) */
|
||||
bfqd->async_depths[1][1] = max((nr_requests * 6) >> 4, 1U);
|
||||
|
||||
blk_mq_set_min_shallow_depth(q, 1);
|
||||
}
|
||||
|
||||
|
|
@ -7293,7 +7283,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
|
|||
INIT_HLIST_HEAD(&bfqd->burst_list);
|
||||
|
||||
bfqd->hw_tag = -1;
|
||||
bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue);
|
||||
bfqd->nonrot_with_queueing = !blk_queue_rot(bfqd->queue);
|
||||
|
||||
bfqd->bfq_max_budget = bfq_default_max_budget;
|
||||
|
||||
|
|
@ -7328,9 +7318,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
|
|||
* Begin by assuming, optimistically, that the device peak
|
||||
* rate is equal to 2/3 of the highest reference rate.
|
||||
*/
|
||||
bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] *
|
||||
ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
|
||||
bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
|
||||
bfqd->rate_dur_prod = ref_rate[!blk_queue_rot(bfqd->queue)] *
|
||||
ref_wr_duration[!blk_queue_rot(bfqd->queue)];
|
||||
bfqd->peak_rate = ref_rate[!blk_queue_rot(bfqd->queue)] * 2 / 3;
|
||||
|
||||
/* see comments on the definition of next field inside bfq_data */
|
||||
bfqd->actuator_load_threshold = 4;
|
||||
|
|
@ -7365,6 +7355,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq)
|
|||
blk_queue_flag_set(QUEUE_FLAG_DISABLE_WBT_DEF, q);
|
||||
wbt_disable_default(q->disk);
|
||||
blk_stat_enable_accounting(q);
|
||||
q->async_depth = (q->nr_requests * 3) >> 2;
|
||||
|
||||
return 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -52,19 +52,7 @@ static bool bip_should_check(struct bio_integrity_payload *bip)
|
|||
|
||||
static bool bi_offload_capable(struct blk_integrity *bi)
|
||||
{
|
||||
switch (bi->csum_type) {
|
||||
case BLK_INTEGRITY_CSUM_CRC64:
|
||||
return bi->metadata_size == sizeof(struct crc64_pi_tuple);
|
||||
case BLK_INTEGRITY_CSUM_CRC:
|
||||
case BLK_INTEGRITY_CSUM_IP:
|
||||
return bi->metadata_size == sizeof(struct t10_pi_tuple);
|
||||
default:
|
||||
pr_warn_once("%s: unknown integrity checksum type:%d\n",
|
||||
__func__, bi->csum_type);
|
||||
fallthrough;
|
||||
case BLK_INTEGRITY_CSUM_NONE:
|
||||
return false;
|
||||
}
|
||||
return bi->metadata_size == bi->pi_tuple_size;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -301,9 +301,12 @@ EXPORT_SYMBOL(bio_init);
|
|||
*/
|
||||
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
|
||||
{
|
||||
struct bio_vec *bv = bio->bi_io_vec;
|
||||
|
||||
bio_uninit(bio);
|
||||
memset(bio, 0, BIO_RESET_BYTES);
|
||||
atomic_set(&bio->__bi_remaining, 1);
|
||||
bio->bi_io_vec = bv;
|
||||
bio->bi_bdev = bdev;
|
||||
if (bio->bi_bdev)
|
||||
bio_associate_blkg(bio);
|
||||
|
|
@ -1196,8 +1199,8 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
|
|||
{
|
||||
WARN_ON_ONCE(bio->bi_max_vecs);
|
||||
|
||||
bio->bi_vcnt = iter->nr_segs;
|
||||
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
|
||||
bio->bi_iter.bi_idx = 0;
|
||||
bio->bi_iter.bi_bvec_done = iter->iov_offset;
|
||||
bio->bi_iter.bi_size = iov_iter_count(iter);
|
||||
bio_set_flag(bio, BIO_CLONED);
|
||||
|
|
|
|||
|
|
@ -114,12 +114,12 @@ static const char *const blk_op_name[] = {
|
|||
#undef REQ_OP_NAME
|
||||
|
||||
/**
|
||||
* blk_op_str - Return string XXX in the REQ_OP_XXX.
|
||||
* @op: REQ_OP_XXX.
|
||||
* blk_op_str - Return the string "name" for an operation REQ_OP_name.
|
||||
* @op: a request operation.
|
||||
*
|
||||
* Description: Centralize block layer function to convert REQ_OP_XXX into
|
||||
* string format. Useful in the debugging and tracing bio or request. For
|
||||
* invalid REQ_OP_XXX it returns string "UNKNOWN".
|
||||
* Convert a request operation REQ_OP_name into the string "name". Useful for
|
||||
* debugging and tracing BIOs and requests. For an invalid request operation
|
||||
* code, the string "UNKNOWN" is returned.
|
||||
*/
|
||||
inline const char *blk_op_str(enum req_op op)
|
||||
{
|
||||
|
|
@ -463,6 +463,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
|
|||
fs_reclaim_release(GFP_KERNEL);
|
||||
|
||||
q->nr_requests = BLKDEV_DEFAULT_RQ;
|
||||
q->async_depth = BLKDEV_DEFAULT_RQ;
|
||||
|
||||
return q;
|
||||
|
||||
|
|
@ -628,9 +629,6 @@ static void __submit_bio(struct bio *bio)
|
|||
/* If plug is not used, add new plug here to cache nsecs time. */
|
||||
struct blk_plug plug;
|
||||
|
||||
if (unlikely(!blk_crypto_bio_prep(&bio)))
|
||||
return;
|
||||
|
||||
blk_start_plug(&plug);
|
||||
|
||||
if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
|
||||
|
|
@ -794,6 +792,13 @@ void submit_bio_noacct(struct bio *bio)
|
|||
if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
|
||||
goto not_supported;
|
||||
|
||||
if (bio_has_crypt_ctx(bio)) {
|
||||
if (WARN_ON_ONCE(!bio_has_data(bio)))
|
||||
goto end_io;
|
||||
if (!blk_crypto_supported(bio))
|
||||
goto not_supported;
|
||||
}
|
||||
|
||||
if (should_fail_bio(bio))
|
||||
goto end_io;
|
||||
bio_check_ro(bio);
|
||||
|
|
|
|||
|
|
@ -22,7 +22,7 @@
|
|||
#include "blk-cgroup.h"
|
||||
#include "blk-crypto-internal.h"
|
||||
|
||||
static unsigned int num_prealloc_bounce_pg = 32;
|
||||
static unsigned int num_prealloc_bounce_pg = BIO_MAX_VECS;
|
||||
module_param(num_prealloc_bounce_pg, uint, 0);
|
||||
MODULE_PARM_DESC(num_prealloc_bounce_pg,
|
||||
"Number of preallocated bounce pages for the blk-crypto crypto API fallback");
|
||||
|
|
@ -75,13 +75,13 @@ static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
|
|||
|
||||
static struct blk_crypto_fallback_keyslot {
|
||||
enum blk_crypto_mode_num crypto_mode;
|
||||
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
|
||||
struct crypto_sync_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
|
||||
} *blk_crypto_keyslots;
|
||||
|
||||
static struct blk_crypto_profile *blk_crypto_fallback_profile;
|
||||
static struct workqueue_struct *blk_crypto_wq;
|
||||
static mempool_t *blk_crypto_bounce_page_pool;
|
||||
static struct bio_set crypto_bio_split;
|
||||
static struct bio_set enc_bio_set;
|
||||
|
||||
/*
|
||||
* This is the key we set when evicting a keyslot. This *should* be the all 0's
|
||||
|
|
@ -98,7 +98,7 @@ static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
|
|||
WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID);
|
||||
|
||||
/* Clear the key in the skcipher */
|
||||
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key,
|
||||
err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], blank_key,
|
||||
blk_crypto_modes[crypto_mode].keysize);
|
||||
WARN_ON(err);
|
||||
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
|
||||
|
|
@ -119,7 +119,7 @@ blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
|
|||
blk_crypto_fallback_evict_keyslot(slot);
|
||||
|
||||
slotp->crypto_mode = crypto_mode;
|
||||
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes,
|
||||
err = crypto_sync_skcipher_setkey(slotp->tfms[crypto_mode], key->bytes,
|
||||
key->size);
|
||||
if (err) {
|
||||
blk_crypto_fallback_evict_keyslot(slot);
|
||||
|
|
@ -144,94 +144,84 @@ static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
|
|||
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
|
||||
{
|
||||
struct bio *src_bio = enc_bio->bi_private;
|
||||
int i;
|
||||
struct page **pages = (struct page **)enc_bio->bi_io_vec;
|
||||
struct bio_vec *bv;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < enc_bio->bi_vcnt; i++)
|
||||
mempool_free(enc_bio->bi_io_vec[i].bv_page,
|
||||
blk_crypto_bounce_page_pool);
|
||||
/*
|
||||
* Use the same trick as the alloc side to avoid the need for an extra
|
||||
* pages array.
|
||||
*/
|
||||
bio_for_each_bvec_all(bv, enc_bio, i)
|
||||
pages[i] = bv->bv_page;
|
||||
|
||||
src_bio->bi_status = enc_bio->bi_status;
|
||||
i = mempool_free_bulk(blk_crypto_bounce_page_pool, (void **)pages,
|
||||
enc_bio->bi_vcnt);
|
||||
if (i < enc_bio->bi_vcnt)
|
||||
release_pages(pages + i, enc_bio->bi_vcnt - i);
|
||||
|
||||
bio_uninit(enc_bio);
|
||||
kfree(enc_bio);
|
||||
if (enc_bio->bi_status)
|
||||
cmpxchg(&src_bio->bi_status, 0, enc_bio->bi_status);
|
||||
|
||||
bio_put(enc_bio);
|
||||
bio_endio(src_bio);
|
||||
}
|
||||
|
||||
static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
|
||||
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
|
||||
|
||||
static struct bio *blk_crypto_alloc_enc_bio(struct bio *bio_src,
|
||||
unsigned int nr_segs, struct page ***pages_ret)
|
||||
{
|
||||
unsigned int nr_segs = bio_segments(bio_src);
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
unsigned int memflags = memalloc_noio_save();
|
||||
unsigned int nr_allocated;
|
||||
struct page **pages;
|
||||
struct bio *bio;
|
||||
|
||||
bio = bio_kmalloc(nr_segs, GFP_NOIO);
|
||||
if (!bio)
|
||||
return NULL;
|
||||
bio_init_inline(bio, bio_src->bi_bdev, nr_segs, bio_src->bi_opf);
|
||||
bio = bio_alloc_bioset(bio_src->bi_bdev, nr_segs, bio_src->bi_opf,
|
||||
GFP_NOIO, &enc_bio_set);
|
||||
if (bio_flagged(bio_src, BIO_REMAPPED))
|
||||
bio_set_flag(bio, BIO_REMAPPED);
|
||||
bio->bi_private = bio_src;
|
||||
bio->bi_end_io = blk_crypto_fallback_encrypt_endio;
|
||||
bio->bi_ioprio = bio_src->bi_ioprio;
|
||||
bio->bi_write_hint = bio_src->bi_write_hint;
|
||||
bio->bi_write_stream = bio_src->bi_write_stream;
|
||||
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
|
||||
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
|
||||
|
||||
bio_for_each_segment(bv, bio_src, iter)
|
||||
bio->bi_io_vec[bio->bi_vcnt++] = bv;
|
||||
|
||||
bio_clone_blkg_association(bio, bio_src);
|
||||
|
||||
/*
|
||||
* Move page array up in the allocated memory for the bio vecs as far as
|
||||
* possible so that we can start filling biovecs from the beginning
|
||||
* without overwriting the temporary page array.
|
||||
*/
|
||||
static_assert(PAGE_PTRS_PER_BVEC > 1);
|
||||
pages = (struct page **)bio->bi_io_vec;
|
||||
pages += nr_segs * (PAGE_PTRS_PER_BVEC - 1);
|
||||
|
||||
/*
|
||||
* Try a bulk allocation first. This could leave random pages in the
|
||||
* array unallocated, but we'll fix that up later in mempool_alloc_bulk.
|
||||
*
|
||||
* Note: alloc_pages_bulk needs the array to be zeroed, as it assumes
|
||||
* any non-zero slot already contains a valid allocation.
|
||||
*/
|
||||
memset(pages, 0, sizeof(struct page *) * nr_segs);
|
||||
nr_allocated = alloc_pages_bulk(GFP_KERNEL, nr_segs, pages);
|
||||
if (nr_allocated < nr_segs)
|
||||
mempool_alloc_bulk(blk_crypto_bounce_page_pool, (void **)pages,
|
||||
nr_segs, nr_allocated);
|
||||
memalloc_noio_restore(memflags);
|
||||
*pages_ret = pages;
|
||||
return bio;
|
||||
}
|
||||
|
||||
static bool
|
||||
blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
|
||||
struct skcipher_request **ciph_req_ret,
|
||||
struct crypto_wait *wait)
|
||||
static struct crypto_sync_skcipher *
|
||||
blk_crypto_fallback_tfm(struct blk_crypto_keyslot *slot)
|
||||
{
|
||||
struct skcipher_request *ciph_req;
|
||||
const struct blk_crypto_fallback_keyslot *slotp;
|
||||
int keyslot_idx = blk_crypto_keyslot_index(slot);
|
||||
const struct blk_crypto_fallback_keyslot *slotp =
|
||||
&blk_crypto_keyslots[blk_crypto_keyslot_index(slot)];
|
||||
|
||||
slotp = &blk_crypto_keyslots[keyslot_idx];
|
||||
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
|
||||
GFP_NOIO);
|
||||
if (!ciph_req)
|
||||
return false;
|
||||
|
||||
skcipher_request_set_callback(ciph_req,
|
||||
CRYPTO_TFM_REQ_MAY_BACKLOG |
|
||||
CRYPTO_TFM_REQ_MAY_SLEEP,
|
||||
crypto_req_done, wait);
|
||||
*ciph_req_ret = ciph_req;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
|
||||
{
|
||||
struct bio *bio = *bio_ptr;
|
||||
unsigned int i = 0;
|
||||
unsigned int num_sectors = 0;
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
num_sectors += bv.bv_len >> SECTOR_SHIFT;
|
||||
if (++i == BIO_MAX_VECS)
|
||||
break;
|
||||
}
|
||||
|
||||
if (num_sectors < bio_sectors(bio)) {
|
||||
bio = bio_submit_split_bioset(bio, num_sectors,
|
||||
&crypto_bio_split);
|
||||
if (!bio)
|
||||
return false;
|
||||
|
||||
*bio_ptr = bio;
|
||||
}
|
||||
|
||||
return true;
|
||||
return slotp->tfms[slotp->crypto_mode];
|
||||
}
|
||||
|
||||
union blk_crypto_iv {
|
||||
|
|
@ -248,59 +238,23 @@ static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
|
|||
iv->dun[i] = cpu_to_le64(dun[i]);
|
||||
}
|
||||
|
||||
/*
|
||||
* The crypto API fallback's encryption routine.
|
||||
* Allocate a bounce bio for encryption, encrypt the input bio using crypto API,
|
||||
* and replace *bio_ptr with the bounce bio. May split input bio if it's too
|
||||
* large. Returns true on success. Returns false and sets bio->bi_status on
|
||||
* error.
|
||||
*/
|
||||
static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
|
||||
static void __blk_crypto_fallback_encrypt_bio(struct bio *src_bio,
|
||||
struct crypto_sync_skcipher *tfm)
|
||||
{
|
||||
struct bio *src_bio, *enc_bio;
|
||||
struct bio_crypt_ctx *bc;
|
||||
struct blk_crypto_keyslot *slot;
|
||||
int data_unit_size;
|
||||
struct skcipher_request *ciph_req = NULL;
|
||||
DECLARE_CRYPTO_WAIT(wait);
|
||||
struct bio_crypt_ctx *bc = src_bio->bi_crypt_context;
|
||||
int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
|
||||
SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm);
|
||||
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
|
||||
struct scatterlist src, dst;
|
||||
union blk_crypto_iv iv;
|
||||
unsigned int i, j;
|
||||
bool ret = false;
|
||||
blk_status_t blk_st;
|
||||
unsigned int nr_enc_pages, enc_idx;
|
||||
struct page **enc_pages;
|
||||
struct bio *enc_bio;
|
||||
unsigned int i;
|
||||
|
||||
/* Split the bio if it's too big for single page bvec */
|
||||
if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
|
||||
return false;
|
||||
|
||||
src_bio = *bio_ptr;
|
||||
bc = src_bio->bi_crypt_context;
|
||||
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
|
||||
|
||||
/* Allocate bounce bio for encryption */
|
||||
enc_bio = blk_crypto_fallback_clone_bio(src_bio);
|
||||
if (!enc_bio) {
|
||||
src_bio->bi_status = BLK_STS_RESOURCE;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
|
||||
* this bio's algorithm and key.
|
||||
*/
|
||||
blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
|
||||
bc->bc_key, &slot);
|
||||
if (blk_st != BLK_STS_OK) {
|
||||
src_bio->bi_status = blk_st;
|
||||
goto out_put_enc_bio;
|
||||
}
|
||||
|
||||
/* and then allocate an skcipher_request for it */
|
||||
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
|
||||
src_bio->bi_status = BLK_STS_RESOURCE;
|
||||
goto out_release_keyslot;
|
||||
}
|
||||
skcipher_request_set_callback(ciph_req,
|
||||
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
|
||||
NULL, NULL);
|
||||
|
||||
memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
|
||||
sg_init_table(&src, 1);
|
||||
|
|
@ -309,65 +263,159 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
|
|||
skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size,
|
||||
iv.bytes);
|
||||
|
||||
/* Encrypt each page in the bounce bio */
|
||||
for (i = 0; i < enc_bio->bi_vcnt; i++) {
|
||||
struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i];
|
||||
struct page *plaintext_page = enc_bvec->bv_page;
|
||||
struct page *ciphertext_page =
|
||||
mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO);
|
||||
/*
|
||||
* Encrypt each page in the source bio. Because the source bio could
|
||||
* have bio_vecs that span more than a single page, but the encrypted
|
||||
* bios are limited to a single page per bio_vec, this can generate
|
||||
* more than a single encrypted bio per source bio.
|
||||
*/
|
||||
new_bio:
|
||||
nr_enc_pages = min(bio_segments(src_bio), BIO_MAX_VECS);
|
||||
enc_bio = blk_crypto_alloc_enc_bio(src_bio, nr_enc_pages, &enc_pages);
|
||||
enc_idx = 0;
|
||||
for (;;) {
|
||||
struct bio_vec src_bv =
|
||||
bio_iter_iovec(src_bio, src_bio->bi_iter);
|
||||
struct page *enc_page = enc_pages[enc_idx];
|
||||
|
||||
enc_bvec->bv_page = ciphertext_page;
|
||||
|
||||
if (!ciphertext_page) {
|
||||
src_bio->bi_status = BLK_STS_RESOURCE;
|
||||
goto out_free_bounce_pages;
|
||||
if (!IS_ALIGNED(src_bv.bv_len | src_bv.bv_offset,
|
||||
data_unit_size)) {
|
||||
enc_bio->bi_status = BLK_STS_INVAL;
|
||||
goto out_free_enc_bio;
|
||||
}
|
||||
|
||||
sg_set_page(&src, plaintext_page, data_unit_size,
|
||||
enc_bvec->bv_offset);
|
||||
sg_set_page(&dst, ciphertext_page, data_unit_size,
|
||||
enc_bvec->bv_offset);
|
||||
__bio_add_page(enc_bio, enc_page, src_bv.bv_len,
|
||||
src_bv.bv_offset);
|
||||
|
||||
/* Encrypt each data unit in this page */
|
||||
for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) {
|
||||
sg_set_page(&src, src_bv.bv_page, data_unit_size,
|
||||
src_bv.bv_offset);
|
||||
sg_set_page(&dst, enc_page, data_unit_size, src_bv.bv_offset);
|
||||
|
||||
/*
|
||||
* Increment the index now that the encrypted page is added to
|
||||
* the bio. This is important for the error unwind path.
|
||||
*/
|
||||
enc_idx++;
|
||||
|
||||
/*
|
||||
* Encrypt each data unit in this page.
|
||||
*/
|
||||
for (i = 0; i < src_bv.bv_len; i += data_unit_size) {
|
||||
blk_crypto_dun_to_iv(curr_dun, &iv);
|
||||
if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req),
|
||||
&wait)) {
|
||||
i++;
|
||||
src_bio->bi_status = BLK_STS_IOERR;
|
||||
goto out_free_bounce_pages;
|
||||
if (crypto_skcipher_encrypt(ciph_req)) {
|
||||
enc_bio->bi_status = BLK_STS_IOERR;
|
||||
goto out_free_enc_bio;
|
||||
}
|
||||
bio_crypt_dun_increment(curr_dun, 1);
|
||||
src.offset += data_unit_size;
|
||||
dst.offset += data_unit_size;
|
||||
}
|
||||
|
||||
bio_advance_iter_single(src_bio, &src_bio->bi_iter,
|
||||
src_bv.bv_len);
|
||||
if (!src_bio->bi_iter.bi_size)
|
||||
break;
|
||||
|
||||
if (enc_idx == nr_enc_pages) {
|
||||
/*
|
||||
* For each additional encrypted bio submitted,
|
||||
* increment the source bio's remaining count. Each
|
||||
* encrypted bio's completion handler calls bio_endio on
|
||||
* the source bio, so this keeps the source bio from
|
||||
* completing until the last encrypted bio does.
|
||||
*/
|
||||
bio_inc_remaining(src_bio);
|
||||
submit_bio(enc_bio);
|
||||
goto new_bio;
|
||||
}
|
||||
}
|
||||
|
||||
enc_bio->bi_private = src_bio;
|
||||
enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio;
|
||||
*bio_ptr = enc_bio;
|
||||
ret = true;
|
||||
submit_bio(enc_bio);
|
||||
return;
|
||||
|
||||
enc_bio = NULL;
|
||||
goto out_free_ciph_req;
|
||||
out_free_enc_bio:
|
||||
/*
|
||||
* Add the remaining pages to the bio so that the normal completion path
|
||||
* in blk_crypto_fallback_encrypt_endio frees them. The exact data
|
||||
* layout does not matter for that, so don't bother iterating the source
|
||||
* bio.
|
||||
*/
|
||||
for (; enc_idx < nr_enc_pages; enc_idx++)
|
||||
__bio_add_page(enc_bio, enc_pages[enc_idx], PAGE_SIZE, 0);
|
||||
bio_endio(enc_bio);
|
||||
}
|
||||
|
||||
out_free_bounce_pages:
|
||||
while (i > 0)
|
||||
mempool_free(enc_bio->bi_io_vec[--i].bv_page,
|
||||
blk_crypto_bounce_page_pool);
|
||||
out_free_ciph_req:
|
||||
skcipher_request_free(ciph_req);
|
||||
out_release_keyslot:
|
||||
/*
|
||||
* The crypto API fallback's encryption routine.
|
||||
*
|
||||
* Allocate one or more bios for encryption, encrypt the input bio using the
|
||||
* crypto API, and submit the encrypted bios. Sets bio->bi_status and
|
||||
* completes the source bio on error
|
||||
*/
|
||||
static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio)
|
||||
{
|
||||
struct bio_crypt_ctx *bc = src_bio->bi_crypt_context;
|
||||
struct blk_crypto_keyslot *slot;
|
||||
blk_status_t status;
|
||||
|
||||
status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
|
||||
bc->bc_key, &slot);
|
||||
if (status != BLK_STS_OK) {
|
||||
src_bio->bi_status = status;
|
||||
bio_endio(src_bio);
|
||||
return;
|
||||
}
|
||||
__blk_crypto_fallback_encrypt_bio(src_bio,
|
||||
blk_crypto_fallback_tfm(slot));
|
||||
blk_crypto_put_keyslot(slot);
|
||||
out_put_enc_bio:
|
||||
if (enc_bio)
|
||||
bio_uninit(enc_bio);
|
||||
kfree(enc_bio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static blk_status_t __blk_crypto_fallback_decrypt_bio(struct bio *bio,
|
||||
struct bio_crypt_ctx *bc, struct bvec_iter iter,
|
||||
struct crypto_sync_skcipher *tfm)
|
||||
{
|
||||
SYNC_SKCIPHER_REQUEST_ON_STACK(ciph_req, tfm);
|
||||
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
|
||||
union blk_crypto_iv iv;
|
||||
struct scatterlist sg;
|
||||
struct bio_vec bv;
|
||||
const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
|
||||
unsigned int i;
|
||||
|
||||
skcipher_request_set_callback(ciph_req,
|
||||
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
|
||||
NULL, NULL);
|
||||
|
||||
memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
|
||||
sg_init_table(&sg, 1);
|
||||
skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size,
|
||||
iv.bytes);
|
||||
|
||||
/* Decrypt each segment in the bio */
|
||||
__bio_for_each_segment(bv, bio, iter, iter) {
|
||||
struct page *page = bv.bv_page;
|
||||
|
||||
if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size))
|
||||
return BLK_STS_INVAL;
|
||||
|
||||
sg_set_page(&sg, page, data_unit_size, bv.bv_offset);
|
||||
|
||||
/* Decrypt each data unit in the segment */
|
||||
for (i = 0; i < bv.bv_len; i += data_unit_size) {
|
||||
blk_crypto_dun_to_iv(curr_dun, &iv);
|
||||
if (crypto_skcipher_decrypt(ciph_req))
|
||||
return BLK_STS_IOERR;
|
||||
bio_crypt_dun_increment(curr_dun, 1);
|
||||
sg.offset += data_unit_size;
|
||||
}
|
||||
}
|
||||
|
||||
return BLK_STS_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* The crypto API fallback's main decryption routine.
|
||||
*
|
||||
* Decrypts input bio in place, and calls bio_endio on the bio.
|
||||
*/
|
||||
static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
|
||||
|
|
@ -377,63 +425,19 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
|
|||
struct bio *bio = f_ctx->bio;
|
||||
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
|
||||
struct blk_crypto_keyslot *slot;
|
||||
struct skcipher_request *ciph_req = NULL;
|
||||
DECLARE_CRYPTO_WAIT(wait);
|
||||
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
|
||||
union blk_crypto_iv iv;
|
||||
struct scatterlist sg;
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
|
||||
unsigned int i;
|
||||
blk_status_t blk_st;
|
||||
blk_status_t status;
|
||||
|
||||
/*
|
||||
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
|
||||
* this bio's algorithm and key.
|
||||
*/
|
||||
blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
|
||||
status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
|
||||
bc->bc_key, &slot);
|
||||
if (blk_st != BLK_STS_OK) {
|
||||
bio->bi_status = blk_st;
|
||||
goto out_no_keyslot;
|
||||
if (status == BLK_STS_OK) {
|
||||
status = __blk_crypto_fallback_decrypt_bio(bio, bc,
|
||||
f_ctx->crypt_iter,
|
||||
blk_crypto_fallback_tfm(slot));
|
||||
blk_crypto_put_keyslot(slot);
|
||||
}
|
||||
|
||||
/* and then allocate an skcipher_request for it */
|
||||
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
|
||||
bio->bi_status = BLK_STS_RESOURCE;
|
||||
goto out;
|
||||
}
|
||||
|
||||
memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
|
||||
sg_init_table(&sg, 1);
|
||||
skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size,
|
||||
iv.bytes);
|
||||
|
||||
/* Decrypt each segment in the bio */
|
||||
__bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) {
|
||||
struct page *page = bv.bv_page;
|
||||
|
||||
sg_set_page(&sg, page, data_unit_size, bv.bv_offset);
|
||||
|
||||
/* Decrypt each data unit in the segment */
|
||||
for (i = 0; i < bv.bv_len; i += data_unit_size) {
|
||||
blk_crypto_dun_to_iv(curr_dun, &iv);
|
||||
if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req),
|
||||
&wait)) {
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
goto out;
|
||||
}
|
||||
bio_crypt_dun_increment(curr_dun, 1);
|
||||
sg.offset += data_unit_size;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
skcipher_request_free(ciph_req);
|
||||
blk_crypto_put_keyslot(slot);
|
||||
out_no_keyslot:
|
||||
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
|
||||
|
||||
bio->bi_status = status;
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
||||
|
|
@ -466,44 +470,44 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
|
|||
|
||||
/**
|
||||
* blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption
|
||||
* @bio: bio to prepare
|
||||
*
|
||||
* @bio_ptr: pointer to the bio to prepare
|
||||
* If bio is doing a WRITE operation, allocate one or more bios to contain the
|
||||
* encrypted payload and submit them.
|
||||
*
|
||||
* If bio is doing a WRITE operation, this splits the bio into two parts if it's
|
||||
* too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
|
||||
* bounce bio for the first part, encrypts it, and updates bio_ptr to point to
|
||||
* the bounce bio.
|
||||
*
|
||||
* For a READ operation, we mark the bio for decryption by using bi_private and
|
||||
* For a READ operation, mark the bio for decryption by using bi_private and
|
||||
* bi_end_io.
|
||||
*
|
||||
* In either case, this function will make the bio look like a regular bio (i.e.
|
||||
* as if no encryption context was ever specified) for the purposes of the rest
|
||||
* of the stack except for blk-integrity (blk-integrity and blk-crypto are not
|
||||
* currently supported together).
|
||||
* In either case, this function will make the submitted bio(s) look like
|
||||
* regular bios (i.e. as if no encryption context was ever specified) for the
|
||||
* purposes of the rest of the stack except for blk-integrity (blk-integrity and
|
||||
* blk-crypto are not currently supported together).
|
||||
*
|
||||
* Return: true on success. Sets bio->bi_status and returns false on error.
|
||||
* Return: true if @bio should be submitted to the driver by the caller, else
|
||||
* false. Sets bio->bi_status, calls bio_endio and returns false on error.
|
||||
*/
|
||||
bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
|
||||
bool blk_crypto_fallback_bio_prep(struct bio *bio)
|
||||
{
|
||||
struct bio *bio = *bio_ptr;
|
||||
struct bio_crypt_ctx *bc = bio->bi_crypt_context;
|
||||
struct bio_fallback_crypt_ctx *f_ctx;
|
||||
|
||||
if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) {
|
||||
/* User didn't call blk_crypto_start_using_key() first */
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
bio_io_error(bio);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
|
||||
&bc->bc_key->crypto_cfg)) {
|
||||
bio->bi_status = BLK_STS_NOTSUPP;
|
||||
bio_endio(bio);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (bio_data_dir(bio) == WRITE)
|
||||
return blk_crypto_fallback_encrypt_bio(bio_ptr);
|
||||
if (bio_data_dir(bio) == WRITE) {
|
||||
blk_crypto_fallback_encrypt_bio(bio);
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* bio READ case: Set up a f_ctx in the bio's bi_private and set the
|
||||
|
|
@ -537,7 +541,7 @@ static int blk_crypto_fallback_init(void)
|
|||
|
||||
get_random_bytes(blank_key, sizeof(blank_key));
|
||||
|
||||
err = bioset_init(&crypto_bio_split, 64, 0, 0);
|
||||
err = bioset_init(&enc_bio_set, 64, 0, BIOSET_NEED_BVECS);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
|
|
@ -607,7 +611,7 @@ fail_destroy_profile:
|
|||
fail_free_profile:
|
||||
kfree(blk_crypto_fallback_profile);
|
||||
fail_free_bioset:
|
||||
bioset_exit(&crypto_bio_split);
|
||||
bioset_exit(&enc_bio_set);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
|
@ -641,7 +645,8 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
|
|||
|
||||
for (i = 0; i < blk_crypto_num_keyslots; i++) {
|
||||
slotp = &blk_crypto_keyslots[i];
|
||||
slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0);
|
||||
slotp->tfms[mode_num] = crypto_alloc_sync_skcipher(cipher_str,
|
||||
0, 0);
|
||||
if (IS_ERR(slotp->tfms[mode_num])) {
|
||||
err = PTR_ERR(slotp->tfms[mode_num]);
|
||||
if (err == -ENOENT) {
|
||||
|
|
@ -653,7 +658,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
|
|||
goto out_free_tfms;
|
||||
}
|
||||
|
||||
crypto_skcipher_set_flags(slotp->tfms[mode_num],
|
||||
crypto_sync_skcipher_set_flags(slotp->tfms[mode_num],
|
||||
CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
|
||||
}
|
||||
|
||||
|
|
@ -667,7 +672,7 @@ int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
|
|||
out_free_tfms:
|
||||
for (i = 0; i < blk_crypto_num_keyslots; i++) {
|
||||
slotp = &blk_crypto_keyslots[i];
|
||||
crypto_free_skcipher(slotp->tfms[mode_num]);
|
||||
crypto_free_sync_skcipher(slotp->tfms[mode_num]);
|
||||
slotp->tfms[mode_num] = NULL;
|
||||
}
|
||||
out:
|
||||
|
|
|
|||
|
|
@ -86,6 +86,12 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
|
|||
int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
|
||||
void __user *argp);
|
||||
|
||||
static inline bool blk_crypto_supported(struct bio *bio)
|
||||
{
|
||||
return blk_crypto_config_supported_natively(bio->bi_bdev,
|
||||
&bio->bi_crypt_context->bc_key->crypto_cfg);
|
||||
}
|
||||
|
||||
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
|
||||
|
||||
static inline int blk_crypto_sysfs_register(struct gendisk *disk)
|
||||
|
|
@ -139,6 +145,11 @@ static inline int blk_crypto_ioctl(struct block_device *bdev, unsigned int cmd,
|
|||
return -ENOTTY;
|
||||
}
|
||||
|
||||
static inline bool blk_crypto_supported(struct bio *bio)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
|
||||
|
||||
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
|
||||
|
|
@ -165,14 +176,6 @@ static inline void bio_crypt_do_front_merge(struct request *rq,
|
|||
#endif
|
||||
}
|
||||
|
||||
bool __blk_crypto_bio_prep(struct bio **bio_ptr);
|
||||
static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
|
||||
{
|
||||
if (bio_has_crypt_ctx(*bio_ptr))
|
||||
return __blk_crypto_bio_prep(bio_ptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq);
|
||||
static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq)
|
||||
{
|
||||
|
|
@ -215,12 +218,12 @@ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
|
|||
return 0;
|
||||
}
|
||||
|
||||
bool blk_crypto_fallback_bio_prep(struct bio *bio);
|
||||
|
||||
#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
|
||||
|
||||
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num);
|
||||
|
||||
bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr);
|
||||
|
||||
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key);
|
||||
|
||||
#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */
|
||||
|
|
@ -232,13 +235,6 @@ blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
|
|||
return -ENOPKG;
|
||||
}
|
||||
|
||||
static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
|
||||
{
|
||||
pr_warn_once("crypto API fallback disabled; failing request.\n");
|
||||
(*bio_ptr)->bi_status = BLK_STS_NOTSUPP;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline int
|
||||
blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -219,22 +219,6 @@ bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
|
|||
return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun);
|
||||
}
|
||||
|
||||
/* Check that all I/O segments are data unit aligned. */
|
||||
static bool bio_crypt_check_alignment(struct bio *bio)
|
||||
{
|
||||
const unsigned int data_unit_size =
|
||||
bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size;
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
|
||||
bio_for_each_segment(bv, bio, iter) {
|
||||
if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq)
|
||||
{
|
||||
return blk_crypto_get_keyslot(rq->q->crypto_profile,
|
||||
|
|
@ -258,57 +242,41 @@ void __blk_crypto_free_request(struct request *rq)
|
|||
rq->crypt_ctx = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* __blk_crypto_bio_prep - Prepare bio for inline encryption
|
||||
/*
|
||||
* Process a bio with a crypto context. Returns true if the caller should
|
||||
* submit the passed in bio, false if the bio is consumed.
|
||||
*
|
||||
* @bio_ptr: pointer to original bio pointer
|
||||
*
|
||||
* If the bio crypt context provided for the bio is supported by the underlying
|
||||
* device's inline encryption hardware, do nothing.
|
||||
*
|
||||
* Otherwise, try to perform en/decryption for this bio by falling back to the
|
||||
* kernel crypto API. When the crypto API fallback is used for encryption,
|
||||
* blk-crypto may choose to split the bio into 2 - the first one that will
|
||||
* continue to be processed and the second one that will be resubmitted via
|
||||
* submit_bio_noacct. A bounce bio will be allocated to encrypt the contents
|
||||
* of the aforementioned "first one", and *bio_ptr will be updated to this
|
||||
* bounce bio.
|
||||
*
|
||||
* Caller must ensure bio has bio_crypt_ctx.
|
||||
*
|
||||
* Return: true on success; false on error (and bio->bi_status will be set
|
||||
* appropriately, and bio_endio() will have been called so bio
|
||||
* submission should abort).
|
||||
* See the kerneldoc comment for blk_crypto_submit_bio for further details.
|
||||
*/
|
||||
bool __blk_crypto_bio_prep(struct bio **bio_ptr)
|
||||
bool __blk_crypto_submit_bio(struct bio *bio)
|
||||
{
|
||||
struct bio *bio = *bio_ptr;
|
||||
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
|
||||
struct block_device *bdev = bio->bi_bdev;
|
||||
|
||||
/* Error if bio has no data. */
|
||||
if (WARN_ON_ONCE(!bio_has_data(bio))) {
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (!bio_crypt_check_alignment(bio)) {
|
||||
bio->bi_status = BLK_STS_INVAL;
|
||||
goto fail;
|
||||
bio_io_error(bio);
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Success if device supports the encryption context, or if we succeeded
|
||||
* in falling back to the crypto API.
|
||||
* If the device does not natively support the encryption context, try to use
|
||||
* the fallback if available.
|
||||
*/
|
||||
if (blk_crypto_config_supported_natively(bio->bi_bdev,
|
||||
&bc_key->crypto_cfg))
|
||||
return true;
|
||||
if (blk_crypto_fallback_bio_prep(bio_ptr))
|
||||
return true;
|
||||
fail:
|
||||
bio_endio(*bio_ptr);
|
||||
return false;
|
||||
if (!blk_crypto_config_supported_natively(bdev, &bc_key->crypto_cfg)) {
|
||||
if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) {
|
||||
pr_warn_once("%pg: crypto API fallback disabled; failing request.\n",
|
||||
bdev);
|
||||
bio->bi_status = BLK_STS_NOTSUPP;
|
||||
bio_endio(bio);
|
||||
return false;
|
||||
}
|
||||
return blk_crypto_fallback_bio_prep(bio);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__blk_crypto_submit_bio);
|
||||
|
||||
int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
|
||||
gfp_t gfp_mask)
|
||||
|
|
|
|||
|
|
@ -199,7 +199,8 @@ static void blk_flush_complete_seq(struct request *rq,
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
|
||||
blk_status_t error)
|
||||
blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct request_queue *q = flush_rq->q;
|
||||
struct list_head *running;
|
||||
|
|
@ -335,7 +336,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
|
||||
blk_status_t error)
|
||||
blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
|
||||
|
|
|
|||
|
|
@ -812,7 +812,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
|
|||
u64 now_ns;
|
||||
|
||||
/* rotational? */
|
||||
if (!blk_queue_nonrot(disk->queue))
|
||||
if (blk_queue_rot(disk->queue))
|
||||
return AUTOP_HDD;
|
||||
|
||||
/* handle SATA SSDs w/ broken NCQ */
|
||||
|
|
|
|||
|
|
@ -988,10 +988,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
|
|||
u64 now = blk_time_get_ns();
|
||||
int cpu;
|
||||
|
||||
if (blk_queue_nonrot(blkg->q))
|
||||
iolat->ssd = true;
|
||||
else
|
||||
iolat->ssd = false;
|
||||
iolat->ssd = !blk_queue_rot(blkg->q);
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct latency_stat *stat;
|
||||
|
|
|
|||
|
|
@ -158,8 +158,9 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
|
|||
return bio;
|
||||
}
|
||||
|
||||
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
|
||||
unsigned *nsegs)
|
||||
static struct bio *__bio_split_discard(struct bio *bio,
|
||||
const struct queue_limits *lim, unsigned *nsegs,
|
||||
unsigned int max_sectors)
|
||||
{
|
||||
unsigned int max_discard_sectors, granularity;
|
||||
sector_t tmp;
|
||||
|
|
@ -169,8 +170,7 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
|
|||
|
||||
granularity = max(lim->discard_granularity >> 9, 1U);
|
||||
|
||||
max_discard_sectors =
|
||||
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
|
||||
max_discard_sectors = min(max_sectors, bio_allowed_max_sectors(lim));
|
||||
max_discard_sectors -= max_discard_sectors % granularity;
|
||||
if (unlikely(!max_discard_sectors))
|
||||
return bio;
|
||||
|
|
@ -194,6 +194,19 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
|
|||
return bio_submit_split(bio, split_sectors);
|
||||
}
|
||||
|
||||
struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
|
||||
unsigned *nsegs)
|
||||
{
|
||||
unsigned int max_sectors;
|
||||
|
||||
if (bio_op(bio) == REQ_OP_SECURE_ERASE)
|
||||
max_sectors = lim->max_secure_erase_sectors;
|
||||
else
|
||||
max_sectors = lim->max_discard_sectors;
|
||||
|
||||
return __bio_split_discard(bio, lim, nsegs, max_sectors);
|
||||
}
|
||||
|
||||
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
|
||||
bool is_atomic)
|
||||
{
|
||||
|
|
@ -324,12 +337,19 @@ static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv,
|
|||
int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
|
||||
unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
|
||||
{
|
||||
struct bio_crypt_ctx *bc = bio_crypt_ctx(bio);
|
||||
struct bio_vec bv, bvprv, *bvprvp = NULL;
|
||||
unsigned nsegs = 0, bytes = 0, gaps = 0;
|
||||
struct bvec_iter iter;
|
||||
unsigned start_align_mask = lim->dma_alignment;
|
||||
|
||||
if (bc) {
|
||||
start_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1);
|
||||
len_align_mask |= (bc->bc_key->crypto_cfg.data_unit_size - 1);
|
||||
}
|
||||
|
||||
bio_for_each_bvec(bv, bio, iter) {
|
||||
if (bv.bv_offset & lim->dma_alignment ||
|
||||
if (bv.bv_offset & start_align_mask ||
|
||||
bv.bv_len & len_align_mask)
|
||||
return -EINVAL;
|
||||
|
||||
|
|
|
|||
|
|
@ -608,9 +608,23 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
|
|||
{},
|
||||
};
|
||||
|
||||
static void debugfs_create_files(struct dentry *parent, void *data,
|
||||
static void debugfs_create_files(struct request_queue *q, struct dentry *parent,
|
||||
void *data,
|
||||
const struct blk_mq_debugfs_attr *attr)
|
||||
{
|
||||
lockdep_assert_held(&q->debugfs_mutex);
|
||||
/*
|
||||
* Creating new debugfs entries with queue freezed has the risk of
|
||||
* deadlock.
|
||||
*/
|
||||
WARN_ON_ONCE(q->mq_freeze_depth != 0);
|
||||
/*
|
||||
* debugfs_mutex should not be nested under other locks that can be
|
||||
* grabbed while queue is frozen.
|
||||
*/
|
||||
lockdep_assert_not_held(&q->elevator_lock);
|
||||
lockdep_assert_not_held(&q->rq_qos_mutex);
|
||||
|
||||
if (IS_ERR_OR_NULL(parent))
|
||||
return;
|
||||
|
||||
|
|
@ -624,21 +638,14 @@ void blk_mq_debugfs_register(struct request_queue *q)
|
|||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
|
||||
debugfs_create_files(q, q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (!hctx->debugfs_dir)
|
||||
blk_mq_debugfs_register_hctx(q, hctx);
|
||||
}
|
||||
|
||||
if (q->rq_qos) {
|
||||
struct rq_qos *rqos = q->rq_qos;
|
||||
|
||||
while (rqos) {
|
||||
blk_mq_debugfs_register_rqos(rqos);
|
||||
rqos = rqos->next;
|
||||
}
|
||||
}
|
||||
blk_mq_debugfs_register_rq_qos(q);
|
||||
}
|
||||
|
||||
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
|
||||
|
|
@ -650,7 +657,8 @@ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
|
|||
snprintf(name, sizeof(name), "cpu%u", ctx->cpu);
|
||||
ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir);
|
||||
|
||||
debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs);
|
||||
debugfs_create_files(hctx->queue, ctx_dir, ctx,
|
||||
blk_mq_debugfs_ctx_attrs);
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_register_hctx(struct request_queue *q,
|
||||
|
|
@ -666,7 +674,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
|
|||
snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
|
||||
hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
|
||||
|
||||
debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs);
|
||||
debugfs_create_files(q, hctx->debugfs_dir, hctx,
|
||||
blk_mq_debugfs_hctx_attrs);
|
||||
|
||||
hctx_for_each_ctx(hctx, ctx, i)
|
||||
blk_mq_debugfs_register_ctx(hctx, ctx);
|
||||
|
|
@ -686,8 +695,10 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q)
|
|||
struct blk_mq_hw_ctx *hctx;
|
||||
unsigned long i;
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
queue_for_each_hw_ctx(q, hctx, i)
|
||||
blk_mq_debugfs_register_hctx(q, hctx);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
|
||||
|
|
@ -717,7 +728,7 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
|
|||
|
||||
q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir);
|
||||
|
||||
debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
|
||||
debugfs_create_files(q, q->sched_debugfs_dir, q, e->queue_debugfs_attrs);
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_unregister_sched(struct request_queue *q)
|
||||
|
|
@ -741,17 +752,7 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
|
|||
return "unknown";
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
|
||||
{
|
||||
lockdep_assert_held(&rqos->disk->queue->debugfs_mutex);
|
||||
|
||||
if (!rqos->disk->queue->debugfs_dir)
|
||||
return;
|
||||
debugfs_remove_recursive(rqos->debugfs_dir);
|
||||
rqos->debugfs_dir = NULL;
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
|
||||
static void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
|
||||
{
|
||||
struct request_queue *q = rqos->disk->queue;
|
||||
const char *dir_name = rq_qos_id_to_name(rqos->id);
|
||||
|
|
@ -766,7 +767,22 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
|
|||
q->debugfs_dir);
|
||||
|
||||
rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
|
||||
debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
|
||||
debugfs_create_files(q, rqos->debugfs_dir, rqos,
|
||||
rqos->ops->debugfs_attrs);
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
|
||||
{
|
||||
lockdep_assert_held(&q->debugfs_mutex);
|
||||
|
||||
if (q->rq_qos) {
|
||||
struct rq_qos *rqos = q->rq_qos;
|
||||
|
||||
while (rqos) {
|
||||
blk_mq_debugfs_register_rqos(rqos);
|
||||
rqos = rqos->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
|
||||
|
|
@ -789,7 +805,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
|
|||
|
||||
hctx->sched_debugfs_dir = debugfs_create_dir("sched",
|
||||
hctx->debugfs_dir);
|
||||
debugfs_create_files(hctx->sched_debugfs_dir, hctx,
|
||||
debugfs_create_files(q, hctx->sched_debugfs_dir, hctx,
|
||||
e->hctx_debugfs_attrs);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,8 +33,7 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
|
|||
struct blk_mq_hw_ctx *hctx);
|
||||
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
|
||||
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
|
||||
void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
|
||||
#else
|
||||
static inline void blk_mq_debugfs_register(struct request_queue *q)
|
||||
{
|
||||
|
|
@ -74,13 +73,10 @@ static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hc
|
|||
{
|
||||
}
|
||||
|
||||
static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
|
||||
static inline void blk_mq_debugfs_register_rq_qos(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS)
|
||||
|
|
|
|||
|
|
@ -6,11 +6,6 @@
|
|||
#include <linux/blk-mq-dma.h>
|
||||
#include "blk.h"
|
||||
|
||||
struct phys_vec {
|
||||
phys_addr_t paddr;
|
||||
u32 len;
|
||||
};
|
||||
|
||||
static bool __blk_map_iter_next(struct blk_map_iter *iter)
|
||||
{
|
||||
if (iter->iter.bi_size)
|
||||
|
|
@ -112,8 +107,8 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
|
|||
struct phys_vec *vec)
|
||||
{
|
||||
enum dma_data_direction dir = rq_dma_dir(req);
|
||||
unsigned int mapped = 0;
|
||||
unsigned int attrs = 0;
|
||||
size_t mapped = 0;
|
||||
int error;
|
||||
|
||||
iter->addr = state->addr;
|
||||
|
|
@ -238,7 +233,6 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
|
|||
* blk_rq_dma_map_iter_next - map the next DMA segment for a request
|
||||
* @req: request to map
|
||||
* @dma_dev: device to map to
|
||||
* @state: DMA IOVA state
|
||||
* @iter: block layer DMA iterator
|
||||
*
|
||||
* Iterate to the next mapping after a previous call to
|
||||
|
|
@ -253,7 +247,7 @@ EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start);
|
|||
* returned in @iter.status.
|
||||
*/
|
||||
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
|
||||
struct dma_iova_state *state, struct blk_dma_iter *iter)
|
||||
struct blk_dma_iter *iter)
|
||||
{
|
||||
struct phys_vec vec;
|
||||
|
||||
|
|
@ -297,6 +291,8 @@ int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
|
|||
blk_rq_map_iter_init(rq, &iter);
|
||||
while (blk_map_iter_next(rq, &iter, &vec)) {
|
||||
*last_sg = blk_next_sg(last_sg, sglist);
|
||||
|
||||
WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
|
||||
sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len,
|
||||
offset_in_page(vec.paddr));
|
||||
nsegs++;
|
||||
|
|
@ -417,6 +413,8 @@ int blk_rq_map_integrity_sg(struct request *rq, struct scatterlist *sglist)
|
|||
|
||||
while (blk_map_iter_next(rq, &iter, &vec)) {
|
||||
sg = blk_next_sg(&sg, sglist);
|
||||
|
||||
WARN_ON_ONCE(overflows_type(vec.len, unsigned int));
|
||||
sg_set_page(sg, phys_to_page(vec.paddr), vec.len,
|
||||
offset_in_page(vec.paddr));
|
||||
segments++;
|
||||
|
|
|
|||
|
|
@ -137,4 +137,9 @@ static inline void blk_mq_set_min_shallow_depth(struct request_queue *q,
|
|||
depth);
|
||||
}
|
||||
|
||||
static inline bool blk_mq_is_sync_read(blk_opf_t opf)
|
||||
{
|
||||
return op_is_sync(opf) && !op_is_write(opf);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -498,6 +498,42 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
|
|||
return rq_list_pop(data->cached_rqs);
|
||||
}
|
||||
|
||||
static void blk_mq_limit_depth(struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct elevator_mq_ops *ops;
|
||||
|
||||
/* If no I/O scheduler has been configured, don't limit requests */
|
||||
if (!data->q->elevator) {
|
||||
blk_mq_tag_busy(data->hctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* All requests use scheduler tags when an I/O scheduler is
|
||||
* enabled for the queue.
|
||||
*/
|
||||
data->rq_flags |= RQF_SCHED_TAGS;
|
||||
|
||||
/*
|
||||
* Flush/passthrough requests are special and go directly to the
|
||||
* dispatch list, they are not subject to the async_depth limit.
|
||||
*/
|
||||
if ((data->cmd_flags & REQ_OP_MASK) == REQ_OP_FLUSH ||
|
||||
blk_op_is_passthrough(data->cmd_flags))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
|
||||
data->rq_flags |= RQF_USE_SCHED;
|
||||
|
||||
/*
|
||||
* By default, sync requests have no limit, and async requests are
|
||||
* limited to async_depth.
|
||||
*/
|
||||
ops = &data->q->elevator->type->ops;
|
||||
if (ops->limit_depth)
|
||||
ops->limit_depth(data->cmd_flags, data);
|
||||
}
|
||||
|
||||
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct request_queue *q = data->q;
|
||||
|
|
@ -516,31 +552,7 @@ retry:
|
|||
data->ctx = blk_mq_get_ctx(q);
|
||||
data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);
|
||||
|
||||
if (q->elevator) {
|
||||
/*
|
||||
* All requests use scheduler tags when an I/O scheduler is
|
||||
* enabled for the queue.
|
||||
*/
|
||||
data->rq_flags |= RQF_SCHED_TAGS;
|
||||
|
||||
/*
|
||||
* Flush/passthrough requests are special and go directly to the
|
||||
* dispatch list.
|
||||
*/
|
||||
if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
|
||||
!blk_op_is_passthrough(data->cmd_flags)) {
|
||||
struct elevator_mq_ops *ops = &q->elevator->type->ops;
|
||||
|
||||
WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
|
||||
|
||||
data->rq_flags |= RQF_USE_SCHED;
|
||||
if (ops->limit_depth)
|
||||
ops->limit_depth(data->cmd_flags, data);
|
||||
}
|
||||
} else {
|
||||
blk_mq_tag_busy(data->hctx);
|
||||
}
|
||||
|
||||
blk_mq_limit_depth(data);
|
||||
if (data->flags & BLK_MQ_REQ_RESERVED)
|
||||
data->rq_flags |= RQF_RESV;
|
||||
|
||||
|
|
@ -1156,7 +1168,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
|
|||
|
||||
if (rq->end_io) {
|
||||
rq_qos_done(rq->q, rq);
|
||||
if (rq->end_io(rq, error) == RQ_END_IO_FREE)
|
||||
if (rq->end_io(rq, error, NULL) == RQ_END_IO_FREE)
|
||||
blk_mq_free_request(rq);
|
||||
} else {
|
||||
blk_mq_free_request(rq);
|
||||
|
|
@ -1211,7 +1223,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
|
|||
* If end_io handler returns NONE, then it still has
|
||||
* ownership of the request.
|
||||
*/
|
||||
if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
|
||||
if (rq->end_io && rq->end_io(rq, 0, iob) == RQ_END_IO_NONE)
|
||||
continue;
|
||||
|
||||
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
|
||||
|
|
@ -1458,7 +1470,8 @@ struct blk_rq_wait {
|
|||
blk_status_t ret;
|
||||
};
|
||||
|
||||
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
|
||||
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct blk_rq_wait *wait = rq->end_io_data;
|
||||
|
||||
|
|
@ -1688,7 +1701,7 @@ static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expi
|
|||
void blk_mq_put_rq_ref(struct request *rq)
|
||||
{
|
||||
if (is_flush_rq(rq)) {
|
||||
if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
|
||||
if (rq->end_io(rq, 0, NULL) == RQ_END_IO_FREE)
|
||||
blk_mq_free_request(rq);
|
||||
} else if (req_ref_put_and_test(rq)) {
|
||||
__blk_mq_free_request(rq);
|
||||
|
|
@ -4649,6 +4662,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
|
|||
spin_lock_init(&q->requeue_lock);
|
||||
|
||||
q->nr_requests = set->queue_depth;
|
||||
q->async_depth = set->queue_depth;
|
||||
|
||||
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
|
||||
blk_mq_map_swqueue(q);
|
||||
|
|
@ -5015,6 +5029,11 @@ struct elevator_tags *blk_mq_update_nr_requests(struct request_queue *q,
|
|||
q->elevator->et = et;
|
||||
}
|
||||
|
||||
/*
|
||||
* Preserve relative value, both nr and async_depth are at most 16 bit
|
||||
* value, no need to worry about overflow.
|
||||
*/
|
||||
q->async_depth = max(q->async_depth * nr / q->nr_requests, 1);
|
||||
q->nr_requests = nr;
|
||||
if (q->elevator && q->elevator->type->ops.depth_updated)
|
||||
q->elevator->type->ops.depth_updated(q);
|
||||
|
|
|
|||
|
|
@ -347,13 +347,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
|
|||
blk_queue_flag_set(QUEUE_FLAG_QOS_ENABLED, q);
|
||||
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
if (rqos->ops->debugfs_attrs) {
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_rqos(rqos);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
return 0;
|
||||
ebusy:
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
|
@ -378,8 +371,4 @@ void rq_qos_del(struct rq_qos *rqos)
|
|||
if (!q->rq_qos)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_QOS_ENABLED, q);
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_unregister_rqos(rqos);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -127,6 +127,46 @@ unlock:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t queue_async_depth_show(struct gendisk *disk, char *page)
|
||||
{
|
||||
guard(mutex)(&disk->queue->elevator_lock);
|
||||
|
||||
return queue_var_show(disk->queue->async_depth, page);
|
||||
}
|
||||
|
||||
static ssize_t
|
||||
queue_async_depth_store(struct gendisk *disk, const char *page, size_t count)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
unsigned int memflags;
|
||||
unsigned long nr;
|
||||
int ret;
|
||||
|
||||
if (!queue_is_mq(q))
|
||||
return -EINVAL;
|
||||
|
||||
ret = queue_var_store(&nr, page, count);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (nr == 0)
|
||||
return -EINVAL;
|
||||
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
scoped_guard(mutex, &q->elevator_lock) {
|
||||
if (q->elevator) {
|
||||
q->async_depth = min(q->nr_requests, nr);
|
||||
if (q->elevator->type->ops.depth_updated)
|
||||
q->elevator->type->ops.depth_updated(q);
|
||||
} else {
|
||||
ret = -EINVAL;
|
||||
}
|
||||
}
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t queue_ra_show(struct gendisk *disk, char *page)
|
||||
{
|
||||
ssize_t ret;
|
||||
|
|
@ -532,6 +572,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \
|
|||
}
|
||||
|
||||
QUEUE_RW_ENTRY(queue_requests, "nr_requests");
|
||||
QUEUE_RW_ENTRY(queue_async_depth, "async_depth");
|
||||
QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
|
||||
QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
|
||||
QUEUE_LIM_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
|
||||
|
|
@ -636,11 +677,8 @@ out:
|
|||
static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct rq_qos *rqos;
|
||||
ssize_t ret;
|
||||
s64 val;
|
||||
unsigned int memflags;
|
||||
|
||||
ret = queue_var_store64(&val, page);
|
||||
if (ret < 0)
|
||||
|
|
@ -648,40 +686,8 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
|
|||
if (val < -1)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Ensure that the queue is idled, in case the latency update
|
||||
* ends up either enabling or disabling wbt completely. We can't
|
||||
* have IO inflight if that happens.
|
||||
*/
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
|
||||
rqos = wbt_rq_qos(q);
|
||||
if (!rqos) {
|
||||
ret = wbt_init(disk);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = count;
|
||||
if (val == -1)
|
||||
val = wbt_default_latency_nsec(q);
|
||||
else if (val >= 0)
|
||||
val *= 1000ULL;
|
||||
|
||||
if (wbt_get_min_lat(q) == val)
|
||||
goto out;
|
||||
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
wbt_set_min_lat(q, val);
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
out:
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
|
||||
return ret;
|
||||
ret = wbt_set_lat(disk, val);
|
||||
return ret ? ret : count;
|
||||
}
|
||||
|
||||
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
|
||||
|
|
@ -754,6 +760,7 @@ static struct attribute *blk_mq_queue_attrs[] = {
|
|||
*/
|
||||
&elv_iosched_entry.attr,
|
||||
&queue_requests_entry.attr,
|
||||
&queue_async_depth_entry.attr,
|
||||
#ifdef CONFIG_BLK_WBT
|
||||
&queue_wb_lat_entry.attr,
|
||||
#endif
|
||||
|
|
|
|||
158
block/blk-wbt.c
158
block/blk-wbt.c
|
|
@ -93,6 +93,8 @@ struct rq_wb {
|
|||
struct rq_depth rq_depth;
|
||||
};
|
||||
|
||||
static int wbt_init(struct gendisk *disk, struct rq_wb *rwb);
|
||||
|
||||
static inline struct rq_wb *RQWB(struct rq_qos *rqos)
|
||||
{
|
||||
return container_of(rqos, struct rq_wb, rqos);
|
||||
|
|
@ -506,7 +508,7 @@ u64 wbt_get_min_lat(struct request_queue *q)
|
|||
return RQWB(rqos)->min_lat_nsec;
|
||||
}
|
||||
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val)
|
||||
static void wbt_set_min_lat(struct request_queue *q, u64 val)
|
||||
{
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
if (!rqos)
|
||||
|
|
@ -696,6 +698,41 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
|
|||
}
|
||||
}
|
||||
|
||||
static int wbt_data_dir(const struct request *rq)
|
||||
{
|
||||
const enum req_op op = req_op(rq);
|
||||
|
||||
if (op == REQ_OP_READ)
|
||||
return READ;
|
||||
else if (op_is_write(op))
|
||||
return WRITE;
|
||||
|
||||
/* don't account */
|
||||
return -1;
|
||||
}
|
||||
|
||||
static struct rq_wb *wbt_alloc(void)
|
||||
{
|
||||
struct rq_wb *rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
|
||||
|
||||
if (!rwb)
|
||||
return NULL;
|
||||
|
||||
rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
|
||||
if (!rwb->cb) {
|
||||
kfree(rwb);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return rwb;
|
||||
}
|
||||
|
||||
static void wbt_free(struct rq_wb *rwb)
|
||||
{
|
||||
blk_stat_free_callback(rwb->cb);
|
||||
kfree(rwb);
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable wbt if defaults are configured that way
|
||||
*/
|
||||
|
|
@ -737,33 +774,35 @@ EXPORT_SYMBOL_GPL(wbt_enable_default);
|
|||
|
||||
void wbt_init_enable_default(struct gendisk *disk)
|
||||
{
|
||||
if (__wbt_enable_default(disk))
|
||||
WARN_ON_ONCE(wbt_init(disk));
|
||||
struct request_queue *q = disk->queue;
|
||||
struct rq_wb *rwb;
|
||||
|
||||
if (!__wbt_enable_default(disk))
|
||||
return;
|
||||
|
||||
rwb = wbt_alloc();
|
||||
if (WARN_ON_ONCE(!rwb))
|
||||
return;
|
||||
|
||||
if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
|
||||
wbt_free(rwb);
|
||||
return;
|
||||
}
|
||||
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_rq_qos(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
}
|
||||
|
||||
u64 wbt_default_latency_nsec(struct request_queue *q)
|
||||
static u64 wbt_default_latency_nsec(struct request_queue *q)
|
||||
{
|
||||
/*
|
||||
* We default to 2msec for non-rotational storage, and 75msec
|
||||
* for rotational storage.
|
||||
*/
|
||||
if (blk_queue_nonrot(q))
|
||||
return 2000000ULL;
|
||||
else
|
||||
if (blk_queue_rot(q))
|
||||
return 75000000ULL;
|
||||
}
|
||||
|
||||
static int wbt_data_dir(const struct request *rq)
|
||||
{
|
||||
const enum req_op op = req_op(rq);
|
||||
|
||||
if (op == REQ_OP_READ)
|
||||
return READ;
|
||||
else if (op_is_write(op))
|
||||
return WRITE;
|
||||
|
||||
/* don't account */
|
||||
return -1;
|
||||
return 2000000ULL;
|
||||
}
|
||||
|
||||
static void wbt_queue_depth_changed(struct rq_qos *rqos)
|
||||
|
|
@ -777,8 +816,7 @@ static void wbt_exit(struct rq_qos *rqos)
|
|||
struct rq_wb *rwb = RQWB(rqos);
|
||||
|
||||
blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
|
||||
blk_stat_free_callback(rwb->cb);
|
||||
kfree(rwb);
|
||||
wbt_free(rwb);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -902,22 +940,11 @@ static const struct rq_qos_ops wbt_rqos_ops = {
|
|||
#endif
|
||||
};
|
||||
|
||||
int wbt_init(struct gendisk *disk)
|
||||
static int wbt_init(struct gendisk *disk, struct rq_wb *rwb)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct rq_wb *rwb;
|
||||
int i;
|
||||
int ret;
|
||||
|
||||
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
|
||||
if (!rwb)
|
||||
return -ENOMEM;
|
||||
|
||||
rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
|
||||
if (!rwb->cb) {
|
||||
kfree(rwb);
|
||||
return -ENOMEM;
|
||||
}
|
||||
int i;
|
||||
|
||||
for (i = 0; i < WBT_NUM_RWQ; i++)
|
||||
rq_wait_init(&rwb->rq_wait[i]);
|
||||
|
|
@ -937,15 +964,60 @@ int wbt_init(struct gendisk *disk)
|
|||
ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
|
||||
mutex_unlock(&q->rq_qos_mutex);
|
||||
if (ret)
|
||||
goto err_free;
|
||||
return ret;
|
||||
|
||||
blk_stat_add_callback(q, rwb->cb);
|
||||
|
||||
return 0;
|
||||
|
||||
err_free:
|
||||
blk_stat_free_callback(rwb->cb);
|
||||
kfree(rwb);
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
int wbt_set_lat(struct gendisk *disk, s64 val)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct rq_qos *rqos = wbt_rq_qos(q);
|
||||
struct rq_wb *rwb = NULL;
|
||||
unsigned int memflags;
|
||||
int ret = 0;
|
||||
|
||||
if (!rqos) {
|
||||
rwb = wbt_alloc();
|
||||
if (!rwb)
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the queue is idled, in case the latency update
|
||||
* ends up either enabling or disabling wbt completely. We can't
|
||||
* have IO inflight if that happens.
|
||||
*/
|
||||
memflags = blk_mq_freeze_queue(q);
|
||||
if (!rqos) {
|
||||
ret = wbt_init(disk, rwb);
|
||||
if (ret) {
|
||||
wbt_free(rwb);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (val == -1)
|
||||
val = wbt_default_latency_nsec(q);
|
||||
else if (val >= 0)
|
||||
val *= 1000ULL;
|
||||
|
||||
if (wbt_get_min_lat(q) == val)
|
||||
goto out;
|
||||
|
||||
blk_mq_quiesce_queue(q);
|
||||
|
||||
mutex_lock(&disk->rqos_state_mutex);
|
||||
wbt_set_min_lat(q, val);
|
||||
mutex_unlock(&disk->rqos_state_mutex);
|
||||
|
||||
blk_mq_unquiesce_queue(q);
|
||||
out:
|
||||
blk_mq_unfreeze_queue(q, memflags);
|
||||
mutex_lock(&q->debugfs_mutex);
|
||||
blk_mq_debugfs_register_rq_qos(q);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,16 +4,13 @@
|
|||
|
||||
#ifdef CONFIG_BLK_WBT
|
||||
|
||||
int wbt_init(struct gendisk *disk);
|
||||
void wbt_init_enable_default(struct gendisk *disk);
|
||||
void wbt_disable_default(struct gendisk *disk);
|
||||
void wbt_enable_default(struct gendisk *disk);
|
||||
|
||||
u64 wbt_get_min_lat(struct request_queue *q);
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val);
|
||||
bool wbt_disabled(struct request_queue *);
|
||||
|
||||
u64 wbt_default_latency_nsec(struct request_queue *);
|
||||
bool wbt_disabled(struct request_queue *q);
|
||||
int wbt_set_lat(struct gendisk *disk, s64 val);
|
||||
|
||||
#else
|
||||
|
||||
|
|
|
|||
|
|
@ -112,12 +112,12 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
|
|||
#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
|
||||
|
||||
/**
|
||||
* blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
|
||||
* @zone_cond: BLK_ZONE_COND_XXX.
|
||||
* blk_zone_cond_str - Return a zone condition name string
|
||||
* @zone_cond: a zone condition BLK_ZONE_COND_name
|
||||
*
|
||||
* Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
|
||||
* into string format. Useful in the debugging and tracing zone conditions. For
|
||||
* invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
|
||||
* Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
|
||||
* for the debugging and tracing zone conditions. For an invalid zone
|
||||
* conditions, the string "UNKNOWN" is returned.
|
||||
*/
|
||||
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
|
||||
{
|
||||
|
|
|
|||
18
block/blk.h
18
block/blk.h
|
|
@ -208,10 +208,14 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
|
|||
struct request_queue *q = rq->q;
|
||||
enum req_op op = req_op(rq);
|
||||
|
||||
if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
|
||||
if (unlikely(op == REQ_OP_DISCARD))
|
||||
return min(q->limits.max_discard_sectors,
|
||||
UINT_MAX >> SECTOR_SHIFT);
|
||||
|
||||
if (unlikely(op == REQ_OP_SECURE_ERASE))
|
||||
return min(q->limits.max_secure_erase_sectors,
|
||||
UINT_MAX >> SECTOR_SHIFT);
|
||||
|
||||
if (unlikely(op == REQ_OP_WRITE_ZEROES))
|
||||
return q->limits.max_write_zeroes_sectors;
|
||||
|
||||
|
|
@ -371,12 +375,18 @@ struct bio *bio_split_zone_append(struct bio *bio,
|
|||
static inline bool bio_may_need_split(struct bio *bio,
|
||||
const struct queue_limits *lim)
|
||||
{
|
||||
const struct bio_vec *bv;
|
||||
|
||||
if (lim->chunk_sectors)
|
||||
return true;
|
||||
if (bio->bi_vcnt != 1)
|
||||
|
||||
if (!bio->bi_io_vec)
|
||||
return true;
|
||||
return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset >
|
||||
lim->max_fast_segment_size;
|
||||
|
||||
bv = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
|
||||
if (bio->bi_iter.bi_size > bv->bv_len - bio->bi_iter.bi_bvec_done)
|
||||
return true;
|
||||
return bv->bv_len + bv->bv_offset > lim->max_fast_segment_size;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -589,6 +589,7 @@ static int elevator_switch(struct request_queue *q, struct elv_change_ctx *ctx)
|
|||
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
|
||||
q->elevator = NULL;
|
||||
q->nr_requests = q->tag_set->queue_depth;
|
||||
q->async_depth = q->tag_set->queue_depth;
|
||||
}
|
||||
blk_add_trace_msg(q, "elv switch: %s", ctx->name);
|
||||
|
||||
|
|
|
|||
|
|
@ -692,7 +692,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
|
|||
queue_max_sectors(bdev_get_queue(bdev)));
|
||||
return put_ushort(argp, max_sectors);
|
||||
case BLKROTATIONAL:
|
||||
return put_ushort(argp, !bdev_nonrot(bdev));
|
||||
return put_ushort(argp, bdev_rot(bdev));
|
||||
case BLKRASET:
|
||||
case BLKFRASET:
|
||||
if(!capable(CAP_SYS_ADMIN))
|
||||
|
|
|
|||
|
|
@ -47,9 +47,8 @@ enum {
|
|||
* asynchronous requests, we reserve 25% of requests for synchronous
|
||||
* operations.
|
||||
*/
|
||||
KYBER_ASYNC_PERCENT = 75,
|
||||
KYBER_DEFAULT_ASYNC_PERCENT = 75,
|
||||
};
|
||||
|
||||
/*
|
||||
* Maximum device-wide depth for each scheduling domain.
|
||||
*
|
||||
|
|
@ -157,9 +156,6 @@ struct kyber_queue_data {
|
|||
*/
|
||||
struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
|
||||
|
||||
/* Number of allowed async requests. */
|
||||
unsigned int async_depth;
|
||||
|
||||
struct kyber_cpu_latency __percpu *cpu_latency;
|
||||
|
||||
/* Timer for stats aggregation and adjusting domain tokens. */
|
||||
|
|
@ -401,10 +397,7 @@ err:
|
|||
|
||||
static void kyber_depth_updated(struct request_queue *q)
|
||||
{
|
||||
struct kyber_queue_data *kqd = q->elevator->elevator_data;
|
||||
|
||||
kqd->async_depth = q->nr_requests * KYBER_ASYNC_PERCENT / 100U;
|
||||
blk_mq_set_min_shallow_depth(q, kqd->async_depth);
|
||||
blk_mq_set_min_shallow_depth(q, q->async_depth);
|
||||
}
|
||||
|
||||
static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
|
||||
|
|
@ -414,6 +407,7 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
|
|||
blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
|
||||
|
||||
q->elevator = eq;
|
||||
q->async_depth = q->nr_requests * KYBER_DEFAULT_ASYNC_PERCENT / 100;
|
||||
kyber_depth_updated(q);
|
||||
|
||||
return 0;
|
||||
|
|
@ -552,15 +546,8 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
|
|||
|
||||
static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
||||
{
|
||||
/*
|
||||
* We use the scheduler tags as per-hardware queue queueing tokens.
|
||||
* Async requests can be limited at this stage.
|
||||
*/
|
||||
if (!op_is_sync(opf)) {
|
||||
struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
|
||||
|
||||
data->shallow_depth = kqd->async_depth;
|
||||
}
|
||||
if (!blk_mq_is_sync_read(opf))
|
||||
data->shallow_depth = data->q->async_depth;
|
||||
}
|
||||
|
||||
static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
|
|
@ -956,15 +943,6 @@ KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
|
|||
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
|
||||
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
|
||||
|
||||
static int kyber_async_depth_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
struct kyber_queue_data *kqd = q->elevator->elevator_data;
|
||||
|
||||
seq_printf(m, "%u\n", kqd->async_depth);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int kyber_cur_domain_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
|
@ -990,7 +968,6 @@ static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
|
|||
KYBER_QUEUE_DOMAIN_ATTRS(write),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(discard),
|
||||
KYBER_QUEUE_DOMAIN_ATTRS(other),
|
||||
{"async_depth", 0400, kyber_async_depth_show},
|
||||
{},
|
||||
};
|
||||
#undef KYBER_QUEUE_DOMAIN_ATTRS
|
||||
|
|
|
|||
|
|
@ -98,7 +98,6 @@ struct deadline_data {
|
|||
int fifo_batch;
|
||||
int writes_starved;
|
||||
int front_merges;
|
||||
u32 async_depth;
|
||||
int prio_aging_expire;
|
||||
|
||||
spinlock_t lock;
|
||||
|
|
@ -486,32 +485,16 @@ unlock:
|
|||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* Called by __blk_mq_alloc_request(). The shallow_depth value set by this
|
||||
* function is used by __blk_mq_get_tag().
|
||||
*/
|
||||
static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct deadline_data *dd = data->q->elevator->elevator_data;
|
||||
|
||||
/* Do not throttle synchronous reads. */
|
||||
if (op_is_sync(opf) && !op_is_write(opf))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Throttle asynchronous requests and writes such that these requests
|
||||
* do not block the allocation of synchronous requests.
|
||||
*/
|
||||
data->shallow_depth = dd->async_depth;
|
||||
if (!blk_mq_is_sync_read(opf))
|
||||
data->shallow_depth = data->q->async_depth;
|
||||
}
|
||||
|
||||
/* Called by blk_mq_update_nr_requests(). */
|
||||
/* Called by blk_mq_init_sched() and blk_mq_update_nr_requests(). */
|
||||
static void dd_depth_updated(struct request_queue *q)
|
||||
{
|
||||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
|
||||
dd->async_depth = q->nr_requests;
|
||||
blk_mq_set_min_shallow_depth(q, 1);
|
||||
blk_mq_set_min_shallow_depth(q, q->async_depth);
|
||||
}
|
||||
|
||||
static void dd_exit_sched(struct elevator_queue *e)
|
||||
|
|
@ -576,6 +559,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_queue *eq)
|
|||
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
|
||||
|
||||
q->elevator = eq;
|
||||
q->async_depth = q->nr_requests;
|
||||
dd_depth_updated(q);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -763,7 +747,6 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
|
|||
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
|
||||
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
|
||||
SHOW_INT(deadline_front_merges_show, dd->front_merges);
|
||||
SHOW_INT(deadline_async_depth_show, dd->async_depth);
|
||||
SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
|
||||
#undef SHOW_INT
|
||||
#undef SHOW_JIFFIES
|
||||
|
|
@ -793,7 +776,6 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA
|
|||
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
|
||||
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
|
||||
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
|
||||
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
|
||||
STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
|
||||
#undef STORE_FUNCTION
|
||||
#undef STORE_INT
|
||||
|
|
@ -807,7 +789,6 @@ static const struct elv_fs_entry deadline_attrs[] = {
|
|||
DD_ATTR(write_expire),
|
||||
DD_ATTR(writes_starved),
|
||||
DD_ATTR(front_merges),
|
||||
DD_ATTR(async_depth),
|
||||
DD_ATTR(fifo_batch),
|
||||
DD_ATTR(prio_aging_expire),
|
||||
__ATTR_NULL
|
||||
|
|
@ -894,15 +875,6 @@ static int deadline_starved_show(void *data, struct seq_file *m)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int dd_async_depth_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
struct deadline_data *dd = q->elevator->elevator_data;
|
||||
|
||||
seq_printf(m, "%u\n", dd->async_depth);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dd_queued_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct request_queue *q = data;
|
||||
|
|
@ -1002,7 +974,6 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
|
|||
DEADLINE_NEXT_RQ_ATTR(write2),
|
||||
{"batching", 0400, deadline_batching_show},
|
||||
{"starved", 0400, deadline_starved_show},
|
||||
{"async_depth", 0400, dd_async_depth_show},
|
||||
{"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
|
||||
{"owned_by_driver", 0400, dd_owned_by_driver_show},
|
||||
{"queued", 0400, dd_queued_show},
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
#include <linux/fs.h>
|
||||
#include <linux/major.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/raid/detect.h>
|
||||
|
|
@ -130,7 +131,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
|
|||
state->pp_buf[0] = '\0';
|
||||
|
||||
state->disk = hd;
|
||||
snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name);
|
||||
strscpy(state->name, hd->disk_name);
|
||||
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
|
||||
if (isdigit(state->name[strlen(state->name)-1]))
|
||||
sprintf(state->name, "p");
|
||||
|
|
|
|||
|
|
@ -2940,7 +2940,8 @@ static int opal_activate_lsp(struct opal_dev *dev,
|
|||
};
|
||||
int ret;
|
||||
|
||||
if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)
|
||||
if (opal_lr_act->sum &&
|
||||
(!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS))
|
||||
return -EINVAL;
|
||||
|
||||
ret = opal_get_key(dev, &opal_lr_act->key);
|
||||
|
|
|
|||
|
|
@ -247,8 +247,7 @@ MODULE_ALIAS("rd");
|
|||
/* Legacy boot options - nonmodular */
|
||||
static int __init ramdisk_size(char *str)
|
||||
{
|
||||
rd_size = simple_strtol(str, NULL, 0);
|
||||
return 1;
|
||||
return kstrtoul(str, 0, &rd_size) == 0;
|
||||
}
|
||||
__setup("ramdisk_size=", ramdisk_size);
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -969,7 +969,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
|
|||
lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
|
||||
if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
|
||||
lim->features |= BLK_FEAT_WRITE_CACHE;
|
||||
if (backing_bdev && !bdev_nonrot(backing_bdev))
|
||||
if (backing_bdev && bdev_rot(backing_bdev))
|
||||
lim->features |= BLK_FEAT_ROTATIONAL;
|
||||
lim->max_hw_discard_sectors = max_discard_sectors;
|
||||
lim->max_write_zeroes_sectors = max_discard_sectors;
|
||||
|
|
|
|||
|
|
@ -642,7 +642,7 @@ static void nullb_device_release(struct config_item *item)
|
|||
null_free_dev(dev);
|
||||
}
|
||||
|
||||
static struct configfs_item_operations nullb_device_ops = {
|
||||
static const struct configfs_item_operations nullb_device_ops = {
|
||||
.release = nullb_device_release,
|
||||
};
|
||||
|
||||
|
|
@ -749,7 +749,7 @@ static struct configfs_attribute *nullb_group_attrs[] = {
|
|||
NULL,
|
||||
};
|
||||
|
||||
static struct configfs_group_operations nullb_group_ops = {
|
||||
static const struct configfs_group_operations nullb_group_ops = {
|
||||
.make_group = nullb_group_make_group,
|
||||
.drop_item = nullb_group_drop_item,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -475,9 +475,17 @@ void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev)
|
|||
}
|
||||
}
|
||||
|
||||
static void rnbd_dev_release(struct kobject *kobj)
|
||||
{
|
||||
struct rnbd_clt_dev *dev = container_of(kobj, struct rnbd_clt_dev, kobj);
|
||||
|
||||
kfree(dev);
|
||||
}
|
||||
|
||||
static const struct kobj_type rnbd_dev_ktype = {
|
||||
.sysfs_ops = &kobj_sysfs_ops,
|
||||
.default_groups = rnbd_dev_groups,
|
||||
.release = rnbd_dev_release,
|
||||
};
|
||||
|
||||
static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev)
|
||||
|
|
|
|||
|
|
@ -60,7 +60,9 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
|
|||
kfree(dev->pathname);
|
||||
rnbd_clt_put_sess(dev->sess);
|
||||
mutex_destroy(&dev->lock);
|
||||
kfree(dev);
|
||||
|
||||
if (dev->kobj.state_initialized)
|
||||
kobject_put(&dev->kobj);
|
||||
}
|
||||
|
||||
static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
|
||||
|
|
@ -1517,7 +1519,7 @@ static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev)
|
|||
return found;
|
||||
}
|
||||
|
||||
static void delete_dev(struct rnbd_clt_dev *dev)
|
||||
static void rnbd_delete_dev(struct rnbd_clt_dev *dev)
|
||||
{
|
||||
struct rnbd_clt_session *sess = dev->sess;
|
||||
|
||||
|
|
@ -1638,7 +1640,7 @@ put_iu:
|
|||
kfree(rsp);
|
||||
rnbd_put_iu(sess, iu);
|
||||
del_dev:
|
||||
delete_dev(dev);
|
||||
rnbd_delete_dev(dev);
|
||||
put_dev:
|
||||
rnbd_clt_put_dev(dev);
|
||||
put_sess:
|
||||
|
|
@ -1647,13 +1649,13 @@ put_sess:
|
|||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static void destroy_gen_disk(struct rnbd_clt_dev *dev)
|
||||
static void rnbd_destroy_gen_disk(struct rnbd_clt_dev *dev)
|
||||
{
|
||||
del_gendisk(dev->gd);
|
||||
put_disk(dev->gd);
|
||||
}
|
||||
|
||||
static void destroy_sysfs(struct rnbd_clt_dev *dev,
|
||||
static void rnbd_destroy_sysfs(struct rnbd_clt_dev *dev,
|
||||
const struct attribute *sysfs_self)
|
||||
{
|
||||
rnbd_clt_remove_dev_symlink(dev);
|
||||
|
|
@ -1662,7 +1664,6 @@ static void destroy_sysfs(struct rnbd_clt_dev *dev,
|
|||
/* To avoid deadlock firstly remove itself */
|
||||
sysfs_remove_file_self(&dev->kobj, sysfs_self);
|
||||
kobject_del(&dev->kobj);
|
||||
kobject_put(&dev->kobj);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1691,9 +1692,9 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
|
|||
dev->dev_state = DEV_STATE_UNMAPPED;
|
||||
mutex_unlock(&dev->lock);
|
||||
|
||||
delete_dev(dev);
|
||||
destroy_sysfs(dev, sysfs_self);
|
||||
destroy_gen_disk(dev);
|
||||
rnbd_delete_dev(dev);
|
||||
rnbd_destroy_sysfs(dev, sysfs_self);
|
||||
rnbd_destroy_gen_disk(dev);
|
||||
if (was_mapped && sess->rtrs)
|
||||
send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
#include <rdma/ib.h>
|
||||
|
||||
#define RNBD_PROTO_VER_MAJOR 2
|
||||
#define RNBD_PROTO_VER_MINOR 0
|
||||
#define RNBD_PROTO_VER_MINOR 2
|
||||
|
||||
/* The default port number the RTRS server is listening on. */
|
||||
#define RTRS_PORT 1234
|
||||
|
|
@ -197,6 +197,8 @@ struct rnbd_msg_io {
|
|||
*
|
||||
* @RNBD_F_SYNC: request is sync (sync write or read)
|
||||
* @RNBD_F_FUA: forced unit access
|
||||
* @RNBD_F_PREFLUSH: request for cache flush
|
||||
* @RNBD_F_NOUNMAP: do not free blocks when zeroing
|
||||
*/
|
||||
enum rnbd_io_flags {
|
||||
|
||||
|
|
@ -211,6 +213,8 @@ enum rnbd_io_flags {
|
|||
/* Flags */
|
||||
RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0),
|
||||
RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1),
|
||||
RNBD_F_PREFLUSH = 1<<(RNBD_OP_BITS + 2),
|
||||
RNBD_F_NOUNMAP = 1<<(RNBD_OP_BITS + 3)
|
||||
};
|
||||
|
||||
static inline u32 rnbd_op(u32 flags)
|
||||
|
|
@ -245,6 +249,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
|
|||
break;
|
||||
case RNBD_OP_WRITE_ZEROES:
|
||||
bio_opf = REQ_OP_WRITE_ZEROES;
|
||||
|
||||
if (rnbd_opf & RNBD_F_NOUNMAP)
|
||||
bio_opf |= REQ_NOUNMAP;
|
||||
break;
|
||||
default:
|
||||
WARN(1, "Unknown RNBD type: %d (flags %d)\n",
|
||||
|
|
@ -258,6 +265,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
|
|||
if (rnbd_opf & RNBD_F_FUA)
|
||||
bio_opf |= REQ_FUA;
|
||||
|
||||
if (rnbd_opf & RNBD_F_PREFLUSH)
|
||||
bio_opf |= REQ_PREFLUSH;
|
||||
|
||||
return bio_opf;
|
||||
}
|
||||
|
||||
|
|
@ -280,6 +290,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq)
|
|||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
rnbd_opf = RNBD_OP_WRITE_ZEROES;
|
||||
|
||||
if (rq->cmd_flags & REQ_NOUNMAP)
|
||||
rnbd_opf |= RNBD_F_NOUNMAP;
|
||||
break;
|
||||
case REQ_OP_FLUSH:
|
||||
rnbd_opf = RNBD_OP_FLUSH;
|
||||
|
|
@ -297,6 +310,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq)
|
|||
if (op_is_flush(rq->cmd_flags))
|
||||
rnbd_opf |= RNBD_F_FUA;
|
||||
|
||||
if (rq->cmd_flags & REQ_PREFLUSH)
|
||||
rnbd_opf |= RNBD_F_PREFLUSH;
|
||||
|
||||
return rnbd_opf;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -44,24 +44,6 @@ DEFINE_EVENT(rnbd_srv_link_class, name, \
|
|||
DEFINE_LINK_EVENT(create_sess);
|
||||
DEFINE_LINK_EVENT(destroy_sess);
|
||||
|
||||
TRACE_DEFINE_ENUM(RNBD_OP_READ);
|
||||
TRACE_DEFINE_ENUM(RNBD_OP_WRITE);
|
||||
TRACE_DEFINE_ENUM(RNBD_OP_FLUSH);
|
||||
TRACE_DEFINE_ENUM(RNBD_OP_DISCARD);
|
||||
TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE);
|
||||
TRACE_DEFINE_ENUM(RNBD_F_SYNC);
|
||||
TRACE_DEFINE_ENUM(RNBD_F_FUA);
|
||||
|
||||
#define show_rnbd_rw_flags(x) \
|
||||
__print_flags(x, "|", \
|
||||
{ RNBD_OP_READ, "READ" }, \
|
||||
{ RNBD_OP_WRITE, "WRITE" }, \
|
||||
{ RNBD_OP_FLUSH, "FLUSH" }, \
|
||||
{ RNBD_OP_DISCARD, "DISCARD" }, \
|
||||
{ RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \
|
||||
{ RNBD_F_SYNC, "SYNC" }, \
|
||||
{ RNBD_F_FUA, "FUA" })
|
||||
|
||||
TRACE_EVENT(process_rdma,
|
||||
TP_PROTO(struct rnbd_srv_session *srv,
|
||||
const struct rnbd_msg_io *msg,
|
||||
|
|
@ -97,7 +79,7 @@ TRACE_EVENT(process_rdma,
|
|||
__entry->usrlen = usrlen;
|
||||
),
|
||||
|
||||
TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu",
|
||||
TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %u, ioprio: %d, datalen: %u, usrlen: %zu",
|
||||
__get_str(sessname),
|
||||
__print_symbolic(__entry->dir,
|
||||
{ READ, "READ" },
|
||||
|
|
@ -106,7 +88,7 @@ TRACE_EVENT(process_rdma,
|
|||
__entry->device_id,
|
||||
__entry->sector,
|
||||
__entry->bi_size,
|
||||
show_rnbd_rw_flags(__entry->flags),
|
||||
__entry->flags,
|
||||
__entry->ioprio,
|
||||
__entry->datalen,
|
||||
__entry->usrlen
|
||||
|
|
|
|||
|
|
@ -145,18 +145,30 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
|
|||
priv->sess_dev = sess_dev;
|
||||
priv->id = id;
|
||||
|
||||
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
|
||||
bio = bio_alloc(file_bdev(sess_dev->bdev_file), !!datalen,
|
||||
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
|
||||
bio_add_virt_nofail(bio, data, datalen);
|
||||
|
||||
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
|
||||
if (bio_has_data(bio) &&
|
||||
bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
|
||||
rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
|
||||
bio->bi_iter.bi_size, msg->bi_size);
|
||||
err = -EINVAL;
|
||||
goto bio_put;
|
||||
if (unlikely(!bio)) {
|
||||
err = -ENOMEM;
|
||||
goto put_sess_dev;
|
||||
}
|
||||
|
||||
if (!datalen) {
|
||||
/*
|
||||
* For special requests like DISCARD and WRITE_ZEROES, the datalen is zero.
|
||||
*/
|
||||
bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
|
||||
} else {
|
||||
bio_add_virt_nofail(bio, data, datalen);
|
||||
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
|
||||
if (bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
|
||||
rnbd_srv_err_rl(sess_dev,
|
||||
"Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
|
||||
bio->bi_iter.bi_size, msg->bi_size);
|
||||
err = -EINVAL;
|
||||
goto bio_put;
|
||||
}
|
||||
}
|
||||
|
||||
bio->bi_end_io = rnbd_dev_bi_end_io;
|
||||
bio->bi_private = priv;
|
||||
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
|
||||
|
|
@ -170,6 +182,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
|
|||
|
||||
bio_put:
|
||||
bio_put(bio);
|
||||
put_sess_dev:
|
||||
rnbd_put_sess_dev(sess_dev);
|
||||
err:
|
||||
kfree(priv);
|
||||
|
|
@ -538,6 +551,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
|
|||
{
|
||||
struct block_device *bdev = file_bdev(sess_dev->bdev_file);
|
||||
|
||||
memset(rsp, 0, sizeof(*rsp));
|
||||
|
||||
rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
|
||||
rsp->device_id = cpu_to_le32(sess_dev->device_id);
|
||||
rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev));
|
||||
|
|
@ -644,6 +659,7 @@ static void process_msg_sess_info(struct rnbd_srv_session *srv_sess,
|
|||
|
||||
trace_process_msg_sess_info(srv_sess, sess_info_msg);
|
||||
|
||||
memset(rsp, 0, sizeof(*rsp));
|
||||
rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP);
|
||||
rsp->ver = srv_sess->ver;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,6 @@ use kernel::{
|
|||
str::{kstrtobool_bytes, CString},
|
||||
sync::Mutex,
|
||||
};
|
||||
use pin_init::PinInit;
|
||||
|
||||
pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, Error> {
|
||||
let item_type = configfs_attrs! {
|
||||
|
|
@ -25,7 +24,7 @@ pub(crate) fn subsystem() -> impl PinInit<kernel::configfs::Subsystem<Config>, E
|
|||
],
|
||||
};
|
||||
|
||||
kernel::configfs::Subsystem::new(c_str!("rnull"), item_type, try_pin_init!(Config {}))
|
||||
kernel::configfs::Subsystem::new(c"rnull", item_type, try_pin_init!(Config {}))
|
||||
}
|
||||
|
||||
#[pin_data]
|
||||
|
|
|
|||
|
|
@ -14,12 +14,9 @@ use kernel::{
|
|||
Operations, TagSet,
|
||||
},
|
||||
},
|
||||
error::Result,
|
||||
pr_info,
|
||||
prelude::*,
|
||||
sync::{aref::ARef, Arc},
|
||||
};
|
||||
use pin_init::PinInit;
|
||||
|
||||
module! {
|
||||
type: NullBlkModule,
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -295,7 +295,8 @@ static void dm_kill_unmapped_request(struct request *rq, blk_status_t error)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret end_clone_request(struct request *clone,
|
||||
blk_status_t error)
|
||||
blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct dm_rq_target_io *tio = clone->end_io_data;
|
||||
|
||||
|
|
|
|||
|
|
@ -2085,7 +2085,7 @@ static void bitmap_destroy(struct mddev *mddev)
|
|||
return;
|
||||
|
||||
bitmap_wait_behind_writes(mddev);
|
||||
if (!mddev->serialize_policy)
|
||||
if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
|
||||
mddev_destroy_serial_pool(mddev, NULL);
|
||||
|
||||
mutex_lock(&mddev->bitmap_info.mutex);
|
||||
|
|
@ -2453,6 +2453,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
|||
memcpy(page_address(store.sb_page),
|
||||
page_address(bitmap->storage.sb_page),
|
||||
sizeof(bitmap_super_t));
|
||||
mutex_lock(&bitmap->mddev->bitmap_info.mutex);
|
||||
spin_lock_irq(&bitmap->counts.lock);
|
||||
md_bitmap_file_unmap(&bitmap->storage);
|
||||
bitmap->storage = store;
|
||||
|
|
@ -2560,7 +2561,7 @@ static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
|
|||
set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
|
||||
}
|
||||
spin_unlock_irq(&bitmap->counts.lock);
|
||||
|
||||
mutex_unlock(&bitmap->mddev->bitmap_info.mutex);
|
||||
if (!init) {
|
||||
__bitmap_unplug(bitmap);
|
||||
bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
|
||||
|
|
@ -2809,7 +2810,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
mddev->bitmap_info.max_write_behind = backlog;
|
||||
if (!backlog && mddev->serial_info_pool) {
|
||||
/* serial_info_pool is not needed if backlog is zero */
|
||||
if (!mddev->serialize_policy)
|
||||
if (!test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
|
||||
mddev_destroy_serial_pool(mddev, NULL);
|
||||
} else if (backlog && !mddev->serial_info_pool) {
|
||||
/* serial_info_pool is needed since backlog is not zero */
|
||||
|
|
|
|||
|
|
@ -549,8 +549,13 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg
|
|||
|
||||
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
|
||||
|
||||
/* daemaon thread must exist */
|
||||
thread = rcu_dereference_protected(mddev->thread, true);
|
||||
if (!thread) {
|
||||
pr_warn("md-cluster: Received metadata update but MD thread is not ready\n");
|
||||
dlm_unlock_sync(cinfo->no_new_dev_lockres);
|
||||
return;
|
||||
}
|
||||
|
||||
wait_event(thread->wqueue,
|
||||
(got_lock = mddev_trylock(mddev)) ||
|
||||
test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
|
||||
|
|
|
|||
|
|
@ -712,8 +712,10 @@ static int llbitmap_suspend_timeout(struct llbitmap *llbitmap, int page_idx)
|
|||
percpu_ref_kill(&pctl->active);
|
||||
|
||||
if (!wait_event_timeout(pctl->wait, percpu_ref_is_zero(&pctl->active),
|
||||
llbitmap->mddev->bitmap_info.daemon_sleep * HZ))
|
||||
llbitmap->mddev->bitmap_info.daemon_sleep * HZ)) {
|
||||
percpu_ref_resurrect(&pctl->active);
|
||||
return -ETIMEDOUT;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
188
drivers/md/md.c
188
drivers/md/md.c
|
|
@ -279,7 +279,8 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
|
|||
|
||||
rdev_for_each(temp, mddev) {
|
||||
if (!rdev) {
|
||||
if (!mddev->serialize_policy ||
|
||||
if (!test_bit(MD_SERIALIZE_POLICY,
|
||||
&mddev->flags) ||
|
||||
!rdev_need_serial(temp))
|
||||
rdev_uninit_serial(temp);
|
||||
else
|
||||
|
|
@ -2617,9 +2618,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
|
|||
list_add_rcu(&rdev->same_set, &mddev->disks);
|
||||
bd_link_disk_holder(rdev->bdev, mddev->gendisk);
|
||||
|
||||
/* May as well allow recovery to be retried once */
|
||||
mddev->recovery_disabled++;
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
|
|
@ -5864,11 +5862,11 @@ __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
|
|||
|
||||
static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
|
||||
{
|
||||
return sprintf(page, "%d\n", mddev->fail_last_dev);
|
||||
return sprintf(page, "%d\n", test_bit(MD_FAILLAST_DEV, &mddev->flags));
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting fail_last_dev to true to allow last device to be forcibly removed
|
||||
* Setting MD_FAILLAST_DEV to allow last device to be forcibly removed
|
||||
* from RAID1/RAID10.
|
||||
*/
|
||||
static ssize_t
|
||||
|
|
@ -5881,8 +5879,10 @@ fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (value != mddev->fail_last_dev)
|
||||
mddev->fail_last_dev = value;
|
||||
if (value)
|
||||
set_bit(MD_FAILLAST_DEV, &mddev->flags);
|
||||
else
|
||||
clear_bit(MD_FAILLAST_DEV, &mddev->flags);
|
||||
|
||||
return len;
|
||||
}
|
||||
|
|
@ -5895,11 +5895,12 @@ static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
|
|||
if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1))
|
||||
return sprintf(page, "n/a\n");
|
||||
else
|
||||
return sprintf(page, "%d\n", mddev->serialize_policy);
|
||||
return sprintf(page, "%d\n",
|
||||
test_bit(MD_SERIALIZE_POLICY, &mddev->flags));
|
||||
}
|
||||
|
||||
/*
|
||||
* Setting serialize_policy to true to enforce write IO is not reordered
|
||||
* Setting MD_SERIALIZE_POLICY enforce write IO is not reordered
|
||||
* for raid1.
|
||||
*/
|
||||
static ssize_t
|
||||
|
|
@ -5912,7 +5913,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
if (err)
|
||||
return err;
|
||||
|
||||
if (value == mddev->serialize_policy)
|
||||
if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
|
||||
return len;
|
||||
|
||||
err = mddev_suspend_and_lock(mddev);
|
||||
|
|
@ -5924,11 +5925,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
|
|||
goto unlock;
|
||||
}
|
||||
|
||||
if (value)
|
||||
if (value) {
|
||||
mddev_create_serial_pool(mddev, NULL);
|
||||
else
|
||||
set_bit(MD_SERIALIZE_POLICY, &mddev->flags);
|
||||
} else {
|
||||
mddev_destroy_serial_pool(mddev, NULL);
|
||||
mddev->serialize_policy = value;
|
||||
clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
|
||||
}
|
||||
unlock:
|
||||
mddev_unlock_and_resume(mddev);
|
||||
return err ?: len;
|
||||
|
|
@ -6502,7 +6505,7 @@ int md_run(struct mddev *mddev)
|
|||
* the only valid external interface is through the md
|
||||
* device.
|
||||
*/
|
||||
mddev->has_superblocks = false;
|
||||
clear_bit(MD_HAS_SUPERBLOCK, &mddev->flags);
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
|
|
@ -6515,7 +6518,7 @@ int md_run(struct mddev *mddev)
|
|||
}
|
||||
|
||||
if (rdev->sb_page)
|
||||
mddev->has_superblocks = true;
|
||||
set_bit(MD_HAS_SUPERBLOCK, &mddev->flags);
|
||||
|
||||
/* perform some consistency tests on the device.
|
||||
* We don't want the data to overlap the metadata,
|
||||
|
|
@ -6848,13 +6851,15 @@ static void __md_stop_writes(struct mddev *mddev)
|
|||
{
|
||||
timer_delete_sync(&mddev->safemode_timer);
|
||||
|
||||
if (mddev->pers && mddev->pers->quiesce) {
|
||||
mddev->pers->quiesce(mddev, 1);
|
||||
mddev->pers->quiesce(mddev, 0);
|
||||
}
|
||||
if (md_is_rdwr(mddev) || !mddev_is_dm(mddev)) {
|
||||
if (mddev->pers && mddev->pers->quiesce) {
|
||||
mddev->pers->quiesce(mddev, 1);
|
||||
mddev->pers->quiesce(mddev, 0);
|
||||
}
|
||||
|
||||
if (md_bitmap_enabled(mddev, true))
|
||||
mddev->bitmap_ops->flush(mddev);
|
||||
if (md_bitmap_enabled(mddev, true))
|
||||
mddev->bitmap_ops->flush(mddev);
|
||||
}
|
||||
|
||||
if (md_is_rdwr(mddev) &&
|
||||
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
|
||||
|
|
@ -6865,7 +6870,7 @@ static void __md_stop_writes(struct mddev *mddev)
|
|||
md_update_sb(mddev, 1);
|
||||
}
|
||||
/* disable policy to guarantee rdevs free resources for serialization */
|
||||
mddev->serialize_policy = 0;
|
||||
clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
|
||||
mddev_destroy_serial_pool(mddev, NULL);
|
||||
}
|
||||
|
||||
|
|
@ -9068,20 +9073,22 @@ static bool is_mddev_idle(struct mddev *mddev, int init)
|
|||
return idle;
|
||||
}
|
||||
|
||||
void md_done_sync(struct mddev *mddev, int blocks, int ok)
|
||||
void md_done_sync(struct mddev *mddev, int blocks)
|
||||
{
|
||||
/* another "blocks" (512byte) blocks have been synced */
|
||||
atomic_sub(blocks, &mddev->recovery_active);
|
||||
wake_up(&mddev->recovery_wait);
|
||||
if (!ok) {
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
// stop recovery, signal do_sync ....
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(md_done_sync);
|
||||
|
||||
void md_sync_error(struct mddev *mddev)
|
||||
{
|
||||
// stop recovery, signal do_sync ....
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
EXPORT_SYMBOL(md_sync_error);
|
||||
|
||||
/* md_write_start(mddev, bi)
|
||||
* If we need to update some array metadata (e.g. 'active' flag
|
||||
* in superblock) before writing, schedule a superblock update
|
||||
|
|
@ -9125,7 +9132,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
|
|||
rcu_read_unlock();
|
||||
if (did_change)
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
if (!mddev->has_superblocks)
|
||||
if (!test_bit(MD_HAS_SUPERBLOCK, &mddev->flags))
|
||||
return;
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
|
||||
|
|
@ -9430,6 +9437,53 @@ static bool sync_io_within_limit(struct mddev *mddev)
|
|||
(raid_is_456(mddev) ? 8 : 128) * sync_io_depth(mddev);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update sync offset and mddev status when sync completes
|
||||
*/
|
||||
static void md_finish_sync(struct mddev *mddev, enum sync_action action)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
|
||||
switch (action) {
|
||||
case ACTION_RESYNC:
|
||||
case ACTION_REPAIR:
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
mddev->curr_resync = MaxSector;
|
||||
mddev->resync_offset = mddev->curr_resync;
|
||||
break;
|
||||
case ACTION_RECOVER:
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
mddev->curr_resync = MaxSector;
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (mddev->delta_disks >= 0 &&
|
||||
rdev_needs_recovery(rdev, mddev->curr_resync))
|
||||
rdev->recovery_offset = mddev->curr_resync;
|
||||
rcu_read_unlock();
|
||||
break;
|
||||
case ACTION_RESHAPE:
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
mddev->delta_disks > 0 &&
|
||||
mddev->pers->finish_reshape &&
|
||||
mddev->pers->size &&
|
||||
!mddev_is_dm(mddev)) {
|
||||
mddev_lock_nointr(mddev);
|
||||
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
|
||||
mddev_unlock(mddev);
|
||||
if (!mddev_is_clustered(mddev))
|
||||
set_capacity_and_notify(mddev->gendisk,
|
||||
mddev->array_sectors);
|
||||
}
|
||||
if (mddev->pers->finish_reshape)
|
||||
mddev->pers->finish_reshape(mddev);
|
||||
break;
|
||||
/* */
|
||||
case ACTION_CHECK:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#define SYNC_MARKS 10
|
||||
#define SYNC_MARK_STEP (3*HZ)
|
||||
#define UPDATE_FREQUENCY (5*60*HZ)
|
||||
|
|
@ -9445,7 +9499,6 @@ void md_do_sync(struct md_thread *thread)
|
|||
int last_mark,m;
|
||||
sector_t last_check;
|
||||
int skipped = 0;
|
||||
struct md_rdev *rdev;
|
||||
enum sync_action action;
|
||||
const char *desc;
|
||||
struct blk_plug plug;
|
||||
|
|
@ -9731,65 +9784,21 @@ update:
|
|||
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
|
||||
|
||||
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
|
||||
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
mddev->curr_resync >= MD_RESYNC_ACTIVE) {
|
||||
/* All sync IO completes after recovery_active becomes 0 */
|
||||
mddev->curr_resync_completed = mddev->curr_resync;
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_completed);
|
||||
}
|
||||
mddev->pers->sync_request(mddev, max_sectors, max_sectors, &skipped);
|
||||
|
||||
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
|
||||
mddev->curr_resync > MD_RESYNC_ACTIVE) {
|
||||
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
|
||||
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
if (mddev->curr_resync >= mddev->resync_offset) {
|
||||
pr_debug("md: checkpointing %s of %s.\n",
|
||||
desc, mdname(mddev));
|
||||
if (test_bit(MD_RECOVERY_ERROR,
|
||||
&mddev->recovery))
|
||||
mddev->resync_offset =
|
||||
mddev->curr_resync_completed;
|
||||
else
|
||||
mddev->resync_offset =
|
||||
mddev->curr_resync;
|
||||
}
|
||||
} else
|
||||
mddev->resync_offset = MaxSector;
|
||||
} else {
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
|
||||
mddev->curr_resync = MaxSector;
|
||||
if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
|
||||
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
|
||||
rcu_read_lock();
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (mddev->delta_disks >= 0 &&
|
||||
rdev_needs_recovery(rdev, mddev->curr_resync))
|
||||
rdev->recovery_offset = mddev->curr_resync;
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (mddev->curr_resync > MD_RESYNC_ACTIVE)
|
||||
md_finish_sync(mddev, action);
|
||||
skip:
|
||||
/* set CHANGE_PENDING here since maybe another update is needed,
|
||||
* so other nodes are informed. It should be harmless for normal
|
||||
* raid */
|
||||
set_mask_bits(&mddev->sb_flags, 0,
|
||||
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
|
||||
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
|
||||
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
|
||||
mddev->delta_disks > 0 &&
|
||||
mddev->pers->finish_reshape &&
|
||||
mddev->pers->size &&
|
||||
!mddev_is_dm(mddev)) {
|
||||
mddev_lock_nointr(mddev);
|
||||
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
|
||||
mddev_unlock(mddev);
|
||||
if (!mddev_is_clustered(mddev))
|
||||
set_capacity_and_notify(mddev->gendisk,
|
||||
mddev->array_sectors);
|
||||
}
|
||||
|
||||
spin_lock(&mddev->lock);
|
||||
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
|
||||
/* We completed so min/max setting can be forgotten if used. */
|
||||
|
|
@ -10304,7 +10313,7 @@ void md_reap_sync_thread(struct mddev *mddev)
|
|||
{
|
||||
struct md_rdev *rdev;
|
||||
sector_t old_dev_sectors = mddev->dev_sectors;
|
||||
bool is_reshaped = false;
|
||||
bool is_reshaped = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
|
||||
|
||||
/* resync has finished, collect result */
|
||||
md_unregister_thread(mddev, &mddev->sync_thread);
|
||||
|
|
@ -10320,12 +10329,6 @@ void md_reap_sync_thread(struct mddev *mddev)
|
|||
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
|
||||
}
|
||||
}
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
|
||||
mddev->pers->finish_reshape) {
|
||||
mddev->pers->finish_reshape(mddev);
|
||||
if (mddev_is_clustered(mddev))
|
||||
is_reshaped = true;
|
||||
}
|
||||
|
||||
/* If array is no-longer degraded, then any saved_raid_disk
|
||||
* information must be scrapped.
|
||||
|
|
@ -10352,8 +10355,9 @@ void md_reap_sync_thread(struct mddev *mddev)
|
|||
* be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
|
||||
* so it is time to update size across cluster.
|
||||
*/
|
||||
if (mddev_is_clustered(mddev) && is_reshaped
|
||||
&& !test_bit(MD_CLOSING, &mddev->flags))
|
||||
if (mddev_is_clustered(mddev) && is_reshaped &&
|
||||
mddev->pers->finish_reshape &&
|
||||
!test_bit(MD_CLOSING, &mddev->flags))
|
||||
mddev->cluster_ops->update_size(mddev, old_dev_sectors);
|
||||
/* flag recovery needed just to double check */
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
|
|
@ -10413,8 +10417,14 @@ bool rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
|
|||
else
|
||||
s += rdev->data_offset;
|
||||
|
||||
if (!badblocks_set(&rdev->badblocks, s, sectors, 0))
|
||||
if (!badblocks_set(&rdev->badblocks, s, sectors, 0)) {
|
||||
/*
|
||||
* Mark the disk as Faulty when setting badblocks fails,
|
||||
* otherwise, bad sectors may be read.
|
||||
*/
|
||||
md_error(mddev, rdev);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Make sure they get written out promptly */
|
||||
if (test_bit(ExternalBbl, &rdev->flags))
|
||||
|
|
|
|||
|
|
@ -22,6 +22,10 @@
|
|||
#include <trace/events/block.h>
|
||||
|
||||
#define MaxSector (~(sector_t)0)
|
||||
/*
|
||||
* Number of guaranteed raid bios in case of extreme VM load:
|
||||
*/
|
||||
#define NR_RAID_BIOS 256
|
||||
|
||||
enum md_submodule_type {
|
||||
MD_PERSONALITY = 0,
|
||||
|
|
@ -340,6 +344,9 @@ struct md_cluster_operations;
|
|||
* array is ready yet.
|
||||
* @MD_BROKEN: This is used to stop writes and mark array as failed.
|
||||
* @MD_DELETED: This device is being deleted
|
||||
* @MD_HAS_SUPERBLOCK: There is persistence sb in member disks.
|
||||
* @MD_FAILLAST_DEV: Allow last rdev to be removed.
|
||||
* @MD_SERIALIZE_POLICY: Enforce write IO is not reordered, just used by raid1.
|
||||
*
|
||||
* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
|
||||
*/
|
||||
|
|
@ -356,6 +363,9 @@ enum mddev_flags {
|
|||
MD_BROKEN,
|
||||
MD_DO_DELETE,
|
||||
MD_DELETED,
|
||||
MD_HAS_SUPERBLOCK,
|
||||
MD_FAILLAST_DEV,
|
||||
MD_SERIALIZE_POLICY,
|
||||
};
|
||||
|
||||
enum mddev_sb_flags {
|
||||
|
|
@ -495,12 +505,6 @@ struct mddev {
|
|||
int ok_start_degraded;
|
||||
|
||||
unsigned long recovery;
|
||||
/* If a RAID personality determines that recovery (of a particular
|
||||
* device) will fail due to a read error on the source device, it
|
||||
* takes a copy of this number and does not attempt recovery again
|
||||
* until this number changes.
|
||||
*/
|
||||
int recovery_disabled;
|
||||
|
||||
int in_sync; /* know to not need resync */
|
||||
/* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
|
||||
|
|
@ -622,10 +626,6 @@ struct mddev {
|
|||
|
||||
/* The sequence number for sync thread */
|
||||
atomic_t sync_seq;
|
||||
|
||||
bool has_superblocks:1;
|
||||
bool fail_last_dev:1;
|
||||
bool serialize_policy:1;
|
||||
};
|
||||
|
||||
enum recovery_flags {
|
||||
|
|
@ -646,8 +646,6 @@ enum recovery_flags {
|
|||
MD_RECOVERY_FROZEN,
|
||||
/* waiting for pers->start() to finish */
|
||||
MD_RECOVERY_WAIT,
|
||||
/* interrupted because io-error */
|
||||
MD_RECOVERY_ERROR,
|
||||
|
||||
/* flags determines sync action, see details in enum sync_action */
|
||||
|
||||
|
|
@ -737,8 +735,8 @@ static inline int mddev_trylock(struct mddev *mddev)
|
|||
int ret;
|
||||
|
||||
ret = mutex_trylock(&mddev->reconfig_mutex);
|
||||
if (!ret && test_bit(MD_DELETED, &mddev->flags)) {
|
||||
ret = -ENODEV;
|
||||
if (ret && test_bit(MD_DELETED, &mddev->flags)) {
|
||||
ret = 0;
|
||||
mutex_unlock(&mddev->reconfig_mutex);
|
||||
}
|
||||
return ret;
|
||||
|
|
@ -912,7 +910,8 @@ extern const char *md_sync_action_name(enum sync_action action);
|
|||
extern void md_write_start(struct mddev *mddev, struct bio *bi);
|
||||
extern void md_write_inc(struct mddev *mddev, struct bio *bi);
|
||||
extern void md_write_end(struct mddev *mddev);
|
||||
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
|
||||
extern void md_done_sync(struct mddev *mddev, int blocks);
|
||||
extern void md_sync_error(struct mddev *mddev);
|
||||
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
|
||||
extern void md_finish_reshape(struct mddev *mddev);
|
||||
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
|
||||
|
|
|
|||
|
|
@ -27,7 +27,9 @@ module_param(default_layout, int, 0644);
|
|||
(1L << MD_JOURNAL_CLEAN) | \
|
||||
(1L << MD_FAILFAST_SUPPORTED) |\
|
||||
(1L << MD_HAS_PPL) | \
|
||||
(1L << MD_HAS_MULTIPLE_PPLS))
|
||||
(1L << MD_HAS_MULTIPLE_PPLS) | \
|
||||
(1L << MD_FAILLAST_DEV) | \
|
||||
(1L << MD_SERIALIZE_POLICY))
|
||||
|
||||
/*
|
||||
* inform the user of the raid configuration
|
||||
|
|
|
|||
|
|
@ -3,11 +3,6 @@
|
|||
#define RESYNC_BLOCK_SIZE (64*1024)
|
||||
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
|
||||
|
||||
/*
|
||||
* Number of guaranteed raid bios in case of extreme VM load:
|
||||
*/
|
||||
#define NR_RAID_BIOS 256
|
||||
|
||||
/* when we get a read error on a read-only array, we redirect to another
|
||||
* device without failing the first device, or trying to over-write to
|
||||
* correct the read error. To keep track of bad blocks on a per-bio
|
||||
|
|
|
|||
|
|
@ -542,7 +542,7 @@ static void raid1_end_write_request(struct bio *bio)
|
|||
call_bio_endio(r1_bio);
|
||||
}
|
||||
}
|
||||
} else if (rdev->mddev->serialize_policy)
|
||||
} else if (test_bit(MD_SERIALIZE_POLICY, &rdev->mddev->flags))
|
||||
remove_serial(rdev, lo, hi);
|
||||
if (r1_bio->bios[mirror] == NULL)
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
|
|
@ -1644,7 +1644,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
|||
mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
|
||||
&mddev->bio_set);
|
||||
|
||||
if (mddev->serialize_policy)
|
||||
if (test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
|
||||
wait_for_serialization(rdev, r1_bio);
|
||||
}
|
||||
|
||||
|
|
@ -1746,7 +1746,7 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
|
|||
* - &mddev->degraded is bumped.
|
||||
*
|
||||
* @rdev is marked as &Faulty excluding case when array is failed and
|
||||
* &mddev->fail_last_dev is off.
|
||||
* MD_FAILLAST_DEV is not set.
|
||||
*/
|
||||
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
|
|
@ -1759,8 +1759,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
|
|||
(conf->raid_disks - mddev->degraded) == 1) {
|
||||
set_bit(MD_BROKEN, &mddev->flags);
|
||||
|
||||
if (!mddev->fail_last_dev) {
|
||||
conf->recovery_disabled = mddev->recovery_disabled;
|
||||
if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) {
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
return;
|
||||
}
|
||||
|
|
@ -1904,7 +1903,6 @@ static bool raid1_remove_conf(struct r1conf *conf, int disk)
|
|||
|
||||
/* Only remove non-faulty devices if recovery is not possible. */
|
||||
if (!test_bit(Faulty, &rdev->flags) &&
|
||||
rdev->mddev->recovery_disabled != conf->recovery_disabled &&
|
||||
rdev->mddev->degraded < conf->raid_disks)
|
||||
return false;
|
||||
|
||||
|
|
@ -1924,9 +1922,6 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
int first = 0;
|
||||
int last = conf->raid_disks - 1;
|
||||
|
||||
if (mddev->recovery_disabled == conf->recovery_disabled)
|
||||
return -EBUSY;
|
||||
|
||||
if (rdev->raid_disk >= 0)
|
||||
first = last = rdev->raid_disk;
|
||||
|
||||
|
|
@ -2062,7 +2057,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
} while (sectors_to_go > 0);
|
||||
}
|
||||
|
||||
static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
|
||||
static void put_sync_write_buf(struct r1bio *r1_bio)
|
||||
{
|
||||
if (atomic_dec_and_test(&r1_bio->remaining)) {
|
||||
struct mddev *mddev = r1_bio->mddev;
|
||||
|
|
@ -2073,20 +2068,19 @@ static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
|
|||
reschedule_retry(r1_bio);
|
||||
else {
|
||||
put_buf(r1_bio);
|
||||
md_done_sync(mddev, s, uptodate);
|
||||
md_done_sync(mddev, s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void end_sync_write(struct bio *bio)
|
||||
{
|
||||
int uptodate = !bio->bi_status;
|
||||
struct r1bio *r1_bio = get_resync_r1bio(bio);
|
||||
struct mddev *mddev = r1_bio->mddev;
|
||||
struct r1conf *conf = mddev->private;
|
||||
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
|
||||
|
||||
if (!uptodate) {
|
||||
if (bio->bi_status) {
|
||||
abort_sync_write(mddev, r1_bio);
|
||||
set_bit(WriteErrorSeen, &rdev->flags);
|
||||
if (!test_and_set_bit(WantReplacement, &rdev->flags))
|
||||
|
|
@ -2099,7 +2093,7 @@ static void end_sync_write(struct bio *bio)
|
|||
set_bit(R1BIO_MadeGood, &r1_bio->state);
|
||||
}
|
||||
|
||||
put_sync_write_buf(r1_bio, uptodate);
|
||||
put_sync_write_buf(r1_bio);
|
||||
}
|
||||
|
||||
static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
|
||||
|
|
@ -2116,8 +2110,7 @@ static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
|
|||
rdev->mddev->recovery);
|
||||
}
|
||||
/* need to record an error - either for the block or the device */
|
||||
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
|
||||
md_error(rdev->mddev, rdev);
|
||||
rdev_set_badblocks(rdev, sector, sectors, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -2348,9 +2341,8 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
*/
|
||||
if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) ||
|
||||
!fix_sync_read_error(r1_bio)) {
|
||||
conf->recovery_disabled = mddev->recovery_disabled;
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
md_done_sync(mddev, r1_bio->sectors, 0);
|
||||
md_done_sync(mddev, r1_bio->sectors);
|
||||
md_sync_error(mddev);
|
||||
put_buf(r1_bio);
|
||||
return;
|
||||
}
|
||||
|
|
@ -2385,7 +2377,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
|||
submit_bio_noacct(wbio);
|
||||
}
|
||||
|
||||
put_sync_write_buf(r1_bio, 1);
|
||||
put_sync_write_buf(r1_bio);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -2442,8 +2434,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
if (!success) {
|
||||
/* Cannot read from anywhere - mark it bad */
|
||||
struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
|
||||
if (!rdev_set_badblocks(rdev, sect, s, 0))
|
||||
md_error(mddev, rdev);
|
||||
rdev_set_badblocks(rdev, sect, s, 0);
|
||||
break;
|
||||
}
|
||||
/* write it back and re-read */
|
||||
|
|
@ -2487,7 +2478,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
}
|
||||
}
|
||||
|
||||
static bool narrow_write_error(struct r1bio *r1_bio, int i)
|
||||
static void narrow_write_error(struct r1bio *r1_bio, int i)
|
||||
{
|
||||
struct mddev *mddev = r1_bio->mddev;
|
||||
struct r1conf *conf = mddev->private;
|
||||
|
|
@ -2504,17 +2495,16 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
|
|||
* We currently own a reference on the rdev.
|
||||
*/
|
||||
|
||||
int block_sectors;
|
||||
int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9;
|
||||
sector_t sector;
|
||||
int sectors;
|
||||
int sect_to_write = r1_bio->sectors;
|
||||
bool ok = true;
|
||||
|
||||
if (rdev->badblocks.shift < 0)
|
||||
return false;
|
||||
block_sectors = lbs;
|
||||
else
|
||||
block_sectors = roundup(1 << rdev->badblocks.shift, lbs);
|
||||
|
||||
block_sectors = roundup(1 << rdev->badblocks.shift,
|
||||
bdev_logical_block_size(rdev->bdev) >> 9);
|
||||
sector = r1_bio->sector;
|
||||
sectors = ((sector + block_sectors)
|
||||
& ~(sector_t)(block_sectors - 1))
|
||||
|
|
@ -2542,18 +2532,21 @@ static bool narrow_write_error(struct r1bio *r1_bio, int i)
|
|||
bio_trim(wbio, sector - r1_bio->sector, sectors);
|
||||
wbio->bi_iter.bi_sector += rdev->data_offset;
|
||||
|
||||
if (submit_bio_wait(wbio) < 0)
|
||||
/* failure! */
|
||||
ok = rdev_set_badblocks(rdev, sector,
|
||||
sectors, 0)
|
||||
&& ok;
|
||||
if (submit_bio_wait(wbio) &&
|
||||
!rdev_set_badblocks(rdev, sector, sectors, 0)) {
|
||||
/*
|
||||
* Badblocks set failed, disk marked Faulty.
|
||||
* No further operations needed.
|
||||
*/
|
||||
bio_put(wbio);
|
||||
break;
|
||||
}
|
||||
|
||||
bio_put(wbio);
|
||||
sect_to_write -= sectors;
|
||||
sector += sectors;
|
||||
sectors = block_sectors;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
|
||||
|
|
@ -2566,17 +2559,14 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
|
|||
if (bio->bi_end_io == NULL)
|
||||
continue;
|
||||
if (!bio->bi_status &&
|
||||
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
|
||||
test_bit(R1BIO_MadeGood, &r1_bio->state))
|
||||
rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
|
||||
}
|
||||
if (bio->bi_status &&
|
||||
test_bit(R1BIO_WriteError, &r1_bio->state)) {
|
||||
if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
}
|
||||
test_bit(R1BIO_WriteError, &r1_bio->state))
|
||||
rdev_set_badblocks(rdev, r1_bio->sector, s, 0);
|
||||
}
|
||||
put_buf(r1_bio);
|
||||
md_done_sync(conf->mddev, s, 1);
|
||||
md_done_sync(conf->mddev, s);
|
||||
}
|
||||
|
||||
static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
|
||||
|
|
@ -2597,10 +2587,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
|
|||
* errors.
|
||||
*/
|
||||
fail = true;
|
||||
if (!narrow_write_error(r1_bio, m))
|
||||
md_error(conf->mddev,
|
||||
conf->mirrors[m].rdev);
|
||||
/* an I/O failed, we can't clear the bitmap */
|
||||
narrow_write_error(r1_bio, m);
|
||||
rdev_dec_pending(conf->mirrors[m].rdev,
|
||||
conf->mddev);
|
||||
}
|
||||
|
|
@ -2955,16 +2942,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
*skipped = 1;
|
||||
put_buf(r1_bio);
|
||||
|
||||
if (!ok) {
|
||||
/* Cannot record the badblocks, so need to
|
||||
if (!ok)
|
||||
/* Cannot record the badblocks, md_error has set INTR,
|
||||
* abort the resync.
|
||||
* If there are multiple read targets, could just
|
||||
* fail the really bad ones ???
|
||||
*/
|
||||
conf->recovery_disabled = mddev->recovery_disabled;
|
||||
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
|
||||
return 0;
|
||||
} else
|
||||
else
|
||||
return min_bad;
|
||||
|
||||
}
|
||||
|
|
@ -3151,7 +3134,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
|
|||
init_waitqueue_head(&conf->wait_barrier);
|
||||
|
||||
bio_list_init(&conf->pending_bio_list);
|
||||
conf->recovery_disabled = mddev->recovery_disabled - 1;
|
||||
|
||||
err = -EIO;
|
||||
for (i = 0; i < conf->raid_disks * 2; i++) {
|
||||
|
|
@ -3254,6 +3236,7 @@ static int raid1_run(struct mddev *mddev)
|
|||
if (!mddev_is_dm(mddev)) {
|
||||
ret = raid1_set_limits(mddev);
|
||||
if (ret) {
|
||||
md_unregister_thread(mddev, &conf->thread);
|
||||
if (!mddev->private)
|
||||
raid1_free(mddev, conf);
|
||||
return ret;
|
||||
|
|
|
|||
|
|
@ -93,11 +93,6 @@ struct r1conf {
|
|||
*/
|
||||
int fullsync;
|
||||
|
||||
/* When the same as mddev->recovery_disabled we don't allow
|
||||
* recovery to be attempted as we expect a read error.
|
||||
*/
|
||||
int recovery_disabled;
|
||||
|
||||
mempool_t *r1bio_pool;
|
||||
mempool_t r1buf_pool;
|
||||
|
||||
|
|
|
|||
|
|
@ -1990,7 +1990,7 @@ static int enough(struct r10conf *conf, int ignore)
|
|||
* - &mddev->degraded is bumped.
|
||||
*
|
||||
* @rdev is marked as &Faulty excluding case when array is failed and
|
||||
* &mddev->fail_last_dev is off.
|
||||
* MD_FAILLAST_DEV is not set.
|
||||
*/
|
||||
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
|
|
@ -2002,7 +2002,7 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
|
|||
if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
|
||||
set_bit(MD_BROKEN, &mddev->flags);
|
||||
|
||||
if (!mddev->fail_last_dev) {
|
||||
if (!test_bit(MD_FAILLAST_DEV, &mddev->flags)) {
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
return;
|
||||
}
|
||||
|
|
@ -2130,8 +2130,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
mirror = first;
|
||||
for ( ; mirror <= last ; mirror++) {
|
||||
p = &conf->mirrors[mirror];
|
||||
if (p->recovery_disabled == mddev->recovery_disabled)
|
||||
continue;
|
||||
if (p->rdev) {
|
||||
if (test_bit(WantReplacement, &p->rdev->flags) &&
|
||||
p->replacement == NULL && repl_slot < 0)
|
||||
|
|
@ -2143,7 +2141,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
if (err)
|
||||
return err;
|
||||
p->head_position = 0;
|
||||
p->recovery_disabled = mddev->recovery_disabled - 1;
|
||||
rdev->raid_disk = mirror;
|
||||
err = 0;
|
||||
if (rdev->saved_raid_disk != mirror)
|
||||
|
|
@ -2196,7 +2193,6 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
* is not possible.
|
||||
*/
|
||||
if (!test_bit(Faulty, &rdev->flags) &&
|
||||
mddev->recovery_disabled != p->recovery_disabled &&
|
||||
(!p->replacement || p->replacement == rdev) &&
|
||||
number < conf->geo.raid_disks &&
|
||||
enough(conf, -1)) {
|
||||
|
|
@ -2276,7 +2272,7 @@ static void end_sync_request(struct r10bio *r10_bio)
|
|||
reschedule_retry(r10_bio);
|
||||
else
|
||||
put_buf(r10_bio);
|
||||
md_done_sync(mddev, s, 1);
|
||||
md_done_sync(mddev, s);
|
||||
break;
|
||||
} else {
|
||||
struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
|
||||
|
|
@ -2452,7 +2448,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
|
||||
done:
|
||||
if (atomic_dec_and_test(&r10_bio->remaining)) {
|
||||
md_done_sync(mddev, r10_bio->sectors, 1);
|
||||
md_done_sync(mddev, r10_bio->sectors);
|
||||
put_buf(r10_bio);
|
||||
}
|
||||
}
|
||||
|
|
@ -2535,8 +2531,6 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
|
|||
pr_notice("md/raid10:%s: recovery aborted due to read error\n",
|
||||
mdname(mddev));
|
||||
|
||||
conf->mirrors[dw].recovery_disabled
|
||||
= mddev->recovery_disabled;
|
||||
set_bit(MD_RECOVERY_INTR,
|
||||
&mddev->recovery);
|
||||
break;
|
||||
|
|
@ -2604,8 +2598,7 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
|
|||
&rdev->mddev->recovery);
|
||||
}
|
||||
/* need to record an error - either for the block or the device */
|
||||
if (!rdev_set_badblocks(rdev, sector, sectors, 0))
|
||||
md_error(rdev->mddev, rdev);
|
||||
rdev_set_badblocks(rdev, sector, sectors, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -2686,7 +2679,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
|||
r10_bio->devs[slot].addr
|
||||
+ sect,
|
||||
s, 0)) {
|
||||
md_error(mddev, rdev);
|
||||
r10_bio->devs[slot].bio
|
||||
= IO_BLOCKED;
|
||||
}
|
||||
|
|
@ -2773,7 +2765,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
|||
}
|
||||
}
|
||||
|
||||
static bool narrow_write_error(struct r10bio *r10_bio, int i)
|
||||
static void narrow_write_error(struct r10bio *r10_bio, int i)
|
||||
{
|
||||
struct bio *bio = r10_bio->master_bio;
|
||||
struct mddev *mddev = r10_bio->mddev;
|
||||
|
|
@ -2790,17 +2782,16 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
|
|||
* We currently own a reference to the rdev.
|
||||
*/
|
||||
|
||||
int block_sectors;
|
||||
int block_sectors, lbs = bdev_logical_block_size(rdev->bdev) >> 9;
|
||||
sector_t sector;
|
||||
int sectors;
|
||||
int sect_to_write = r10_bio->sectors;
|
||||
bool ok = true;
|
||||
|
||||
if (rdev->badblocks.shift < 0)
|
||||
return false;
|
||||
block_sectors = lbs;
|
||||
else
|
||||
block_sectors = roundup(1 << rdev->badblocks.shift, lbs);
|
||||
|
||||
block_sectors = roundup(1 << rdev->badblocks.shift,
|
||||
bdev_logical_block_size(rdev->bdev) >> 9);
|
||||
sector = r10_bio->sector;
|
||||
sectors = ((r10_bio->sector + block_sectors)
|
||||
& ~(sector_t)(block_sectors - 1))
|
||||
|
|
@ -2820,18 +2811,21 @@ static bool narrow_write_error(struct r10bio *r10_bio, int i)
|
|||
choose_data_offset(r10_bio, rdev);
|
||||
wbio->bi_opf = REQ_OP_WRITE;
|
||||
|
||||
if (submit_bio_wait(wbio) < 0)
|
||||
/* Failure! */
|
||||
ok = rdev_set_badblocks(rdev, wsector,
|
||||
sectors, 0)
|
||||
&& ok;
|
||||
if (submit_bio_wait(wbio) &&
|
||||
!rdev_set_badblocks(rdev, wsector, sectors, 0)) {
|
||||
/*
|
||||
* Badblocks set failed, disk marked Faulty.
|
||||
* No further operations needed.
|
||||
*/
|
||||
bio_put(wbio);
|
||||
break;
|
||||
}
|
||||
|
||||
bio_put(wbio);
|
||||
sect_to_write -= sectors;
|
||||
sector += sectors;
|
||||
sectors = block_sectors;
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
|
||||
static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
|
|
@ -2891,35 +2885,29 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
|
|||
if (r10_bio->devs[m].bio == NULL ||
|
||||
r10_bio->devs[m].bio->bi_end_io == NULL)
|
||||
continue;
|
||||
if (!r10_bio->devs[m].bio->bi_status) {
|
||||
if (!r10_bio->devs[m].bio->bi_status)
|
||||
rdev_clear_badblocks(
|
||||
rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0);
|
||||
} else {
|
||||
if (!rdev_set_badblocks(
|
||||
rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
}
|
||||
else
|
||||
rdev_set_badblocks(rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0);
|
||||
rdev = conf->mirrors[dev].replacement;
|
||||
if (r10_bio->devs[m].repl_bio == NULL ||
|
||||
r10_bio->devs[m].repl_bio->bi_end_io == NULL)
|
||||
continue;
|
||||
|
||||
if (!r10_bio->devs[m].repl_bio->bi_status) {
|
||||
if (!r10_bio->devs[m].repl_bio->bi_status)
|
||||
rdev_clear_badblocks(
|
||||
rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0);
|
||||
} else {
|
||||
if (!rdev_set_badblocks(
|
||||
rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
}
|
||||
else
|
||||
rdev_set_badblocks(rdev,
|
||||
r10_bio->devs[m].addr,
|
||||
r10_bio->sectors, 0);
|
||||
}
|
||||
put_buf(r10_bio);
|
||||
} else {
|
||||
|
|
@ -2936,8 +2924,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
|
|||
rdev_dec_pending(rdev, conf->mddev);
|
||||
} else if (bio != NULL && bio->bi_status) {
|
||||
fail = true;
|
||||
if (!narrow_write_error(r10_bio, m))
|
||||
md_error(conf->mddev, rdev);
|
||||
narrow_write_error(r10_bio, m);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
bio = r10_bio->devs[m].repl_bio;
|
||||
|
|
@ -3168,11 +3155,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
int i;
|
||||
int max_sync;
|
||||
sector_t sync_blocks;
|
||||
sector_t sectors_skipped = 0;
|
||||
int chunks_skipped = 0;
|
||||
sector_t chunk_mask = conf->geo.chunk_mask;
|
||||
int page_idx = 0;
|
||||
int error_disk = -1;
|
||||
|
||||
/*
|
||||
* Allow skipping a full rebuild for incremental assembly
|
||||
|
|
@ -3193,7 +3177,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
if (init_resync(conf))
|
||||
return 0;
|
||||
|
||||
skipped:
|
||||
if (sector_nr >= max_sector) {
|
||||
conf->cluster_sync_low = 0;
|
||||
conf->cluster_sync_high = 0;
|
||||
|
|
@ -3245,33 +3228,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
mddev->bitmap_ops->close_sync(mddev);
|
||||
close_sync(conf);
|
||||
*skipped = 1;
|
||||
return sectors_skipped;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
|
||||
return reshape_request(mddev, sector_nr, skipped);
|
||||
|
||||
if (chunks_skipped >= conf->geo.raid_disks) {
|
||||
pr_err("md/raid10:%s: %s fails\n", mdname(mddev),
|
||||
test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery");
|
||||
if (error_disk >= 0 &&
|
||||
!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
|
||||
/*
|
||||
* recovery fails, set mirrors.recovery_disabled,
|
||||
* device shouldn't be added to there.
|
||||
*/
|
||||
conf->mirrors[error_disk].recovery_disabled =
|
||||
mddev->recovery_disabled;
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
* if there has been nothing to do on any drive,
|
||||
* then there is nothing to do at all.
|
||||
*/
|
||||
*skipped = 1;
|
||||
return (max_sector - sector_nr) + sectors_skipped;
|
||||
}
|
||||
|
||||
if (max_sector > mddev->resync_max)
|
||||
max_sector = mddev->resync_max; /* Don't do IO beyond here */
|
||||
|
||||
|
|
@ -3354,7 +3316,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
/* yep, skip the sync_blocks here, but don't assume
|
||||
* that there will never be anything to do here
|
||||
*/
|
||||
chunks_skipped = -1;
|
||||
continue;
|
||||
}
|
||||
if (mrdev)
|
||||
|
|
@ -3402,7 +3363,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
!test_bit(In_sync, &rdev->flags))
|
||||
continue;
|
||||
/* This is where we read from */
|
||||
any_working = 1;
|
||||
sector = r10_bio->devs[j].addr;
|
||||
|
||||
if (is_badblock(rdev, sector, max_sync,
|
||||
|
|
@ -3417,6 +3377,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
continue;
|
||||
}
|
||||
}
|
||||
any_working = 1;
|
||||
bio = r10_bio->devs[0].bio;
|
||||
bio->bi_next = biolist;
|
||||
biolist = bio;
|
||||
|
|
@ -3485,29 +3446,19 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
for (k = 0; k < conf->copies; k++)
|
||||
if (r10_bio->devs[k].devnum == i)
|
||||
break;
|
||||
if (mrdev && !test_bit(In_sync,
|
||||
&mrdev->flags)
|
||||
&& !rdev_set_badblocks(
|
||||
mrdev,
|
||||
r10_bio->devs[k].addr,
|
||||
max_sync, 0))
|
||||
any_working = 0;
|
||||
if (mreplace &&
|
||||
!rdev_set_badblocks(
|
||||
mreplace,
|
||||
r10_bio->devs[k].addr,
|
||||
max_sync, 0))
|
||||
any_working = 0;
|
||||
}
|
||||
if (!any_working) {
|
||||
if (!test_and_set_bit(MD_RECOVERY_INTR,
|
||||
&mddev->recovery))
|
||||
pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
|
||||
mdname(mddev));
|
||||
mirror->recovery_disabled
|
||||
= mddev->recovery_disabled;
|
||||
} else {
|
||||
error_disk = i;
|
||||
if (mrdev &&
|
||||
!test_bit(In_sync, &mrdev->flags))
|
||||
rdev_set_badblocks(
|
||||
mrdev,
|
||||
r10_bio->devs[k].addr,
|
||||
max_sync, 0);
|
||||
if (mreplace)
|
||||
rdev_set_badblocks(
|
||||
mreplace,
|
||||
r10_bio->devs[k].addr,
|
||||
max_sync, 0);
|
||||
pr_warn("md/raid10:%s: cannot recovery sector %llu + %d.\n",
|
||||
mdname(mddev), r10_bio->devs[k].addr, max_sync);
|
||||
}
|
||||
put_buf(r10_bio);
|
||||
if (rb2)
|
||||
|
|
@ -3548,7 +3499,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
rb2->master_bio = NULL;
|
||||
put_buf(rb2);
|
||||
}
|
||||
goto giveup;
|
||||
*skipped = 1;
|
||||
return max_sync;
|
||||
}
|
||||
} else {
|
||||
/* resync. Schedule a read for every block at this virt offset */
|
||||
|
|
@ -3572,7 +3524,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
&mddev->recovery)) {
|
||||
/* We can skip this block */
|
||||
*skipped = 1;
|
||||
return sync_blocks + sectors_skipped;
|
||||
return sync_blocks;
|
||||
}
|
||||
if (sync_blocks < max_sync)
|
||||
max_sync = sync_blocks;
|
||||
|
|
@ -3664,8 +3616,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
mddev);
|
||||
}
|
||||
put_buf(r10_bio);
|
||||
biolist = NULL;
|
||||
goto giveup;
|
||||
*skipped = 1;
|
||||
return max_sync;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3685,7 +3637,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
|
||||
bio->bi_status = BLK_STS_RESOURCE;
|
||||
bio_endio(bio);
|
||||
goto giveup;
|
||||
*skipped = 1;
|
||||
return max_sync;
|
||||
}
|
||||
}
|
||||
nr_sectors += len>>9;
|
||||
|
|
@ -3753,25 +3706,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
|||
}
|
||||
}
|
||||
|
||||
if (sectors_skipped)
|
||||
/* pretend they weren't skipped, it makes
|
||||
* no important difference in this case
|
||||
*/
|
||||
md_done_sync(mddev, sectors_skipped, 1);
|
||||
|
||||
return sectors_skipped + nr_sectors;
|
||||
giveup:
|
||||
/* There is nowhere to write, so all non-sync
|
||||
* drives must be failed or in resync, all drives
|
||||
* have a bad block, so try the next chunk...
|
||||
*/
|
||||
if (sector_nr + max_sync < max_sector)
|
||||
max_sector = sector_nr + max_sync;
|
||||
|
||||
sectors_skipped += (max_sector - sector_nr);
|
||||
chunks_skipped ++;
|
||||
sector_nr = max_sector;
|
||||
goto skipped;
|
||||
return nr_sectors;
|
||||
}
|
||||
|
||||
static sector_t
|
||||
|
|
@ -4134,8 +4069,6 @@ static int raid10_run(struct mddev *mddev)
|
|||
disk->replacement->saved_raid_disk < 0) {
|
||||
conf->fullsync = 1;
|
||||
}
|
||||
|
||||
disk->recovery_disabled = mddev->recovery_disabled - 1;
|
||||
}
|
||||
|
||||
if (mddev->resync_offset != MaxSector)
|
||||
|
|
@ -4913,7 +4846,8 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
|||
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
|
||||
if (handle_reshape_read_error(mddev, r10_bio) < 0) {
|
||||
/* Reshape has been aborted */
|
||||
md_done_sync(mddev, r10_bio->sectors, 0);
|
||||
md_done_sync(mddev, r10_bio->sectors);
|
||||
md_sync_error(mddev);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -5071,7 +5005,7 @@ static void end_reshape_request(struct r10bio *r10_bio)
|
|||
{
|
||||
if (!atomic_dec_and_test(&r10_bio->remaining))
|
||||
return;
|
||||
md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
|
||||
md_done_sync(r10_bio->mddev, r10_bio->sectors);
|
||||
bio_put(r10_bio->master_bio);
|
||||
put_buf(r10_bio);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,11 +18,6 @@
|
|||
struct raid10_info {
|
||||
struct md_rdev *rdev, *replacement;
|
||||
sector_t head_position;
|
||||
int recovery_disabled; /* matches
|
||||
* mddev->recovery_disabled
|
||||
* when we shouldn't try
|
||||
* recovering this device.
|
||||
*/
|
||||
};
|
||||
|
||||
struct r10conf {
|
||||
|
|
|
|||
|
|
@ -56,7 +56,11 @@
|
|||
#include "md-bitmap.h"
|
||||
#include "raid5-log.h"
|
||||
|
||||
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
|
||||
#define UNSUPPORTED_MDDEV_FLAGS \
|
||||
((1L << MD_FAILFAST_SUPPORTED) | \
|
||||
(1L << MD_FAILLAST_DEV) | \
|
||||
(1L << MD_SERIALIZE_POLICY))
|
||||
|
||||
|
||||
#define cpu_to_group(cpu) cpu_to_node(cpu)
|
||||
#define ANY_GROUP NUMA_NO_NODE
|
||||
|
|
@ -773,14 +777,14 @@ struct stripe_request_ctx {
|
|||
/* last sector in the request */
|
||||
sector_t last_sector;
|
||||
|
||||
/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
|
||||
bool do_flush;
|
||||
|
||||
/*
|
||||
* bitmap to track stripe sectors that have been added to stripes
|
||||
* add one to account for unaligned requests
|
||||
*/
|
||||
DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
|
||||
|
||||
/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
|
||||
bool do_flush;
|
||||
unsigned long sectors_to_do[];
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -2817,11 +2821,9 @@ static void raid5_end_read_request(struct bio * bi)
|
|||
else {
|
||||
clear_bit(R5_ReadError, &sh->dev[i].flags);
|
||||
clear_bit(R5_ReWrite, &sh->dev[i].flags);
|
||||
if (!(set_bad
|
||||
&& test_bit(In_sync, &rdev->flags)
|
||||
&& rdev_set_badblocks(
|
||||
rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0)))
|
||||
md_error(conf->mddev, rdev);
|
||||
if (!(set_bad && test_bit(In_sync, &rdev->flags)))
|
||||
rdev_set_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
}
|
||||
}
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
|
|
@ -2920,7 +2922,6 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
|
|||
|
||||
if (has_failed(conf)) {
|
||||
set_bit(MD_BROKEN, &conf->mddev->flags);
|
||||
conf->recovery_disabled = mddev->recovery_disabled;
|
||||
|
||||
pr_crit("md/raid:%s: Cannot continue operation (%d/%d failed).\n",
|
||||
mdname(mddev), mddev->degraded, conf->raid_disks);
|
||||
|
|
@ -3599,11 +3600,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
|||
else
|
||||
rdev = NULL;
|
||||
if (rdev) {
|
||||
if (!rdev_set_badblocks(
|
||||
rdev,
|
||||
sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
rdev_set_badblocks(rdev,
|
||||
sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf),
|
||||
0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
}
|
||||
|
|
@ -3723,11 +3723,11 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
|
|||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
abort = 1;
|
||||
}
|
||||
if (abort)
|
||||
conf->recovery_disabled =
|
||||
conf->mddev->recovery_disabled;
|
||||
}
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
|
||||
|
||||
if (abort)
|
||||
md_sync_error(conf->mddev);
|
||||
}
|
||||
|
||||
static int want_replace(struct stripe_head *sh, int disk_idx)
|
||||
|
|
@ -3751,9 +3751,14 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
|
|||
struct r5dev *dev = &sh->dev[disk_idx];
|
||||
struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
|
||||
&sh->dev[s->failed_num[1]] };
|
||||
struct mddev *mddev = sh->raid_conf->mddev;
|
||||
bool force_rcw = false;
|
||||
int i;
|
||||
bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
|
||||
|
||||
if (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW ||
|
||||
(mddev->bitmap_ops && mddev->bitmap_ops->blocks_synced &&
|
||||
!mddev->bitmap_ops->blocks_synced(mddev, sh->sector)))
|
||||
force_rcw = true;
|
||||
|
||||
if (test_bit(R5_LOCKED, &dev->flags) ||
|
||||
test_bit(R5_UPTODATE, &dev->flags))
|
||||
|
|
@ -5157,7 +5162,7 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
if ((s.syncing || s.replacing) && s.locked == 0 &&
|
||||
!test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
|
||||
test_bit(STRIPE_INSYNC, &sh->state)) {
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
|
||||
clear_bit(STRIPE_SYNCING, &sh->state);
|
||||
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
|
||||
wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
|
||||
|
|
@ -5224,7 +5229,7 @@ static void handle_stripe(struct stripe_head *sh)
|
|||
clear_bit(STRIPE_EXPAND_READY, &sh->state);
|
||||
atomic_dec(&conf->reshape_stripes);
|
||||
wake_up(&conf->wait_for_reshape);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
|
||||
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf));
|
||||
}
|
||||
|
||||
if (s.expanding && s.locked == 0 &&
|
||||
|
|
@ -5253,9 +5258,8 @@ finish:
|
|||
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
|
||||
/* We own a safe reference to the rdev */
|
||||
rdev = conf->disks[i].rdev;
|
||||
if (!rdev_set_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
rdev_set_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
|
||||
|
|
@ -6080,13 +6084,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
|
|||
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
||||
{
|
||||
DEFINE_WAIT_FUNC(wait, woken_wake_function);
|
||||
bool on_wq;
|
||||
struct r5conf *conf = mddev->private;
|
||||
sector_t logical_sector;
|
||||
struct stripe_request_ctx ctx = {};
|
||||
const int rw = bio_data_dir(bi);
|
||||
struct stripe_request_ctx *ctx;
|
||||
sector_t logical_sector;
|
||||
enum stripe_result res;
|
||||
int s, stripe_cnt;
|
||||
bool on_wq;
|
||||
|
||||
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
|
||||
int ret = log_handle_flush_request(conf, bi);
|
||||
|
|
@ -6098,11 +6102,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
return true;
|
||||
}
|
||||
/* ret == -EAGAIN, fallback */
|
||||
/*
|
||||
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
|
||||
* we need to flush journal device
|
||||
*/
|
||||
ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
|
||||
}
|
||||
|
||||
md_write_start(mddev, bi);
|
||||
|
|
@ -6125,16 +6124,25 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
}
|
||||
|
||||
logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
|
||||
ctx.first_sector = logical_sector;
|
||||
ctx.last_sector = bio_end_sector(bi);
|
||||
bi->bi_next = NULL;
|
||||
|
||||
stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
|
||||
ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO);
|
||||
memset(ctx, 0, conf->ctx_size);
|
||||
ctx->first_sector = logical_sector;
|
||||
ctx->last_sector = bio_end_sector(bi);
|
||||
/*
|
||||
* if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
|
||||
* we need to flush journal device
|
||||
*/
|
||||
if (unlikely(bi->bi_opf & REQ_PREFLUSH))
|
||||
ctx->do_flush = true;
|
||||
|
||||
stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector,
|
||||
RAID5_STRIPE_SECTORS(conf));
|
||||
bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
|
||||
bitmap_set(ctx->sectors_to_do, 0, stripe_cnt);
|
||||
|
||||
pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
|
||||
bi->bi_iter.bi_sector, ctx.last_sector);
|
||||
bi->bi_iter.bi_sector, ctx->last_sector);
|
||||
|
||||
/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
|
||||
if ((bi->bi_opf & REQ_NOWAIT) &&
|
||||
|
|
@ -6142,6 +6150,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
bio_wouldblock_error(bi);
|
||||
if (rw == WRITE)
|
||||
md_write_end(mddev);
|
||||
mempool_free(ctx, conf->ctx_pool);
|
||||
return true;
|
||||
}
|
||||
md_account_bio(mddev, &bi);
|
||||
|
|
@ -6160,10 +6169,10 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
add_wait_queue(&conf->wait_for_reshape, &wait);
|
||||
on_wq = true;
|
||||
}
|
||||
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
|
||||
s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf);
|
||||
|
||||
while (1) {
|
||||
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
|
||||
res = make_stripe_request(mddev, conf, ctx, logical_sector,
|
||||
bi);
|
||||
if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
|
||||
break;
|
||||
|
|
@ -6180,9 +6189,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
* raid5_activate_delayed() from making progress
|
||||
* and thus deadlocking.
|
||||
*/
|
||||
if (ctx.batch_last) {
|
||||
raid5_release_stripe(ctx.batch_last);
|
||||
ctx.batch_last = NULL;
|
||||
if (ctx->batch_last) {
|
||||
raid5_release_stripe(ctx->batch_last);
|
||||
ctx->batch_last = NULL;
|
||||
}
|
||||
|
||||
wait_woken(&wait, TASK_UNINTERRUPTIBLE,
|
||||
|
|
@ -6190,21 +6199,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
|
|||
continue;
|
||||
}
|
||||
|
||||
s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
|
||||
s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s);
|
||||
if (s == stripe_cnt)
|
||||
break;
|
||||
|
||||
logical_sector = ctx.first_sector +
|
||||
logical_sector = ctx->first_sector +
|
||||
(s << RAID5_STRIPE_SHIFT(conf));
|
||||
}
|
||||
if (unlikely(on_wq))
|
||||
remove_wait_queue(&conf->wait_for_reshape, &wait);
|
||||
|
||||
if (ctx.batch_last)
|
||||
raid5_release_stripe(ctx.batch_last);
|
||||
if (ctx->batch_last)
|
||||
raid5_release_stripe(ctx->batch_last);
|
||||
|
||||
if (rw == WRITE)
|
||||
md_write_end(mddev);
|
||||
|
||||
mempool_free(ctx, conf->ctx_pool);
|
||||
if (res == STRIPE_WAIT_RESHAPE) {
|
||||
md_free_cloned_bio(bi);
|
||||
return false;
|
||||
|
|
@ -7374,6 +7385,9 @@ static void free_conf(struct r5conf *conf)
|
|||
bioset_exit(&conf->bio_split);
|
||||
kfree(conf->stripe_hashtbl);
|
||||
kfree(conf->pending_data);
|
||||
|
||||
mempool_destroy(conf->ctx_pool);
|
||||
|
||||
kfree(conf);
|
||||
}
|
||||
|
||||
|
|
@ -7536,8 +7550,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
|
|||
}
|
||||
|
||||
conf->bypass_threshold = BYPASS_THRESHOLD;
|
||||
conf->recovery_disabled = mddev->recovery_disabled - 1;
|
||||
|
||||
conf->raid_disks = mddev->raid_disks;
|
||||
if (mddev->reshape_position == MaxSector)
|
||||
conf->previous_raid_disks = mddev->raid_disks;
|
||||
|
|
@ -7729,6 +7741,25 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int raid5_create_ctx_pool(struct r5conf *conf)
|
||||
{
|
||||
struct stripe_request_ctx *ctx;
|
||||
int size;
|
||||
|
||||
if (mddev_is_dm(conf->mddev))
|
||||
size = BITS_TO_LONGS(RAID5_MAX_REQ_STRIPES);
|
||||
else
|
||||
size = BITS_TO_LONGS(
|
||||
queue_max_hw_sectors(conf->mddev->gendisk->queue) >>
|
||||
RAID5_STRIPE_SHIFT(conf));
|
||||
|
||||
conf->ctx_size = struct_size(ctx, sectors_to_do, size);
|
||||
conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
|
||||
conf->ctx_size);
|
||||
|
||||
return conf->ctx_pool ? 0 : -ENOMEM;
|
||||
}
|
||||
|
||||
static int raid5_set_limits(struct mddev *mddev)
|
||||
{
|
||||
struct r5conf *conf = mddev->private;
|
||||
|
|
@ -7785,6 +7816,8 @@ static int raid5_set_limits(struct mddev *mddev)
|
|||
* Limit the max sectors based on this.
|
||||
*/
|
||||
lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
|
||||
if ((lim.max_hw_sectors << 9) < lim.io_opt)
|
||||
lim.max_hw_sectors = lim.io_opt >> 9;
|
||||
|
||||
/* No restrictions on the number of segments in the request */
|
||||
lim.max_segments = USHRT_MAX;
|
||||
|
|
@ -8057,7 +8090,12 @@ static int raid5_run(struct mddev *mddev)
|
|||
goto abort;
|
||||
}
|
||||
|
||||
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
|
||||
ret = raid5_create_ctx_pool(conf);
|
||||
if (ret)
|
||||
goto abort;
|
||||
|
||||
ret = log_init(conf, journal_dev, raid5_has_ppl(conf));
|
||||
if (ret)
|
||||
goto abort;
|
||||
|
||||
return 0;
|
||||
|
|
@ -8211,7 +8249,6 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
* isn't possible.
|
||||
*/
|
||||
if (!test_bit(Faulty, &rdev->flags) &&
|
||||
mddev->recovery_disabled != conf->recovery_disabled &&
|
||||
!has_failed(conf) &&
|
||||
(!p->replacement || p->replacement == rdev) &&
|
||||
number < conf->raid_disks) {
|
||||
|
|
@ -8272,8 +8309,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
|||
|
||||
return 0;
|
||||
}
|
||||
if (mddev->recovery_disabled == conf->recovery_disabled)
|
||||
return -EBUSY;
|
||||
|
||||
if (rdev->saved_raid_disk < 0 && has_failed(conf))
|
||||
/* no point adding a device */
|
||||
|
|
|
|||
|
|
@ -640,7 +640,6 @@ struct r5conf {
|
|||
* (fresh device added).
|
||||
* Cleared when a sync completes.
|
||||
*/
|
||||
int recovery_disabled;
|
||||
/* per cpu variables */
|
||||
struct raid5_percpu __percpu *percpu;
|
||||
int scribble_disks;
|
||||
|
|
@ -690,6 +689,9 @@ struct r5conf {
|
|||
struct list_head pending_list;
|
||||
int pending_data_cnt;
|
||||
struct r5pending_data *next_pending_data;
|
||||
|
||||
mempool_t *ctx_pool;
|
||||
int ctx_size;
|
||||
};
|
||||
|
||||
#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
|
||||
|
|
|
|||
|
|
@ -1333,7 +1333,8 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
|
||||
blk_status_t status)
|
||||
blk_status_t status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = rq->end_io_data;
|
||||
unsigned long rtt = jiffies - (rq->deadline - rq->timeout);
|
||||
|
|
|
|||
|
|
@ -410,7 +410,8 @@ static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
|
||||
blk_status_t err)
|
||||
blk_status_t err,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = req->end_io_data;
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
|
|
@ -425,14 +426,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
|
|||
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
/*
|
||||
* IOPOLL could potentially complete this request directly, but
|
||||
* if multiple rings are polling on the same queue, then it's possible
|
||||
* for one ring to find completions for another ring. Punting the
|
||||
* completion via task_work will always direct it to the right
|
||||
* location, rather than potentially complete requests for ringA
|
||||
* under iopoll invocations from ringB.
|
||||
* For IOPOLL, check if this completion is happening in the context
|
||||
* of the same io_ring that owns the request (local context). If so,
|
||||
* we can complete inline without task_work overhead. Otherwise, we
|
||||
* must punt to task_work to ensure completion happens in the correct
|
||||
* ring's context.
|
||||
*/
|
||||
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
if (blk_rq_is_poll(req) && iob &&
|
||||
iob->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
|
||||
if (pdu->bio)
|
||||
blk_rq_unmap_user(pdu->bio);
|
||||
io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
|
||||
} else {
|
||||
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
}
|
||||
return RQ_END_IO_FREE;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -290,14 +290,14 @@ struct nvme_iod {
|
|||
u8 flags;
|
||||
u8 nr_descriptors;
|
||||
|
||||
unsigned int total_len;
|
||||
size_t total_len;
|
||||
struct dma_iova_state dma_state;
|
||||
void *descriptors[NVME_MAX_NR_DESCRIPTORS];
|
||||
struct nvme_dma_vec *dma_vecs;
|
||||
unsigned int nr_dma_vecs;
|
||||
|
||||
dma_addr_t meta_dma;
|
||||
unsigned int meta_total_len;
|
||||
size_t meta_total_len;
|
||||
struct dma_iova_state meta_dma_state;
|
||||
struct nvme_sgl_desc *meta_descriptor;
|
||||
};
|
||||
|
|
@ -845,11 +845,9 @@ static bool nvme_pci_prp_save_mapping(struct request *req,
|
|||
static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
|
||||
struct blk_dma_iter *iter)
|
||||
{
|
||||
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
|
||||
|
||||
if (iter->len)
|
||||
return true;
|
||||
if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
|
||||
if (!blk_rq_dma_map_iter_next(req, dma_dev, iter))
|
||||
return false;
|
||||
return nvme_pci_prp_save_mapping(req, dma_dev, iter);
|
||||
}
|
||||
|
|
@ -1024,8 +1022,7 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
|
|||
}
|
||||
nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
|
||||
iod->total_len += iter->len;
|
||||
} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
|
||||
iter));
|
||||
} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, iter));
|
||||
|
||||
nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
|
||||
if (unlikely(iter->status))
|
||||
|
|
@ -1634,7 +1631,8 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
|
|||
return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
|
||||
}
|
||||
|
||||
static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error)
|
||||
static enum rq_end_io_ret abort_endio(struct request *req, blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
|
||||
|
||||
|
|
@ -2877,7 +2875,8 @@ out_unlock:
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
|
||||
blk_status_t error)
|
||||
blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct nvme_queue *nvmeq = req->end_io_data;
|
||||
|
||||
|
|
@ -2887,14 +2886,15 @@ static enum rq_end_io_ret nvme_del_queue_end(struct request *req,
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_del_cq_end(struct request *req,
|
||||
blk_status_t error)
|
||||
blk_status_t error,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct nvme_queue *nvmeq = req->end_io_data;
|
||||
|
||||
if (error)
|
||||
set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
|
||||
|
||||
return nvme_del_queue_end(req, error);
|
||||
return nvme_del_queue_end(req, error, iob);
|
||||
}
|
||||
|
||||
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
|
||||
|
|
|
|||
|
|
@ -298,7 +298,7 @@ static void nvmet_execute_get_log_page_rmi(struct nvmet_req *req)
|
|||
if (status)
|
||||
goto out;
|
||||
|
||||
if (!req->ns->bdev || bdev_nonrot(req->ns->bdev)) {
|
||||
if (!req->ns->bdev || !bdev_rot(req->ns->bdev)) {
|
||||
status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
|
||||
goto out;
|
||||
}
|
||||
|
|
@ -1084,7 +1084,7 @@ static void nvmet_execute_id_cs_indep(struct nvmet_req *req)
|
|||
id->nmic = NVME_NS_NMIC_SHARED;
|
||||
if (req->ns->readonly)
|
||||
id->nsattr |= NVME_NS_ATTR_RO;
|
||||
if (req->ns->bdev && !bdev_nonrot(req->ns->bdev))
|
||||
if (req->ns->bdev && bdev_rot(req->ns->bdev))
|
||||
id->nsfeat |= NVME_NS_ROTATIONAL;
|
||||
/*
|
||||
* We need flush command to flush the file's metadata,
|
||||
|
|
|
|||
|
|
@ -247,7 +247,8 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
|
||||
blk_status_t blk_status)
|
||||
blk_status_t blk_status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct nvmet_req *req = rq->end_io_data;
|
||||
|
||||
|
|
|
|||
|
|
@ -2118,7 +2118,8 @@ maybe_retry:
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret eh_lock_door_done(struct request *req,
|
||||
blk_status_t status)
|
||||
blk_status_t status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
blk_mq_free_request(req);
|
||||
return RQ_END_IO_NONE;
|
||||
|
|
|
|||
|
|
@ -177,7 +177,8 @@ typedef struct sg_device { /* holds the state of each scsi generic device */
|
|||
} Sg_device;
|
||||
|
||||
/* tasklet or soft irq callback */
|
||||
static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status);
|
||||
static enum rq_end_io_ret sg_rq_end_io(struct request *rq, blk_status_t status,
|
||||
const struct io_comp_batch *iob);
|
||||
static int sg_start_req(Sg_request *srp, unsigned char *cmd);
|
||||
static int sg_finish_rem_req(Sg_request * srp);
|
||||
static int sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size);
|
||||
|
|
@ -1309,7 +1310,8 @@ sg_rq_end_io_usercontext(struct work_struct *work)
|
|||
* level when a command is completed (or has failed).
|
||||
*/
|
||||
static enum rq_end_io_ret
|
||||
sg_rq_end_io(struct request *rq, blk_status_t status)
|
||||
sg_rq_end_io(struct request *rq, blk_status_t status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
|
||||
struct sg_request *srp = rq->end_io_data;
|
||||
|
|
|
|||
|
|
@ -525,7 +525,8 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret st_scsi_execute_end(struct request *req,
|
||||
blk_status_t status)
|
||||
blk_status_t status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
|
||||
struct st_request *SRpnt = req->end_io_data;
|
||||
|
|
|
|||
|
|
@ -39,7 +39,8 @@ static inline struct pscsi_dev_virt *PSCSI_DEV(struct se_device *dev)
|
|||
}
|
||||
|
||||
static sense_reason_t pscsi_execute_cmd(struct se_cmd *cmd);
|
||||
static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t);
|
||||
static enum rq_end_io_ret pscsi_req_done(struct request *, blk_status_t,
|
||||
const struct io_comp_batch *);
|
||||
|
||||
/* pscsi_attach_hba():
|
||||
*
|
||||
|
|
@ -1001,7 +1002,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev)
|
|||
}
|
||||
|
||||
static enum rq_end_io_ret pscsi_req_done(struct request *req,
|
||||
blk_status_t status)
|
||||
blk_status_t status,
|
||||
const struct io_comp_batch *iob)
|
||||
{
|
||||
struct se_cmd *cmd = req->end_io_data;
|
||||
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req);
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/quotaops.h>
|
||||
#include <linux/highmem.h>
|
||||
|
|
@ -2821,7 +2822,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
|
|||
wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
|
||||
}
|
||||
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
|
||||
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
|
||||
|
|
|
|||
|
|
@ -47,50 +47,71 @@ bool fscrypt_decrypt_bio(struct bio *bio)
|
|||
}
|
||||
EXPORT_SYMBOL(fscrypt_decrypt_bio);
|
||||
|
||||
struct fscrypt_zero_done {
|
||||
atomic_t pending;
|
||||
blk_status_t status;
|
||||
struct completion done;
|
||||
};
|
||||
|
||||
static void fscrypt_zeroout_range_done(struct fscrypt_zero_done *done)
|
||||
{
|
||||
if (atomic_dec_and_test(&done->pending))
|
||||
complete(&done->done);
|
||||
}
|
||||
|
||||
static void fscrypt_zeroout_range_end_io(struct bio *bio)
|
||||
{
|
||||
struct fscrypt_zero_done *done = bio->bi_private;
|
||||
|
||||
if (bio->bi_status)
|
||||
cmpxchg(&done->status, 0, bio->bi_status);
|
||||
fscrypt_zeroout_range_done(done);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode,
|
||||
pgoff_t lblk, sector_t pblk,
|
||||
pgoff_t lblk, sector_t sector,
|
||||
unsigned int len)
|
||||
{
|
||||
const unsigned int blockbits = inode->i_blkbits;
|
||||
const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits);
|
||||
struct bio *bio;
|
||||
int ret, err = 0;
|
||||
int num_pages = 0;
|
||||
|
||||
/* This always succeeds since __GFP_DIRECT_RECLAIM is set. */
|
||||
bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
|
||||
GFP_NOFS);
|
||||
struct fscrypt_zero_done done = {
|
||||
.pending = ATOMIC_INIT(1),
|
||||
.done = COMPLETION_INITIALIZER_ONSTACK(done.done),
|
||||
};
|
||||
|
||||
while (len) {
|
||||
unsigned int blocks_this_page = min(len, blocks_per_page);
|
||||
unsigned int bytes_this_page = blocks_this_page << blockbits;
|
||||
struct bio *bio;
|
||||
unsigned int n;
|
||||
|
||||
if (num_pages == 0) {
|
||||
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
|
||||
bio->bi_iter.bi_sector =
|
||||
pblk << (blockbits - SECTOR_SHIFT);
|
||||
}
|
||||
ret = bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
|
||||
if (WARN_ON_ONCE(ret != bytes_this_page)) {
|
||||
err = -EIO;
|
||||
goto out;
|
||||
}
|
||||
num_pages++;
|
||||
len -= blocks_this_page;
|
||||
lblk += blocks_this_page;
|
||||
pblk += blocks_this_page;
|
||||
if (num_pages == BIO_MAX_VECS || !len ||
|
||||
!fscrypt_mergeable_bio(bio, inode, lblk)) {
|
||||
err = submit_bio_wait(bio);
|
||||
if (err)
|
||||
goto out;
|
||||
bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE);
|
||||
num_pages = 0;
|
||||
bio = bio_alloc(inode->i_sb->s_bdev, BIO_MAX_VECS, REQ_OP_WRITE,
|
||||
GFP_NOFS);
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
bio->bi_private = &done;
|
||||
bio->bi_end_io = fscrypt_zeroout_range_end_io;
|
||||
fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS);
|
||||
|
||||
for (n = 0; n < BIO_MAX_VECS; n++) {
|
||||
unsigned int blocks_this_page =
|
||||
min(len, blocks_per_page);
|
||||
unsigned int bytes_this_page = blocks_this_page << blockbits;
|
||||
|
||||
__bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0);
|
||||
len -= blocks_this_page;
|
||||
lblk += blocks_this_page;
|
||||
sector += (bytes_this_page >> SECTOR_SHIFT);
|
||||
if (!len || !fscrypt_mergeable_bio(bio, inode, lblk))
|
||||
break;
|
||||
}
|
||||
|
||||
atomic_inc(&done.pending);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
out:
|
||||
bio_put(bio);
|
||||
return err;
|
||||
|
||||
fscrypt_zeroout_range_done(&done);
|
||||
|
||||
wait_for_completion(&done.done);
|
||||
return blk_status_to_errno(done.status);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -132,7 +153,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
|
|||
return 0;
|
||||
|
||||
if (fscrypt_inode_uses_inline_crypto(inode))
|
||||
return fscrypt_zeroout_range_inline_crypt(inode, lblk, pblk,
|
||||
return fscrypt_zeroout_range_inline_crypt(inode, lblk, sector,
|
||||
len);
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
* Written by Theodore Ts'o, 2010.
|
||||
*/
|
||||
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/highuid.h>
|
||||
|
|
@ -401,7 +402,7 @@ void ext4_io_submit(struct ext4_io_submit *io)
|
|||
if (bio) {
|
||||
if (io->io_wbc->sync_mode == WB_SYNC_ALL)
|
||||
io->io_bio->bi_opf |= REQ_SYNC;
|
||||
submit_bio(io->io_bio);
|
||||
blk_crypto_submit_bio(io->io_bio);
|
||||
}
|
||||
io->io_bio = NULL;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@
|
|||
#include <linux/bio.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/prefetch.h>
|
||||
|
|
@ -345,7 +346,7 @@ int ext4_mpage_readpages(struct inode *inode,
|
|||
if (bio && (last_block_in_bio != first_block - 1 ||
|
||||
!fscrypt_mergeable_bio(bio, inode, next_block))) {
|
||||
submit_and_realloc:
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
bio = NULL;
|
||||
}
|
||||
if (bio == NULL) {
|
||||
|
|
@ -371,14 +372,14 @@ int ext4_mpage_readpages(struct inode *inode,
|
|||
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
|
||||
(relative_block == map.m_len)) ||
|
||||
(first_hole != blocks_per_folio)) {
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
bio = NULL;
|
||||
} else
|
||||
last_block_in_bio = first_block + blocks_per_folio - 1;
|
||||
continue;
|
||||
confused:
|
||||
if (bio) {
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
bio = NULL;
|
||||
}
|
||||
if (!folio_test_uptodate(folio))
|
||||
|
|
@ -389,7 +390,7 @@ next_page:
|
|||
; /* A label shall be followed by a statement until C23 */
|
||||
}
|
||||
if (bio)
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -513,7 +513,7 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
|
|||
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
|
||||
|
||||
iostat_update_submit_ctx(bio, type);
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
|
||||
static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
|
||||
|
|
@ -522,7 +522,7 @@ static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
|
|||
WARN_ON_ONCE(is_read_io(bio_op(bio)));
|
||||
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
|
||||
iostat_update_submit_ctx(bio, type);
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
|
||||
static void __submit_merged_bio(struct f2fs_bio_info *io)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
|
||||
* http://www.samsung.com/
|
||||
*/
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/stat.h>
|
||||
|
|
@ -5047,7 +5048,7 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter,
|
|||
enum temp_type temp = f2fs_get_segment_temp(sbi, type);
|
||||
|
||||
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp);
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
|
||||
static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = {
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
* Copyright (C) 2010 Red Hat, Inc.
|
||||
* Copyright (c) 2016-2025 Christoph Hellwig.
|
||||
*/
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/fscrypt.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/iomap.h>
|
||||
|
|
@ -75,7 +76,7 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
|
|||
dio->dops->submit_io(iter, bio, pos);
|
||||
} else {
|
||||
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
|
||||
submit_bio(bio);
|
||||
blk_crypto_submit_bio(bio);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -256,12 +256,6 @@ static inline struct folio *bio_first_folio_all(struct bio *bio)
|
|||
return page_folio(bio_first_page_all(bio));
|
||||
}
|
||||
|
||||
static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
|
||||
{
|
||||
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
|
||||
return &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
}
|
||||
|
||||
/**
|
||||
* struct folio_iter - State for iterating all folios in a bio.
|
||||
* @folio: The current folio we're iterating. NULL after the last folio.
|
||||
|
|
|
|||
|
|
@ -132,6 +132,11 @@ static inline bool bio_has_crypt_ctx(struct bio *bio)
|
|||
return bio->bi_crypt_context;
|
||||
}
|
||||
|
||||
static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio)
|
||||
{
|
||||
return bio->bi_crypt_context;
|
||||
}
|
||||
|
||||
void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
|
||||
const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
|
||||
gfp_t gfp_mask);
|
||||
|
|
@ -169,8 +174,35 @@ static inline bool bio_has_crypt_ctx(struct bio *bio)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline struct bio_crypt_ctx *bio_crypt_ctx(struct bio *bio)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
|
||||
|
||||
bool __blk_crypto_submit_bio(struct bio *bio);
|
||||
|
||||
/**
|
||||
* blk_crypto_submit_bio - Submit a bio that may have a crypto context
|
||||
* @bio: bio to submit
|
||||
*
|
||||
* If @bio has no crypto context, or the crypt context attached to @bio is
|
||||
* supported by the underlying device's inline encryption hardware, just submit
|
||||
* @bio.
|
||||
*
|
||||
* Otherwise, try to perform en/decryption for this bio by falling back to the
|
||||
* kernel crypto API. For encryption this means submitting newly allocated
|
||||
* bios for the encrypted payload while keeping back the source bio until they
|
||||
* complete, while for reads the decryption happens in-place by a hooked in
|
||||
* completion handler.
|
||||
*/
|
||||
static inline void blk_crypto_submit_bio(struct bio *bio)
|
||||
{
|
||||
if (!bio_has_crypt_ctx(bio) || __blk_crypto_submit_bio(bio))
|
||||
submit_bio(bio);
|
||||
}
|
||||
|
||||
int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask);
|
||||
/**
|
||||
* bio_crypt_clone - clone bio encryption context
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
|
|||
return bio_integrity_intervals(bi, sectors) * bi->metadata_size;
|
||||
}
|
||||
|
||||
static inline bool blk_integrity_rq(struct request *rq)
|
||||
static inline bool blk_integrity_rq(const struct request *rq)
|
||||
{
|
||||
return rq->cmd_flags & REQ_INTEGRITY;
|
||||
}
|
||||
|
|
@ -168,9 +168,9 @@ static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
|
|||
{
|
||||
return 0;
|
||||
}
|
||||
static inline int blk_integrity_rq(struct request *rq)
|
||||
static inline bool blk_integrity_rq(const struct request *rq)
|
||||
{
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline struct bio_vec rq_integrity_vec(struct request *rq)
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ struct blk_dma_iter {
|
|||
bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev,
|
||||
struct dma_iova_state *state, struct blk_dma_iter *iter);
|
||||
bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev,
|
||||
struct dma_iova_state *state, struct blk_dma_iter *iter);
|
||||
struct blk_dma_iter *iter);
|
||||
|
||||
/**
|
||||
* blk_rq_dma_map_coalesce - were all segments coalesced?
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
struct blk_mq_tags;
|
||||
struct blk_flush_queue;
|
||||
struct io_comp_batch;
|
||||
|
||||
#define BLKDEV_MIN_RQ 4
|
||||
#define BLKDEV_DEFAULT_RQ 128
|
||||
|
|
@ -22,7 +23,8 @@ enum rq_end_io_ret {
|
|||
RQ_END_IO_FREE,
|
||||
};
|
||||
|
||||
typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);
|
||||
typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t,
|
||||
const struct io_comp_batch *);
|
||||
|
||||
/*
|
||||
* request flags */
|
||||
|
|
|
|||
|
|
@ -232,6 +232,8 @@ struct bio {
|
|||
|
||||
atomic_t __bi_remaining;
|
||||
|
||||
/* The actual vec list, preserved by bio_reset() */
|
||||
struct bio_vec *bi_io_vec;
|
||||
struct bvec_iter bi_iter;
|
||||
|
||||
union {
|
||||
|
|
@ -275,8 +277,6 @@ struct bio {
|
|||
|
||||
atomic_t __bi_cnt; /* pin count */
|
||||
|
||||
struct bio_vec *bi_io_vec; /* the actual vec list */
|
||||
|
||||
struct bio_set *bi_pool;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -340,14 +340,13 @@ typedef unsigned int __bitwise blk_features_t;
|
|||
/* skip this queue in blk_mq_(un)quiesce_tagset */
|
||||
#define BLK_FEAT_SKIP_TAGSET_QUIESCE ((__force blk_features_t)(1u << 13))
|
||||
|
||||
/* atomic writes enabled */
|
||||
#define BLK_FEAT_ATOMIC_WRITES ((__force blk_features_t)(1u << 14))
|
||||
|
||||
/* undocumented magic for bcache */
|
||||
#define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \
|
||||
((__force blk_features_t)(1u << 15))
|
||||
|
||||
/* atomic writes enabled */
|
||||
#define BLK_FEAT_ATOMIC_WRITES \
|
||||
((__force blk_features_t)(1u << 16))
|
||||
|
||||
/*
|
||||
* Flags automatically inherited when stacking limits.
|
||||
*/
|
||||
|
|
@ -551,7 +550,8 @@ struct request_queue {
|
|||
/*
|
||||
* queue settings
|
||||
*/
|
||||
unsigned long nr_requests; /* Max # of requests */
|
||||
unsigned int nr_requests; /* Max # of requests */
|
||||
unsigned int async_depth; /* Max # of async requests */
|
||||
|
||||
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
|
||||
struct blk_crypto_profile *crypto_profile;
|
||||
|
|
@ -681,7 +681,7 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
|
|||
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
|
||||
#define blk_queue_noxmerges(q) \
|
||||
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
|
||||
#define blk_queue_nonrot(q) (!((q)->limits.features & BLK_FEAT_ROTATIONAL))
|
||||
#define blk_queue_rot(q) ((q)->limits.features & BLK_FEAT_ROTATIONAL)
|
||||
#define blk_queue_io_stat(q) ((q)->limits.features & BLK_FEAT_IO_STAT)
|
||||
#define blk_queue_passthrough_stat(q) \
|
||||
((q)->limits.flags & BLK_FLAG_IOSTATS_PASSTHROUGH)
|
||||
|
|
@ -1026,7 +1026,7 @@ extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
|
|||
extern void blk_queue_exit(struct request_queue *q);
|
||||
extern void blk_sync_queue(struct request_queue *q);
|
||||
|
||||
/* Helper to convert REQ_OP_XXX to its string format XXX */
|
||||
/* Convert a request operation REQ_OP_name into the string "name" */
|
||||
extern const char *blk_op_str(enum req_op op);
|
||||
|
||||
int blk_status_to_errno(blk_status_t status);
|
||||
|
|
@ -1044,7 +1044,7 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
|
|||
return bdev->bd_queue; /* this is never NULL */
|
||||
}
|
||||
|
||||
/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
|
||||
/* Convert a zone condition BLK_ZONE_COND_name into the string "name" */
|
||||
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
|
||||
|
||||
static inline unsigned int bio_zone_no(struct bio *bio)
|
||||
|
|
@ -1462,9 +1462,14 @@ bdev_write_zeroes_unmap_sectors(struct block_device *bdev)
|
|||
return bdev_limits(bdev)->max_wzeroes_unmap_sectors;
|
||||
}
|
||||
|
||||
static inline bool bdev_rot(struct block_device *bdev)
|
||||
{
|
||||
return blk_queue_rot(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline bool bdev_nonrot(struct block_device *bdev)
|
||||
{
|
||||
return blk_queue_nonrot(bdev_get_queue(bdev));
|
||||
return !bdev_rot(bdev);
|
||||
}
|
||||
|
||||
static inline bool bdev_synchronous(struct block_device *bdev)
|
||||
|
|
@ -1822,6 +1827,7 @@ struct io_comp_batch {
|
|||
struct rq_list req_list;
|
||||
bool need_ts;
|
||||
void (*complete)(struct io_comp_batch *);
|
||||
void *poll_ctx;
|
||||
};
|
||||
|
||||
static inline bool blk_atomic_write_start_sect_aligned(sector_t sector,
|
||||
|
|
|
|||
|
|
@ -171,6 +171,11 @@ typedef u64 phys_addr_t;
|
|||
typedef u32 phys_addr_t;
|
||||
#endif
|
||||
|
||||
struct phys_vec {
|
||||
phys_addr_t paddr;
|
||||
size_t len;
|
||||
};
|
||||
|
||||
typedef phys_addr_t resource_size_t;
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -55,7 +55,8 @@
|
|||
_IOWR('u', 0x15, struct ublksrv_ctrl_cmd)
|
||||
#define UBLK_U_CMD_QUIESCE_DEV \
|
||||
_IOWR('u', 0x16, struct ublksrv_ctrl_cmd)
|
||||
|
||||
#define UBLK_U_CMD_TRY_STOP_DEV \
|
||||
_IOWR('u', 0x17, struct ublksrv_ctrl_cmd)
|
||||
/*
|
||||
* 64bits are enough now, and it should be easy to extend in case of
|
||||
* running out of feature flags
|
||||
|
|
@ -103,6 +104,30 @@
|
|||
#define UBLK_U_IO_UNREGISTER_IO_BUF \
|
||||
_IOWR('u', 0x24, struct ublksrv_io_cmd)
|
||||
|
||||
/*
|
||||
* return 0 if the command is run successfully, otherwise failure code
|
||||
* is returned
|
||||
*/
|
||||
#define UBLK_U_IO_PREP_IO_CMDS \
|
||||
_IOWR('u', 0x25, struct ublk_batch_io)
|
||||
/*
|
||||
* If failure code is returned, nothing in the command buffer is handled.
|
||||
* Otherwise, the returned value means how many bytes in command buffer
|
||||
* are handled actually, then number of handled IOs can be calculated with
|
||||
* `elem_bytes` for each IO. IOs in the remained bytes are not committed,
|
||||
* userspace has to check return value for dealing with partial committing
|
||||
* correctly.
|
||||
*/
|
||||
#define UBLK_U_IO_COMMIT_IO_CMDS \
|
||||
_IOWR('u', 0x26, struct ublk_batch_io)
|
||||
|
||||
/*
|
||||
* Fetch io commands to provided buffer in multishot style,
|
||||
* `IORING_URING_CMD_MULTISHOT` is required for this command.
|
||||
*/
|
||||
#define UBLK_U_IO_FETCH_IO_CMDS \
|
||||
_IOWR('u', 0x27, struct ublk_batch_io)
|
||||
|
||||
/* only ABORT means that no re-fetch */
|
||||
#define UBLK_IO_RES_OK 0
|
||||
#define UBLK_IO_RES_NEED_GET_DATA 1
|
||||
|
|
@ -134,6 +159,10 @@
|
|||
#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS)
|
||||
#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
|
||||
|
||||
/* Copy to/from request integrity buffer instead of data buffer */
|
||||
#define UBLK_INTEGRITY_FLAG_OFF 62
|
||||
#define UBLKSRV_IO_INTEGRITY_FLAG (1ULL << UBLK_INTEGRITY_FLAG_OFF)
|
||||
|
||||
/*
|
||||
* ublk server can register data buffers for incoming I/O requests with a sparse
|
||||
* io_uring buffer table. The request buffer can then be used as the data buffer
|
||||
|
|
@ -311,6 +340,36 @@
|
|||
*/
|
||||
#define UBLK_F_BUF_REG_OFF_DAEMON (1ULL << 14)
|
||||
|
||||
/*
|
||||
* Support the following commands for delivering & committing io command
|
||||
* in batch.
|
||||
*
|
||||
* - UBLK_U_IO_PREP_IO_CMDS
|
||||
* - UBLK_U_IO_COMMIT_IO_CMDS
|
||||
* - UBLK_U_IO_FETCH_IO_CMDS
|
||||
* - UBLK_U_IO_REGISTER_IO_BUF
|
||||
* - UBLK_U_IO_UNREGISTER_IO_BUF
|
||||
*
|
||||
* The existing UBLK_U_IO_FETCH_REQ, UBLK_U_IO_COMMIT_AND_FETCH_REQ and
|
||||
* UBLK_U_IO_NEED_GET_DATA uring_cmd are not supported for this feature.
|
||||
*/
|
||||
#define UBLK_F_BATCH_IO (1ULL << 15)
|
||||
|
||||
/*
|
||||
* ublk device supports requests with integrity/metadata buffer.
|
||||
* Requires UBLK_F_USER_COPY.
|
||||
*/
|
||||
#define UBLK_F_INTEGRITY (1ULL << 16)
|
||||
|
||||
/*
|
||||
* The device supports the UBLK_CMD_TRY_STOP_DEV command, which
|
||||
* allows stopping the device only if there are no openers.
|
||||
*/
|
||||
#define UBLK_F_SAFE_STOP_DEV (1ULL << 17)
|
||||
|
||||
/* Disable automatic partition scanning when device is started */
|
||||
#define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18)
|
||||
|
||||
/* device state */
|
||||
#define UBLK_S_DEV_DEAD 0
|
||||
#define UBLK_S_DEV_LIVE 1
|
||||
|
|
@ -408,6 +467,8 @@ struct ublksrv_ctrl_dev_info {
|
|||
* passed in.
|
||||
*/
|
||||
#define UBLK_IO_F_NEED_REG_BUF (1U << 17)
|
||||
/* Request has an integrity data buffer */
|
||||
#define UBLK_IO_F_INTEGRITY (1UL << 18)
|
||||
|
||||
/*
|
||||
* io cmd is described by this structure, and stored in share memory, indexed
|
||||
|
|
@ -525,6 +586,51 @@ struct ublksrv_io_cmd {
|
|||
};
|
||||
};
|
||||
|
||||
struct ublk_elem_header {
|
||||
__u16 tag; /* IO tag */
|
||||
|
||||
/*
|
||||
* Buffer index for incoming io command, only valid iff
|
||||
* UBLK_F_AUTO_BUF_REG is set
|
||||
*/
|
||||
__u16 buf_index;
|
||||
__s32 result; /* I/O completion result (commit only) */
|
||||
};
|
||||
|
||||
/*
|
||||
* uring_cmd buffer structure for batch commands
|
||||
*
|
||||
* buffer includes multiple elements, which number is specified by
|
||||
* `nr_elem`. Each element buffer is organized in the following order:
|
||||
*
|
||||
* struct ublk_elem_buffer {
|
||||
* // Mandatory fields (8 bytes)
|
||||
* struct ublk_elem_header header;
|
||||
*
|
||||
* // Optional fields (8 bytes each, included based on flags)
|
||||
*
|
||||
* // Buffer address (if UBLK_BATCH_F_HAS_BUF_ADDR) for copying data
|
||||
* // between ublk request and ublk server buffer
|
||||
* __u64 buf_addr;
|
||||
*
|
||||
* // returned Zone append LBA (if UBLK_BATCH_F_HAS_ZONE_LBA)
|
||||
* __u64 zone_lba;
|
||||
* }
|
||||
*
|
||||
* Used for `UBLK_U_IO_PREP_IO_CMDS` and `UBLK_U_IO_COMMIT_IO_CMDS`
|
||||
*/
|
||||
struct ublk_batch_io {
|
||||
__u16 q_id;
|
||||
#define UBLK_BATCH_F_HAS_ZONE_LBA (1 << 0)
|
||||
#define UBLK_BATCH_F_HAS_BUF_ADDR (1 << 1)
|
||||
#define UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK (1 << 2)
|
||||
__u16 flags;
|
||||
__u16 nr_elem;
|
||||
__u8 elem_bytes;
|
||||
__u8 reserved;
|
||||
__u64 reserved2;
|
||||
};
|
||||
|
||||
struct ublk_param_basic {
|
||||
#define UBLK_ATTR_READ_ONLY (1 << 0)
|
||||
#define UBLK_ATTR_ROTATIONAL (1 << 1)
|
||||
|
|
@ -600,6 +706,17 @@ struct ublk_param_segment {
|
|||
__u8 pad[2];
|
||||
};
|
||||
|
||||
struct ublk_param_integrity {
|
||||
__u32 flags; /* LBMD_PI_CAP_* from linux/fs.h */
|
||||
__u16 max_integrity_segments; /* 0 means no limit */
|
||||
__u8 interval_exp;
|
||||
__u8 metadata_size; /* UBLK_PARAM_TYPE_INTEGRITY requires nonzero */
|
||||
__u8 pi_offset;
|
||||
__u8 csum_type; /* LBMD_PI_CSUM_* from linux/fs.h */
|
||||
__u8 tag_size;
|
||||
__u8 pad[5];
|
||||
};
|
||||
|
||||
struct ublk_params {
|
||||
/*
|
||||
* Total length of parameters, userspace has to set 'len' for both
|
||||
|
|
@ -614,6 +731,7 @@ struct ublk_params {
|
|||
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
|
||||
#define UBLK_PARAM_TYPE_DMA_ALIGN (1 << 4)
|
||||
#define UBLK_PARAM_TYPE_SEGMENT (1 << 5)
|
||||
#define UBLK_PARAM_TYPE_INTEGRITY (1 << 6) /* requires UBLK_F_INTEGRITY */
|
||||
__u32 types; /* types of parameter included */
|
||||
|
||||
struct ublk_param_basic basic;
|
||||
|
|
@ -622,6 +740,7 @@ struct ublk_params {
|
|||
struct ublk_param_zoned zoned;
|
||||
struct ublk_param_dma_align dma;
|
||||
struct ublk_param_segment seg;
|
||||
struct ublk_param_integrity integrity;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1055,17 +1055,6 @@ static int io_import_kbuf(int ddir, struct iov_iter *iter,
|
|||
|
||||
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
|
||||
iov_iter_advance(iter, offset);
|
||||
|
||||
if (count < imu->len) {
|
||||
const struct bio_vec *bvec = iter->bvec;
|
||||
|
||||
len += iter->iov_offset;
|
||||
while (len > bvec->bv_len) {
|
||||
len -= bvec->bv_len;
|
||||
bvec++;
|
||||
}
|
||||
iter->nr_segs = 1 + bvec - iter->bvec;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1328,6 +1328,12 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
|
|||
struct io_kiocb *req, *tmp;
|
||||
int nr_events = 0;
|
||||
|
||||
/*
|
||||
* Store the polling io_ring_ctx so drivers can detect if they're
|
||||
* completing a request in the same ring context that's polling.
|
||||
*/
|
||||
iob.poll_ctx = ctx;
|
||||
|
||||
/*
|
||||
* Only spin for completions if we don't have multiple devices hanging
|
||||
* off our complete list.
|
||||
|
|
|
|||
|
|
@ -793,7 +793,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
|
|||
return PTR_ERR(bt);
|
||||
}
|
||||
blk_trace_setup_finalize(q, name, 1, bt, &buts2);
|
||||
strcpy(buts.name, buts2.name);
|
||||
strscpy(buts.name, buts2.name, BLKTRACE_BDEV_SIZE);
|
||||
mutex_unlock(&q->debugfs_mutex);
|
||||
|
||||
if (copy_to_user(arg, &buts, sizeof(buts))) {
|
||||
|
|
|
|||
|
|
@ -107,8 +107,7 @@ impl GenDiskBuilder {
|
|||
drop(unsafe { T::QueueData::from_foreign(data) });
|
||||
});
|
||||
|
||||
// SAFETY: `bindings::queue_limits` contain only fields that are valid when zeroed.
|
||||
let mut lim: bindings::queue_limits = unsafe { core::mem::zeroed() };
|
||||
let mut lim: bindings::queue_limits = pin_init::zeroed();
|
||||
|
||||
lim.logical_block_size = self.logical_block_size;
|
||||
lim.physical_block_size = self.physical_block_size;
|
||||
|
|
|
|||
|
|
@ -38,9 +38,7 @@ impl<T: Operations> TagSet<T> {
|
|||
num_tags: u32,
|
||||
num_maps: u32,
|
||||
) -> impl PinInit<Self, error::Error> {
|
||||
// SAFETY: `blk_mq_tag_set` only contains integers and pointers, which
|
||||
// all are allowed to be 0.
|
||||
let tag_set: bindings::blk_mq_tag_set = unsafe { core::mem::zeroed() };
|
||||
let tag_set: bindings::blk_mq_tag_set = pin_init::zeroed();
|
||||
let tag_set: Result<_> = core::mem::size_of::<RequestDataWrapper>()
|
||||
.try_into()
|
||||
.map(|cmd_size| {
|
||||
|
|
|
|||
6
tools/testing/selftests/ublk/.gitignore
vendored
6
tools/testing/selftests/ublk/.gitignore
vendored
|
|
@ -1,3 +1,5 @@
|
|||
kublk
|
||||
/tools
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
*-verify.state
|
||||
/tools
|
||||
kublk
|
||||
metadata_size
|
||||
|
|
|
|||
|
|
@ -7,22 +7,21 @@ endif
|
|||
|
||||
LDLIBS += -lpthread -lm -luring
|
||||
|
||||
TEST_PROGS := test_generic_01.sh
|
||||
TEST_PROGS += test_generic_02.sh
|
||||
TEST_PROGS := test_generic_02.sh
|
||||
TEST_PROGS += test_generic_03.sh
|
||||
TEST_PROGS += test_generic_04.sh
|
||||
TEST_PROGS += test_generic_05.sh
|
||||
TEST_PROGS += test_generic_06.sh
|
||||
TEST_PROGS += test_generic_07.sh
|
||||
|
||||
TEST_PROGS += test_generic_08.sh
|
||||
TEST_PROGS += test_generic_09.sh
|
||||
TEST_PROGS += test_generic_10.sh
|
||||
TEST_PROGS += test_generic_11.sh
|
||||
TEST_PROGS += test_generic_12.sh
|
||||
TEST_PROGS += test_generic_13.sh
|
||||
TEST_PROGS += test_generic_14.sh
|
||||
TEST_PROGS += test_generic_15.sh
|
||||
TEST_PROGS += test_generic_16.sh
|
||||
|
||||
TEST_PROGS += test_batch_01.sh
|
||||
TEST_PROGS += test_batch_02.sh
|
||||
TEST_PROGS += test_batch_03.sh
|
||||
|
||||
TEST_PROGS += test_null_01.sh
|
||||
TEST_PROGS += test_null_02.sh
|
||||
|
|
@ -34,6 +33,14 @@ TEST_PROGS += test_loop_04.sh
|
|||
TEST_PROGS += test_loop_05.sh
|
||||
TEST_PROGS += test_loop_06.sh
|
||||
TEST_PROGS += test_loop_07.sh
|
||||
|
||||
TEST_PROGS += test_integrity_01.sh
|
||||
TEST_PROGS += test_integrity_02.sh
|
||||
|
||||
TEST_PROGS += test_recover_01.sh
|
||||
TEST_PROGS += test_recover_02.sh
|
||||
TEST_PROGS += test_recover_03.sh
|
||||
TEST_PROGS += test_recover_04.sh
|
||||
TEST_PROGS += test_stripe_01.sh
|
||||
TEST_PROGS += test_stripe_02.sh
|
||||
TEST_PROGS += test_stripe_03.sh
|
||||
|
|
@ -41,6 +48,9 @@ TEST_PROGS += test_stripe_04.sh
|
|||
TEST_PROGS += test_stripe_05.sh
|
||||
TEST_PROGS += test_stripe_06.sh
|
||||
|
||||
TEST_PROGS += test_part_01.sh
|
||||
TEST_PROGS += test_part_02.sh
|
||||
|
||||
TEST_PROGS += test_stress_01.sh
|
||||
TEST_PROGS += test_stress_02.sh
|
||||
TEST_PROGS += test_stress_03.sh
|
||||
|
|
@ -48,13 +58,55 @@ TEST_PROGS += test_stress_04.sh
|
|||
TEST_PROGS += test_stress_05.sh
|
||||
TEST_PROGS += test_stress_06.sh
|
||||
TEST_PROGS += test_stress_07.sh
|
||||
TEST_PROGS += test_stress_08.sh
|
||||
TEST_PROGS += test_stress_09.sh
|
||||
|
||||
TEST_GEN_PROGS_EXTENDED = kublk
|
||||
TEST_FILES := settings
|
||||
|
||||
TEST_GEN_PROGS_EXTENDED = kublk metadata_size
|
||||
STANDALONE_UTILS := metadata_size.c
|
||||
|
||||
LOCAL_HDRS += $(wildcard *.h)
|
||||
include ../lib.mk
|
||||
|
||||
$(TEST_GEN_PROGS_EXTENDED): $(wildcard *.c)
|
||||
$(OUTPUT)/kublk: $(filter-out $(STANDALONE_UTILS),$(wildcard *.c))
|
||||
|
||||
check:
|
||||
shellcheck -x -f gcc *.sh
|
||||
|
||||
# Test groups for running subsets of tests
|
||||
# JOBS=1 (default): sequential with kselftest TAP output
|
||||
# JOBS>1: parallel execution with xargs -P
|
||||
# Usage: make run_null JOBS=4
|
||||
JOBS ?= 1
|
||||
export JOBS
|
||||
|
||||
# Auto-detect test groups from TEST_PROGS (test_<group>_<num>.sh -> group)
|
||||
TEST_GROUPS := $(shell echo "$(TEST_PROGS)" | tr ' ' '\n' | \
|
||||
sed 's/test_\([^_]*\)_.*/\1/' | sort -u)
|
||||
|
||||
# Template for group test targets
|
||||
# $(1) = group name (e.g., null, generic, stress)
|
||||
define RUN_GROUP
|
||||
run_$(1): all
|
||||
@if [ $$(JOBS) -gt 1 ]; then \
|
||||
echo $$(filter test_$(1)_%.sh,$$(TEST_PROGS)) | tr ' ' '\n' | \
|
||||
xargs -P $$(JOBS) -n1 sh -c './"$$$$0"' || true; \
|
||||
else \
|
||||
$$(call RUN_TESTS, $$(filter test_$(1)_%.sh,$$(TEST_PROGS))); \
|
||||
fi
|
||||
.PHONY: run_$(1)
|
||||
endef
|
||||
|
||||
# Generate targets for each discovered test group
|
||||
$(foreach group,$(TEST_GROUPS),$(eval $(call RUN_GROUP,$(group))))
|
||||
|
||||
# Run all tests (parallel when JOBS>1)
|
||||
run_all: all
|
||||
@if [ $(JOBS) -gt 1 ]; then \
|
||||
echo $(TEST_PROGS) | tr ' ' '\n' | \
|
||||
xargs -P $(JOBS) -n1 sh -c './"$$0"' || true; \
|
||||
else \
|
||||
$(call RUN_TESTS, $(TEST_PROGS)); \
|
||||
fi
|
||||
.PHONY: run_all
|
||||
|
|
|
|||
607
tools/testing/selftests/ublk/batch.c
Normal file
607
tools/testing/selftests/ublk/batch.c
Normal file
|
|
@ -0,0 +1,607 @@
|
|||
/* SPDX-License-Identifier: MIT */
|
||||
/*
|
||||
* Description: UBLK_F_BATCH_IO buffer management
|
||||
*/
|
||||
|
||||
#include "kublk.h"
|
||||
|
||||
static inline void *ublk_get_commit_buf(struct ublk_thread *t,
|
||||
unsigned short buf_idx)
|
||||
{
|
||||
unsigned idx;
|
||||
|
||||
if (buf_idx < t->commit_buf_start ||
|
||||
buf_idx >= t->commit_buf_start + t->nr_commit_buf)
|
||||
return NULL;
|
||||
idx = buf_idx - t->commit_buf_start;
|
||||
return t->commit_buf + idx * t->commit_buf_size;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate one buffer for UBLK_U_IO_PREP_IO_CMDS or UBLK_U_IO_COMMIT_IO_CMDS
|
||||
*
|
||||
* Buffer index is returned.
|
||||
*/
|
||||
static inline unsigned short ublk_alloc_commit_buf(struct ublk_thread *t)
|
||||
{
|
||||
int idx = allocator_get(&t->commit_buf_alloc);
|
||||
|
||||
if (idx >= 0)
|
||||
return idx + t->commit_buf_start;
|
||||
return UBLKS_T_COMMIT_BUF_INV_IDX;
|
||||
}
|
||||
|
||||
/*
|
||||
* Free one commit buffer which is used by UBLK_U_IO_PREP_IO_CMDS or
|
||||
* UBLK_U_IO_COMMIT_IO_CMDS
|
||||
*/
|
||||
static inline void ublk_free_commit_buf(struct ublk_thread *t,
|
||||
unsigned short i)
|
||||
{
|
||||
unsigned short idx = i - t->commit_buf_start;
|
||||
|
||||
ublk_assert(idx < t->nr_commit_buf);
|
||||
ublk_assert(allocator_get_val(&t->commit_buf_alloc, idx) != 0);
|
||||
|
||||
allocator_put(&t->commit_buf_alloc, idx);
|
||||
}
|
||||
|
||||
static unsigned char ublk_commit_elem_buf_size(struct ublk_dev *dev)
|
||||
{
|
||||
if (dev->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY |
|
||||
UBLK_F_AUTO_BUF_REG))
|
||||
return 8;
|
||||
|
||||
/* one extra 8bytes for carrying buffer address */
|
||||
return 16;
|
||||
}
|
||||
|
||||
static unsigned ublk_commit_buf_size(struct ublk_thread *t)
|
||||
{
|
||||
struct ublk_dev *dev = t->dev;
|
||||
unsigned elem_size = ublk_commit_elem_buf_size(dev);
|
||||
unsigned int total = elem_size * dev->dev_info.queue_depth;
|
||||
unsigned int page_sz = getpagesize();
|
||||
|
||||
return round_up(total, page_sz);
|
||||
}
|
||||
|
||||
static void free_batch_commit_buf(struct ublk_thread *t)
|
||||
{
|
||||
if (t->commit_buf) {
|
||||
unsigned buf_size = ublk_commit_buf_size(t);
|
||||
unsigned int total = buf_size * t->nr_commit_buf;
|
||||
|
||||
munlock(t->commit_buf, total);
|
||||
free(t->commit_buf);
|
||||
}
|
||||
allocator_deinit(&t->commit_buf_alloc);
|
||||
free(t->commit);
|
||||
}
|
||||
|
||||
static int alloc_batch_commit_buf(struct ublk_thread *t)
|
||||
{
|
||||
unsigned buf_size = ublk_commit_buf_size(t);
|
||||
unsigned int total = buf_size * t->nr_commit_buf;
|
||||
unsigned int page_sz = getpagesize();
|
||||
void *buf = NULL;
|
||||
int i, ret, j = 0;
|
||||
|
||||
t->commit = calloc(t->nr_queues, sizeof(*t->commit));
|
||||
for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
|
||||
if (t->q_map[i])
|
||||
t->commit[j++].q_id = i;
|
||||
}
|
||||
|
||||
allocator_init(&t->commit_buf_alloc, t->nr_commit_buf);
|
||||
|
||||
t->commit_buf = NULL;
|
||||
ret = posix_memalign(&buf, page_sz, total);
|
||||
if (ret || !buf)
|
||||
goto fail;
|
||||
|
||||
t->commit_buf = buf;
|
||||
|
||||
/* lock commit buffer pages for fast access */
|
||||
if (mlock(t->commit_buf, total))
|
||||
ublk_err("%s: can't lock commit buffer %s\n", __func__,
|
||||
strerror(errno));
|
||||
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
free_batch_commit_buf(t);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned int ublk_thread_nr_queues(const struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
int ret = 0;
|
||||
|
||||
for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++)
|
||||
ret += !!t->q_map[i];
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void ublk_batch_prepare(struct ublk_thread *t)
|
||||
{
|
||||
/*
|
||||
* We only handle single device in this thread context.
|
||||
*
|
||||
* All queues have same feature flags, so use queue 0's for
|
||||
* calculate uring_cmd flags.
|
||||
*
|
||||
* This way looks not elegant, but it works so far.
|
||||
*/
|
||||
struct ublk_queue *q = &t->dev->q[0];
|
||||
|
||||
/* cache nr_queues because we don't support dynamic load-balance yet */
|
||||
t->nr_queues = ublk_thread_nr_queues(t);
|
||||
|
||||
t->commit_buf_elem_size = ublk_commit_elem_buf_size(t->dev);
|
||||
t->commit_buf_size = ublk_commit_buf_size(t);
|
||||
t->commit_buf_start = t->nr_bufs;
|
||||
t->nr_commit_buf = 2 * t->nr_queues;
|
||||
t->nr_bufs += t->nr_commit_buf;
|
||||
|
||||
t->cmd_flags = 0;
|
||||
if (ublk_queue_use_auto_zc(q)) {
|
||||
if (ublk_queue_auto_zc_fallback(q))
|
||||
t->cmd_flags |= UBLK_BATCH_F_AUTO_BUF_REG_FALLBACK;
|
||||
} else if (!ublk_queue_no_buf(q))
|
||||
t->cmd_flags |= UBLK_BATCH_F_HAS_BUF_ADDR;
|
||||
|
||||
t->state |= UBLKS_T_BATCH_IO;
|
||||
|
||||
ublk_log("%s: thread %d commit(nr_bufs %u, buf_size %u, start %u)\n",
|
||||
__func__, t->idx,
|
||||
t->nr_commit_buf, t->commit_buf_size,
|
||||
t->nr_bufs);
|
||||
}
|
||||
|
||||
static void free_batch_fetch_buf(struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < t->nr_fetch_bufs; i++) {
|
||||
io_uring_free_buf_ring(&t->ring, t->fetch[i].br, 1, i);
|
||||
munlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size);
|
||||
free(t->fetch[i].fetch_buf);
|
||||
}
|
||||
free(t->fetch);
|
||||
}
|
||||
|
||||
static int alloc_batch_fetch_buf(struct ublk_thread *t)
|
||||
{
|
||||
/* page aligned fetch buffer, and it is mlocked for speedup delivery */
|
||||
unsigned pg_sz = getpagesize();
|
||||
unsigned buf_size = round_up(t->dev->dev_info.queue_depth * 2, pg_sz);
|
||||
int ret;
|
||||
int i = 0;
|
||||
|
||||
/* double fetch buffer for each queue */
|
||||
t->nr_fetch_bufs = t->nr_queues * 2;
|
||||
t->fetch = calloc(t->nr_fetch_bufs, sizeof(*t->fetch));
|
||||
|
||||
/* allocate one buffer for each queue */
|
||||
for (i = 0; i < t->nr_fetch_bufs; i++) {
|
||||
t->fetch[i].fetch_buf_size = buf_size;
|
||||
|
||||
if (posix_memalign((void **)&t->fetch[i].fetch_buf, pg_sz,
|
||||
t->fetch[i].fetch_buf_size))
|
||||
return -ENOMEM;
|
||||
|
||||
/* lock fetch buffer page for fast fetching */
|
||||
if (mlock(t->fetch[i].fetch_buf, t->fetch[i].fetch_buf_size))
|
||||
ublk_err("%s: can't lock fetch buffer %s\n", __func__,
|
||||
strerror(errno));
|
||||
t->fetch[i].br = io_uring_setup_buf_ring(&t->ring, 1,
|
||||
i, IOU_PBUF_RING_INC, &ret);
|
||||
if (!t->fetch[i].br) {
|
||||
ublk_err("Buffer ring register failed %d\n", ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ublk_batch_alloc_buf(struct ublk_thread *t)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ublk_assert(t->nr_commit_buf < 2 * UBLK_MAX_QUEUES);
|
||||
|
||||
ret = alloc_batch_commit_buf(t);
|
||||
if (ret)
|
||||
return ret;
|
||||
return alloc_batch_fetch_buf(t);
|
||||
}
|
||||
|
||||
void ublk_batch_free_buf(struct ublk_thread *t)
|
||||
{
|
||||
free_batch_commit_buf(t);
|
||||
free_batch_fetch_buf(t);
|
||||
}
|
||||
|
||||
static void ublk_init_batch_cmd(struct ublk_thread *t, __u16 q_id,
|
||||
struct io_uring_sqe *sqe, unsigned op,
|
||||
unsigned short elem_bytes,
|
||||
unsigned short nr_elem,
|
||||
unsigned short buf_idx)
|
||||
{
|
||||
struct ublk_batch_io *cmd;
|
||||
__u64 user_data;
|
||||
|
||||
cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe);
|
||||
|
||||
ublk_set_sqe_cmd_op(sqe, op);
|
||||
|
||||
sqe->fd = 0; /* dev->fds[0] */
|
||||
sqe->opcode = IORING_OP_URING_CMD;
|
||||
sqe->flags = IOSQE_FIXED_FILE;
|
||||
|
||||
cmd->q_id = q_id;
|
||||
cmd->flags = 0;
|
||||
cmd->reserved = 0;
|
||||
cmd->elem_bytes = elem_bytes;
|
||||
cmd->nr_elem = nr_elem;
|
||||
|
||||
user_data = build_user_data(buf_idx, _IOC_NR(op), nr_elem, q_id, 0);
|
||||
io_uring_sqe_set_data64(sqe, user_data);
|
||||
|
||||
t->cmd_inflight += 1;
|
||||
|
||||
ublk_dbg(UBLK_DBG_IO_CMD, "%s: thread %u qid %d cmd_op %x data %lx "
|
||||
"nr_elem %u elem_bytes %u buf_size %u buf_idx %d "
|
||||
"cmd_inflight %u\n",
|
||||
__func__, t->idx, q_id, op, user_data,
|
||||
cmd->nr_elem, cmd->elem_bytes,
|
||||
nr_elem * elem_bytes, buf_idx, t->cmd_inflight);
|
||||
}
|
||||
|
||||
static void ublk_setup_commit_sqe(struct ublk_thread *t,
|
||||
struct io_uring_sqe *sqe,
|
||||
unsigned short buf_idx)
|
||||
{
|
||||
struct ublk_batch_io *cmd;
|
||||
|
||||
cmd = (struct ublk_batch_io *)ublk_get_sqe_cmd(sqe);
|
||||
|
||||
/* Use plain user buffer instead of fixed buffer */
|
||||
cmd->flags |= t->cmd_flags;
|
||||
}
|
||||
|
||||
static void ublk_batch_queue_fetch(struct ublk_thread *t,
|
||||
struct ublk_queue *q,
|
||||
unsigned short buf_idx)
|
||||
{
|
||||
unsigned short nr_elem = t->fetch[buf_idx].fetch_buf_size / 2;
|
||||
struct io_uring_sqe *sqe;
|
||||
|
||||
io_uring_buf_ring_add(t->fetch[buf_idx].br, t->fetch[buf_idx].fetch_buf,
|
||||
t->fetch[buf_idx].fetch_buf_size,
|
||||
0, 0, 0);
|
||||
io_uring_buf_ring_advance(t->fetch[buf_idx].br, 1);
|
||||
|
||||
ublk_io_alloc_sqes(t, &sqe, 1);
|
||||
|
||||
ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_FETCH_IO_CMDS, 2, nr_elem,
|
||||
buf_idx);
|
||||
|
||||
sqe->rw_flags= IORING_URING_CMD_MULTISHOT;
|
||||
sqe->buf_group = buf_idx;
|
||||
sqe->flags |= IOSQE_BUFFER_SELECT;
|
||||
|
||||
t->fetch[buf_idx].fetch_buf_off = 0;
|
||||
}
|
||||
|
||||
void ublk_batch_start_fetch(struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
int j = 0;
|
||||
|
||||
for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
|
||||
if (t->q_map[i]) {
|
||||
struct ublk_queue *q = &t->dev->q[i];
|
||||
|
||||
/* submit two fetch commands for each queue */
|
||||
ublk_batch_queue_fetch(t, q, j++);
|
||||
ublk_batch_queue_fetch(t, q, j++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static unsigned short ublk_compl_batch_fetch(struct ublk_thread *t,
|
||||
struct ublk_queue *q,
|
||||
const struct io_uring_cqe *cqe)
|
||||
{
|
||||
unsigned short buf_idx = user_data_to_tag(cqe->user_data);
|
||||
unsigned start = t->fetch[buf_idx].fetch_buf_off;
|
||||
unsigned end = start + cqe->res;
|
||||
void *buf = t->fetch[buf_idx].fetch_buf;
|
||||
int i;
|
||||
|
||||
if (cqe->res < 0)
|
||||
return buf_idx;
|
||||
|
||||
if ((end - start) / 2 > q->q_depth) {
|
||||
ublk_err("%s: fetch duplicated ios offset %u count %u\n", __func__, start, cqe->res);
|
||||
|
||||
for (i = start; i < end; i += 2) {
|
||||
unsigned short tag = *(unsigned short *)(buf + i);
|
||||
|
||||
ublk_err("%u ", tag);
|
||||
}
|
||||
ublk_err("\n");
|
||||
}
|
||||
|
||||
for (i = start; i < end; i += 2) {
|
||||
unsigned short tag = *(unsigned short *)(buf + i);
|
||||
|
||||
if (tag >= q->q_depth)
|
||||
ublk_err("%s: bad tag %u\n", __func__, tag);
|
||||
|
||||
if (q->tgt_ops->queue_io)
|
||||
q->tgt_ops->queue_io(t, q, tag);
|
||||
}
|
||||
t->fetch[buf_idx].fetch_buf_off = end;
|
||||
return buf_idx;
|
||||
}
|
||||
|
||||
static int __ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
|
||||
{
|
||||
unsigned short nr_elem = q->q_depth;
|
||||
unsigned short buf_idx = ublk_alloc_commit_buf(t);
|
||||
struct io_uring_sqe *sqe;
|
||||
void *buf;
|
||||
int i;
|
||||
|
||||
ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX);
|
||||
|
||||
ublk_io_alloc_sqes(t, &sqe, 1);
|
||||
|
||||
ublk_assert(nr_elem == q->q_depth);
|
||||
buf = ublk_get_commit_buf(t, buf_idx);
|
||||
for (i = 0; i < nr_elem; i++) {
|
||||
struct ublk_batch_elem *elem = (struct ublk_batch_elem *)(
|
||||
buf + i * t->commit_buf_elem_size);
|
||||
struct ublk_io *io = &q->ios[i];
|
||||
|
||||
elem->tag = i;
|
||||
elem->result = 0;
|
||||
|
||||
if (ublk_queue_use_auto_zc(q))
|
||||
elem->buf_index = ublk_batch_io_buf_idx(t, q, i);
|
||||
else if (!ublk_queue_no_buf(q))
|
||||
elem->buf_addr = (__u64)io->buf_addr;
|
||||
}
|
||||
|
||||
sqe->addr = (__u64)buf;
|
||||
sqe->len = t->commit_buf_elem_size * nr_elem;
|
||||
|
||||
ublk_init_batch_cmd(t, q->q_id, sqe, UBLK_U_IO_PREP_IO_CMDS,
|
||||
t->commit_buf_elem_size, nr_elem, buf_idx);
|
||||
ublk_setup_commit_sqe(t, sqe, buf_idx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q)
|
||||
{
|
||||
int ret = 0;
|
||||
|
||||
pthread_spin_lock(&q->lock);
|
||||
if (q->flags & UBLKS_Q_PREPARED)
|
||||
goto unlock;
|
||||
ret = __ublk_batch_queue_prep_io_cmds(t, q);
|
||||
if (!ret)
|
||||
q->flags |= UBLKS_Q_PREPARED;
|
||||
unlock:
|
||||
pthread_spin_unlock(&q->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void ublk_batch_compl_commit_cmd(struct ublk_thread *t,
|
||||
const struct io_uring_cqe *cqe,
|
||||
unsigned op)
|
||||
{
|
||||
unsigned short buf_idx = user_data_to_tag(cqe->user_data);
|
||||
|
||||
if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS))
|
||||
ublk_assert(cqe->res == 0);
|
||||
else if (op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) {
|
||||
int nr_elem = user_data_to_tgt_data(cqe->user_data);
|
||||
|
||||
ublk_assert(cqe->res == t->commit_buf_elem_size * nr_elem);
|
||||
} else
|
||||
ublk_assert(0);
|
||||
|
||||
ublk_free_commit_buf(t, buf_idx);
|
||||
}
|
||||
|
||||
void ublk_batch_compl_cmd(struct ublk_thread *t,
|
||||
const struct io_uring_cqe *cqe)
|
||||
{
|
||||
unsigned op = user_data_to_op(cqe->user_data);
|
||||
struct ublk_queue *q;
|
||||
unsigned buf_idx;
|
||||
unsigned q_id;
|
||||
|
||||
if (op == _IOC_NR(UBLK_U_IO_PREP_IO_CMDS) ||
|
||||
op == _IOC_NR(UBLK_U_IO_COMMIT_IO_CMDS)) {
|
||||
t->cmd_inflight--;
|
||||
ublk_batch_compl_commit_cmd(t, cqe, op);
|
||||
return;
|
||||
}
|
||||
|
||||
/* FETCH command is per queue */
|
||||
q_id = user_data_to_q_id(cqe->user_data);
|
||||
q = &t->dev->q[q_id];
|
||||
buf_idx = ublk_compl_batch_fetch(t, q, cqe);
|
||||
|
||||
if (cqe->res < 0 && cqe->res != -ENOBUFS) {
|
||||
t->cmd_inflight--;
|
||||
t->state |= UBLKS_T_STOPPING;
|
||||
} else if (!(cqe->flags & IORING_CQE_F_MORE) || cqe->res == -ENOBUFS) {
|
||||
t->cmd_inflight--;
|
||||
ublk_batch_queue_fetch(t, q, buf_idx);
|
||||
}
|
||||
}
|
||||
|
||||
static void __ublk_batch_commit_io_cmds(struct ublk_thread *t,
|
||||
struct batch_commit_buf *cb)
|
||||
{
|
||||
struct io_uring_sqe *sqe;
|
||||
unsigned short buf_idx;
|
||||
unsigned short nr_elem = cb->done;
|
||||
|
||||
/* nothing to commit */
|
||||
if (!nr_elem) {
|
||||
ublk_free_commit_buf(t, cb->buf_idx);
|
||||
return;
|
||||
}
|
||||
|
||||
ublk_io_alloc_sqes(t, &sqe, 1);
|
||||
buf_idx = cb->buf_idx;
|
||||
sqe->addr = (__u64)cb->elem;
|
||||
sqe->len = nr_elem * t->commit_buf_elem_size;
|
||||
|
||||
/* commit isn't per-queue command */
|
||||
ublk_init_batch_cmd(t, cb->q_id, sqe, UBLK_U_IO_COMMIT_IO_CMDS,
|
||||
t->commit_buf_elem_size, nr_elem, buf_idx);
|
||||
ublk_setup_commit_sqe(t, sqe, buf_idx);
|
||||
}
|
||||
|
||||
void ublk_batch_commit_io_cmds(struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < t->nr_queues; i++) {
|
||||
struct batch_commit_buf *cb = &t->commit[i];
|
||||
|
||||
if (cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX)
|
||||
__ublk_batch_commit_io_cmds(t, cb);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void __ublk_batch_init_commit(struct ublk_thread *t,
|
||||
struct batch_commit_buf *cb,
|
||||
unsigned short buf_idx)
|
||||
{
|
||||
/* so far only support 1:1 queue/thread mapping */
|
||||
cb->buf_idx = buf_idx;
|
||||
cb->elem = ublk_get_commit_buf(t, buf_idx);
|
||||
cb->done = 0;
|
||||
cb->count = t->commit_buf_size /
|
||||
t->commit_buf_elem_size;
|
||||
}
|
||||
|
||||
/* COMMIT_IO_CMDS is per-queue command, so use its own commit buffer */
|
||||
static void ublk_batch_init_commit(struct ublk_thread *t,
|
||||
struct batch_commit_buf *cb)
|
||||
{
|
||||
unsigned short buf_idx = ublk_alloc_commit_buf(t);
|
||||
|
||||
ublk_assert(buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX);
|
||||
ublk_assert(!ublk_batch_commit_prepared(cb));
|
||||
|
||||
__ublk_batch_init_commit(t, cb, buf_idx);
|
||||
}
|
||||
|
||||
void ublk_batch_prep_commit(struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < t->nr_queues; i++)
|
||||
t->commit[i].buf_idx = UBLKS_T_COMMIT_BUF_INV_IDX;
|
||||
}
|
||||
|
||||
void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int res)
|
||||
{
|
||||
unsigned q_t_idx = ublk_queue_idx_in_thread(t, q);
|
||||
struct batch_commit_buf *cb = &t->commit[q_t_idx];
|
||||
struct ublk_batch_elem *elem;
|
||||
struct ublk_io *io = &q->ios[tag];
|
||||
|
||||
if (!ublk_batch_commit_prepared(cb))
|
||||
ublk_batch_init_commit(t, cb);
|
||||
|
||||
ublk_assert(q->q_id == cb->q_id);
|
||||
|
||||
elem = (struct ublk_batch_elem *)(cb->elem + cb->done * t->commit_buf_elem_size);
|
||||
elem->tag = tag;
|
||||
elem->buf_index = ublk_batch_io_buf_idx(t, q, tag);
|
||||
elem->result = res;
|
||||
|
||||
if (!ublk_queue_no_buf(q))
|
||||
elem->buf_addr = (__u64) (uintptr_t) io->buf_addr;
|
||||
|
||||
cb->done += 1;
|
||||
ublk_assert(cb->done <= cb->count);
|
||||
}
|
||||
|
||||
void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES],
|
||||
int nthreads, int queues)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
/*
|
||||
* Setup round-robin queue-to-thread mapping for arbitrary N:M combinations.
|
||||
*
|
||||
* This algorithm distributes queues across threads (and threads across queues)
|
||||
* in a balanced round-robin fashion to ensure even load distribution.
|
||||
*
|
||||
* Examples:
|
||||
* - 2 threads, 4 queues: T0=[Q0,Q2], T1=[Q1,Q3]
|
||||
* - 4 threads, 2 queues: T0=[Q0], T1=[Q1], T2=[Q0], T3=[Q1]
|
||||
* - 3 threads, 3 queues: T0=[Q0], T1=[Q1], T2=[Q2] (1:1 mapping)
|
||||
*
|
||||
* Phase 1: Mark which queues each thread handles (boolean mapping)
|
||||
*/
|
||||
for (i = 0, j = 0; i < queues || j < nthreads; i++, j++) {
|
||||
q_thread_map[j % nthreads][i % queues] = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Phase 2: Convert boolean mapping to sequential indices within each thread.
|
||||
*
|
||||
* Transform from: q_thread_map[thread][queue] = 1 (handles queue)
|
||||
* To: q_thread_map[thread][queue] = N (queue index within thread)
|
||||
*
|
||||
* This allows each thread to know the local index of each queue it handles,
|
||||
* which is essential for buffer allocation and management. For example:
|
||||
* - Thread 0 handling queues [0,2] becomes: q_thread_map[0][0]=1, q_thread_map[0][2]=2
|
||||
* - Thread 1 handling queues [1,3] becomes: q_thread_map[1][1]=1, q_thread_map[1][3]=2
|
||||
*/
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
unsigned char seq = 1;
|
||||
|
||||
for (i = 0; i < queues; i++) {
|
||||
if (q_thread_map[j][i])
|
||||
q_thread_map[j][i] = seq++;
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
printf("thread %0d: ", j);
|
||||
for (i = 0; i < queues; i++) {
|
||||
if (q_thread_map[j][i])
|
||||
printf("%03u ", i);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
printf("\n");
|
||||
for (j = 0; j < nthreads; j++) {
|
||||
for (i = 0; i < queues; i++) {
|
||||
printf("%03u ", q_thread_map[j][i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
@ -12,11 +12,11 @@ void backing_file_tgt_deinit(struct ublk_dev *dev)
|
|||
}
|
||||
}
|
||||
|
||||
int backing_file_tgt_init(struct ublk_dev *dev)
|
||||
int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct)
|
||||
{
|
||||
int fd, i;
|
||||
|
||||
assert(dev->nr_fds == 1);
|
||||
ublk_assert(dev->nr_fds == 1);
|
||||
|
||||
for (i = 0; i < dev->tgt.nr_backing_files; i++) {
|
||||
char *file = dev->tgt.backing_file[i];
|
||||
|
|
@ -25,7 +25,7 @@ int backing_file_tgt_init(struct ublk_dev *dev)
|
|||
|
||||
ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file);
|
||||
|
||||
fd = open(file, O_RDWR | O_DIRECT);
|
||||
fd = open(file, O_RDWR | (i < nr_direct ? O_DIRECT : 0));
|
||||
if (fd < 0) {
|
||||
ublk_err("%s: backing file %s can't be opened: %s\n",
|
||||
__func__, file, strerror(errno));
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
|
|||
.dev_sectors = dev_size >> 9,
|
||||
},
|
||||
};
|
||||
ublk_set_integrity_params(ctx, &dev->tgt.params);
|
||||
|
||||
dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int
|
|||
return zc ? IORING_OP_READ_FIXED : IORING_OP_READ;
|
||||
else if (ublk_op == UBLK_IO_OP_WRITE)
|
||||
return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE;
|
||||
assert(0);
|
||||
ublk_assert(0);
|
||||
}
|
||||
|
||||
static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
|
|
@ -35,8 +35,23 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
|
|||
unsigned auto_zc = ublk_queue_use_auto_zc(q);
|
||||
enum io_uring_op op = ublk_to_uring_op(iod, zc | auto_zc);
|
||||
struct ublk_io *io = ublk_get_io(q, tag);
|
||||
__u64 offset = iod->start_sector << 9;
|
||||
__u32 len = iod->nr_sectors << 9;
|
||||
struct io_uring_sqe *sqe[3];
|
||||
void *addr = io->buf_addr;
|
||||
unsigned short buf_index = ublk_io_buf_idx(t, q, tag);
|
||||
|
||||
if (iod->op_flags & UBLK_IO_F_INTEGRITY) {
|
||||
ublk_io_alloc_sqes(t, sqe, 1);
|
||||
/* Use second backing file for integrity data */
|
||||
io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 2),
|
||||
io->integrity_buf,
|
||||
ublk_integrity_len(q, len),
|
||||
ublk_integrity_len(q, offset));
|
||||
sqe[0]->flags = IOSQE_FIXED_FILE;
|
||||
/* tgt_data = 1 indicates integrity I/O */
|
||||
sqe[0]->user_data = build_user_data(tag, ublk_op, 1, q->q_id, 1);
|
||||
}
|
||||
|
||||
if (!zc || auto_zc) {
|
||||
ublk_io_alloc_sqes(t, sqe, 1);
|
||||
|
|
@ -45,34 +60,34 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q,
|
|||
|
||||
io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1) /*fds[1]*/,
|
||||
addr,
|
||||
iod->nr_sectors << 9,
|
||||
iod->start_sector << 9);
|
||||
len,
|
||||
offset);
|
||||
if (auto_zc)
|
||||
sqe[0]->buf_index = tag;
|
||||
sqe[0]->buf_index = buf_index;
|
||||
io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE);
|
||||
/* bit63 marks us as tgt io */
|
||||
sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
|
||||
return 1;
|
||||
return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 1;
|
||||
}
|
||||
|
||||
ublk_io_alloc_sqes(t, sqe, 3);
|
||||
|
||||
io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, io->buf_index);
|
||||
io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_index);
|
||||
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
|
||||
sqe[0]->user_data = build_user_data(tag,
|
||||
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
|
||||
|
||||
io_uring_prep_rw(op, sqe[1], ublk_get_registered_fd(q, 1) /*fds[1]*/, 0,
|
||||
iod->nr_sectors << 9,
|
||||
iod->start_sector << 9);
|
||||
sqe[1]->buf_index = tag;
|
||||
len,
|
||||
offset);
|
||||
sqe[1]->buf_index = buf_index;
|
||||
sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK;
|
||||
sqe[1]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1);
|
||||
|
||||
io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, io->buf_index);
|
||||
io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_index);
|
||||
sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
|
||||
|
||||
return 2;
|
||||
return !!(iod->op_flags & UBLK_IO_F_INTEGRITY) + 2;
|
||||
}
|
||||
|
||||
static int loop_queue_tgt_io(struct ublk_thread *t, struct ublk_queue *q, int tag)
|
||||
|
|
@ -119,12 +134,17 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q,
|
|||
unsigned op = user_data_to_op(cqe->user_data);
|
||||
struct ublk_io *io = ublk_get_io(q, tag);
|
||||
|
||||
if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
|
||||
if (!io->result)
|
||||
io->result = cqe->res;
|
||||
if (cqe->res < 0)
|
||||
ublk_err("%s: io failed op %x user_data %lx\n",
|
||||
__func__, op, cqe->user_data);
|
||||
if (cqe->res < 0) {
|
||||
io->result = cqe->res;
|
||||
ublk_err("%s: io failed op %x user_data %lx\n",
|
||||
__func__, op, cqe->user_data);
|
||||
} else if (op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) {
|
||||
__s32 data_len = user_data_to_tgt_data(cqe->user_data)
|
||||
? ublk_integrity_data_len(q, cqe->res)
|
||||
: cqe->res;
|
||||
|
||||
if (!io->result || data_len < io->result)
|
||||
io->result = data_len;
|
||||
}
|
||||
|
||||
/* buffer register op is IOSQE_CQE_SKIP_SUCCESS */
|
||||
|
|
@ -135,9 +155,30 @@ static void ublk_loop_io_done(struct ublk_thread *t, struct ublk_queue *q,
|
|||
ublk_complete_io(t, q, tag, io->result);
|
||||
}
|
||||
|
||||
static int ublk_loop_memset_file(int fd, __u8 byte, size_t len)
|
||||
{
|
||||
off_t offset = 0;
|
||||
__u8 buf[4096];
|
||||
|
||||
memset(buf, byte, sizeof(buf));
|
||||
while (len) {
|
||||
int ret = pwrite(fd, buf, min(len, sizeof(buf)), offset);
|
||||
|
||||
if (ret < 0)
|
||||
return -errno;
|
||||
if (!ret)
|
||||
return -EIO;
|
||||
|
||||
len -= ret;
|
||||
offset += ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
||||
{
|
||||
unsigned long long bytes;
|
||||
unsigned long blocks;
|
||||
int ret;
|
||||
struct ublk_params p = {
|
||||
.types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN,
|
||||
|
|
@ -154,19 +195,39 @@ static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
},
|
||||
};
|
||||
|
||||
ublk_set_integrity_params(ctx, &p);
|
||||
if (ctx->auto_zc_fallback) {
|
||||
ublk_err("%s: not support auto_zc_fallback\n", __func__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = backing_file_tgt_init(dev);
|
||||
/* Use O_DIRECT only for data file */
|
||||
ret = backing_file_tgt_init(dev, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (dev->tgt.nr_backing_files != 1)
|
||||
/* Expect a second file for integrity data */
|
||||
if (dev->tgt.nr_backing_files != 1 + !!ctx->metadata_size)
|
||||
return -EINVAL;
|
||||
|
||||
bytes = dev->tgt.backing_file_size[0];
|
||||
blocks = dev->tgt.backing_file_size[0] >> p.basic.logical_bs_shift;
|
||||
if (ctx->metadata_size) {
|
||||
unsigned long metadata_blocks =
|
||||
dev->tgt.backing_file_size[1] / ctx->metadata_size;
|
||||
unsigned long integrity_len;
|
||||
|
||||
/* Ensure both data and integrity data fit in backing files */
|
||||
blocks = min(blocks, metadata_blocks);
|
||||
integrity_len = blocks * ctx->metadata_size;
|
||||
/*
|
||||
* Initialize PI app tag and ref tag to 0xFF
|
||||
* to disable bio-integrity-auto checks
|
||||
*/
|
||||
ret = ublk_loop_memset_file(dev->fds[2], 0xFF, integrity_len);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
bytes = blocks << p.basic.logical_bs_shift;
|
||||
dev->tgt.dev_size = bytes;
|
||||
p.basic.dev_sectors = bytes >> 9;
|
||||
dev->tgt.params = p;
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
* Description: uring_cmd based ublk
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include "kublk.h"
|
||||
|
||||
#define MAX_NR_TGT_ARG 64
|
||||
|
|
@ -107,6 +108,15 @@ static int ublk_ctrl_stop_dev(struct ublk_dev *dev)
|
|||
return __ublk_ctrl_cmd(dev, &data);
|
||||
}
|
||||
|
||||
static int ublk_ctrl_try_stop_dev(struct ublk_dev *dev)
|
||||
{
|
||||
struct ublk_ctrl_cmd_data data = {
|
||||
.cmd_op = UBLK_U_CMD_TRY_STOP_DEV,
|
||||
};
|
||||
|
||||
return __ublk_ctrl_cmd(dev, &data);
|
||||
}
|
||||
|
||||
static int ublk_ctrl_start_dev(struct ublk_dev *dev,
|
||||
int daemon_pid)
|
||||
{
|
||||
|
|
@ -415,14 +425,18 @@ static void ublk_queue_deinit(struct ublk_queue *q)
|
|||
if (q->io_cmd_buf)
|
||||
munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q));
|
||||
|
||||
for (i = 0; i < nr_ios; i++)
|
||||
for (i = 0; i < nr_ios; i++) {
|
||||
free(q->ios[i].buf_addr);
|
||||
free(q->ios[i].integrity_buf);
|
||||
}
|
||||
}
|
||||
|
||||
static void ublk_thread_deinit(struct ublk_thread *t)
|
||||
{
|
||||
io_uring_unregister_buffers(&t->ring);
|
||||
|
||||
ublk_batch_free_buf(t);
|
||||
|
||||
io_uring_unregister_ring_fd(&t->ring);
|
||||
|
||||
if (t->ring.ring_fd > 0) {
|
||||
|
|
@ -432,19 +446,22 @@ static void ublk_thread_deinit(struct ublk_thread *t)
|
|||
}
|
||||
}
|
||||
|
||||
static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
|
||||
static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags,
|
||||
__u8 metadata_size)
|
||||
{
|
||||
struct ublk_dev *dev = q->dev;
|
||||
int depth = dev->dev_info.queue_depth;
|
||||
int i;
|
||||
int cmd_buf_size, io_buf_size;
|
||||
int cmd_buf_size, io_buf_size, integrity_size;
|
||||
unsigned long off;
|
||||
|
||||
pthread_spin_init(&q->lock, PTHREAD_PROCESS_PRIVATE);
|
||||
q->tgt_ops = dev->tgt.ops;
|
||||
q->flags = 0;
|
||||
q->q_depth = depth;
|
||||
q->flags = dev->dev_info.flags;
|
||||
q->flags |= extra_flags;
|
||||
q->metadata_size = metadata_size;
|
||||
|
||||
/* Cache fd in queue for fast path access */
|
||||
q->ublk_fd = dev->fds[0];
|
||||
|
|
@ -460,11 +477,23 @@ static int ublk_queue_init(struct ublk_queue *q, unsigned long long extra_flags)
|
|||
}
|
||||
|
||||
io_buf_size = dev->dev_info.max_io_buf_bytes;
|
||||
integrity_size = ublk_integrity_len(q, io_buf_size);
|
||||
for (i = 0; i < q->q_depth; i++) {
|
||||
q->ios[i].buf_addr = NULL;
|
||||
q->ios[i].flags = UBLKS_IO_NEED_FETCH_RQ | UBLKS_IO_FREE;
|
||||
q->ios[i].tag = i;
|
||||
|
||||
if (integrity_size) {
|
||||
q->ios[i].integrity_buf = malloc(integrity_size);
|
||||
if (!q->ios[i].integrity_buf) {
|
||||
ublk_err("ublk dev %d queue %d io %d malloc(%d) failed: %m\n",
|
||||
dev->dev_info.dev_id, q->q_id, i,
|
||||
integrity_size);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (ublk_queue_no_buf(q))
|
||||
continue;
|
||||
|
||||
|
|
@ -491,6 +520,10 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag
|
|||
int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth;
|
||||
int ret;
|
||||
|
||||
/* FETCH_IO_CMDS is multishot, so increase cq depth for BATCH_IO */
|
||||
if (ublk_dev_batch_io(dev))
|
||||
cq_depth += dev->dev_info.queue_depth * 2;
|
||||
|
||||
ret = ublk_setup_ring(&t->ring, ring_depth, cq_depth,
|
||||
IORING_SETUP_COOP_TASKRUN |
|
||||
IORING_SETUP_SINGLE_ISSUER |
|
||||
|
|
@ -505,15 +538,33 @@ static int ublk_thread_init(struct ublk_thread *t, unsigned long long extra_flag
|
|||
unsigned nr_ios = dev->dev_info.queue_depth * dev->dev_info.nr_hw_queues;
|
||||
unsigned max_nr_ios_per_thread = nr_ios / dev->nthreads;
|
||||
max_nr_ios_per_thread += !!(nr_ios % dev->nthreads);
|
||||
ret = io_uring_register_buffers_sparse(
|
||||
&t->ring, max_nr_ios_per_thread);
|
||||
|
||||
t->nr_bufs = max_nr_ios_per_thread;
|
||||
} else {
|
||||
t->nr_bufs = 0;
|
||||
}
|
||||
|
||||
if (ublk_dev_batch_io(dev))
|
||||
ublk_batch_prepare(t);
|
||||
|
||||
if (t->nr_bufs) {
|
||||
ret = io_uring_register_buffers_sparse(&t->ring, t->nr_bufs);
|
||||
if (ret) {
|
||||
ublk_err("ublk dev %d thread %d register spare buffers failed %d",
|
||||
ublk_err("ublk dev %d thread %d register spare buffers failed %d\n",
|
||||
dev->dev_info.dev_id, t->idx, ret);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
if (ublk_dev_batch_io(dev)) {
|
||||
ret = ublk_batch_alloc_buf(t);
|
||||
if (ret) {
|
||||
ublk_err("ublk dev %d thread %d alloc batch buf failed %d\n",
|
||||
dev->dev_info.dev_id, t->idx, ret);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
|
||||
io_uring_register_ring_fd(&t->ring);
|
||||
|
||||
if (flags & UBLKS_Q_NO_UBLK_FIXED_FD) {
|
||||
|
|
@ -579,16 +630,17 @@ static void ublk_dev_unprep(struct ublk_dev *dev)
|
|||
close(dev->fds[0]);
|
||||
}
|
||||
|
||||
static void ublk_set_auto_buf_reg(const struct ublk_queue *q,
|
||||
static void ublk_set_auto_buf_reg(const struct ublk_thread *t,
|
||||
const struct ublk_queue *q,
|
||||
struct io_uring_sqe *sqe,
|
||||
unsigned short tag)
|
||||
{
|
||||
struct ublk_auto_buf_reg buf = {};
|
||||
|
||||
if (q->tgt_ops->buf_index)
|
||||
buf.index = q->tgt_ops->buf_index(q, tag);
|
||||
buf.index = q->tgt_ops->buf_index(t, q, tag);
|
||||
else
|
||||
buf.index = q->ios[tag].buf_index;
|
||||
buf.index = ublk_io_buf_idx(t, q, tag);
|
||||
|
||||
if (ublk_queue_auto_zc_fallback(q))
|
||||
buf.flags = UBLK_AUTO_BUF_REG_FALLBACK;
|
||||
|
|
@ -607,13 +659,13 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
|
|||
__u8 ublk_op = ublksrv_get_op(iod);
|
||||
__u32 len = iod->nr_sectors << 9;
|
||||
void *addr = io->buf_addr;
|
||||
ssize_t copied;
|
||||
|
||||
if (ublk_op != match_ublk_op)
|
||||
return;
|
||||
|
||||
while (len) {
|
||||
__u32 copy_len = min(len, UBLK_USER_COPY_LEN);
|
||||
ssize_t copied;
|
||||
|
||||
if (ublk_op == UBLK_IO_OP_WRITE)
|
||||
copied = pread(q->ublk_fd, addr, copy_len, off);
|
||||
|
|
@ -626,6 +678,20 @@ static void ublk_user_copy(const struct ublk_io *io, __u8 match_ublk_op)
|
|||
off += copy_len;
|
||||
len -= copy_len;
|
||||
}
|
||||
|
||||
if (!(iod->op_flags & UBLK_IO_F_INTEGRITY))
|
||||
return;
|
||||
|
||||
len = ublk_integrity_len(q, iod->nr_sectors << 9);
|
||||
off = ublk_user_copy_offset(q->q_id, io->tag);
|
||||
off |= UBLKSRV_IO_INTEGRITY_FLAG;
|
||||
if (ublk_op == UBLK_IO_OP_WRITE)
|
||||
copied = pread(q->ublk_fd, io->integrity_buf, len, off);
|
||||
else if (ublk_op == UBLK_IO_OP_READ)
|
||||
copied = pwrite(q->ublk_fd, io->integrity_buf, len, off);
|
||||
else
|
||||
assert(0);
|
||||
assert(copied == (ssize_t)len);
|
||||
}
|
||||
|
||||
int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
|
||||
|
|
@ -690,7 +756,7 @@ int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io)
|
|||
cmd->addr = 0;
|
||||
|
||||
if (ublk_queue_use_auto_zc(q))
|
||||
ublk_set_auto_buf_reg(q, sqe[0], io->tag);
|
||||
ublk_set_auto_buf_reg(t, q, sqe[0], io->tag);
|
||||
|
||||
user_data = build_user_data(io->tag, _IOC_NR(cmd_op), 0, q->q_id, 0);
|
||||
io_uring_sqe_set_data64(sqe[0], user_data);
|
||||
|
|
@ -779,13 +845,15 @@ static void ublk_handle_uring_cmd(struct ublk_thread *t,
|
|||
unsigned tag = user_data_to_tag(cqe->user_data);
|
||||
struct ublk_io *io = &q->ios[tag];
|
||||
|
||||
t->cmd_inflight--;
|
||||
|
||||
if (!fetch) {
|
||||
t->state |= UBLKS_T_STOPPING;
|
||||
io->flags &= ~UBLKS_IO_NEED_FETCH_RQ;
|
||||
}
|
||||
|
||||
if (cqe->res == UBLK_IO_RES_OK) {
|
||||
assert(tag < q->q_depth);
|
||||
ublk_assert(tag < q->q_depth);
|
||||
|
||||
if (ublk_queue_use_user_copy(q))
|
||||
ublk_user_copy(io, UBLK_IO_OP_WRITE);
|
||||
|
|
@ -813,28 +881,30 @@ static void ublk_handle_cqe(struct ublk_thread *t,
|
|||
{
|
||||
struct ublk_dev *dev = t->dev;
|
||||
unsigned q_id = user_data_to_q_id(cqe->user_data);
|
||||
struct ublk_queue *q = &dev->q[q_id];
|
||||
unsigned cmd_op = user_data_to_op(cqe->user_data);
|
||||
|
||||
if (cqe->res < 0 && cqe->res != -ENODEV)
|
||||
ublk_err("%s: res %d userdata %llx queue state %x\n", __func__,
|
||||
cqe->res, cqe->user_data, q->flags);
|
||||
if (cqe->res < 0 && cqe->res != -ENODEV && cqe->res != -ENOBUFS)
|
||||
ublk_err("%s: res %d userdata %llx thread state %x\n", __func__,
|
||||
cqe->res, cqe->user_data, t->state);
|
||||
|
||||
ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n",
|
||||
__func__, cqe->res, q->q_id, user_data_to_tag(cqe->user_data),
|
||||
cmd_op, is_target_io(cqe->user_data),
|
||||
ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (thread %d qid %d tag %u cmd_op %x "
|
||||
"data %lx target %d/%d) stopping %d\n",
|
||||
__func__, cqe->res, t->idx, q_id,
|
||||
user_data_to_tag(cqe->user_data),
|
||||
cmd_op, cqe->user_data, is_target_io(cqe->user_data),
|
||||
user_data_to_tgt_data(cqe->user_data),
|
||||
(t->state & UBLKS_T_STOPPING));
|
||||
|
||||
/* Don't retrieve io in case of target io */
|
||||
if (is_target_io(cqe->user_data)) {
|
||||
ublksrv_handle_tgt_cqe(t, q, cqe);
|
||||
ublksrv_handle_tgt_cqe(t, &dev->q[q_id], cqe);
|
||||
return;
|
||||
}
|
||||
|
||||
t->cmd_inflight--;
|
||||
|
||||
ublk_handle_uring_cmd(t, q, cqe);
|
||||
if (ublk_thread_batch_io(t))
|
||||
ublk_batch_compl_cmd(t, cqe);
|
||||
else
|
||||
ublk_handle_uring_cmd(t, &dev->q[q_id], cqe);
|
||||
}
|
||||
|
||||
static int ublk_reap_events_uring(struct ublk_thread *t)
|
||||
|
|
@ -866,7 +936,13 @@ static int ublk_process_io(struct ublk_thread *t)
|
|||
return -ENODEV;
|
||||
|
||||
ret = io_uring_submit_and_wait(&t->ring, 1);
|
||||
reapped = ublk_reap_events_uring(t);
|
||||
if (ublk_thread_batch_io(t)) {
|
||||
ublk_batch_prep_commit(t);
|
||||
reapped = ublk_reap_events_uring(t);
|
||||
ublk_batch_commit_io_cmds(t);
|
||||
} else {
|
||||
reapped = ublk_reap_events_uring(t);
|
||||
}
|
||||
|
||||
ublk_dbg(UBLK_DBG_THREAD, "submit result %d, reapped %d stop %d idle %d\n",
|
||||
ret, reapped, (t->state & UBLKS_T_STOPPING),
|
||||
|
|
@ -882,6 +958,7 @@ struct ublk_thread_info {
|
|||
sem_t *ready;
|
||||
cpu_set_t *affinity;
|
||||
unsigned long long extra_flags;
|
||||
unsigned char (*q_thread_map)[UBLK_MAX_QUEUES];
|
||||
};
|
||||
|
||||
static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
|
||||
|
|
@ -891,6 +968,26 @@ static void ublk_thread_set_sched_affinity(const struct ublk_thread_info *info)
|
|||
info->dev->dev_info.dev_id, info->idx);
|
||||
}
|
||||
|
||||
static void ublk_batch_setup_queues(struct ublk_thread *t)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < t->dev->dev_info.nr_hw_queues; i++) {
|
||||
struct ublk_queue *q = &t->dev->q[i];
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Only prepare io commands in the mapped thread context,
|
||||
* otherwise io command buffer index may not work as expected
|
||||
*/
|
||||
if (t->q_map[i] == 0)
|
||||
continue;
|
||||
|
||||
ret = ublk_batch_queue_prep_io_cmds(t, q);
|
||||
ublk_assert(ret >= 0);
|
||||
}
|
||||
}
|
||||
|
||||
static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_info *info)
|
||||
{
|
||||
struct ublk_thread t = {
|
||||
|
|
@ -900,6 +997,10 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
|
|||
int dev_id = info->dev->dev_info.dev_id;
|
||||
int ret;
|
||||
|
||||
/* Copy per-thread queue mapping into thread-local variable */
|
||||
if (info->q_thread_map)
|
||||
memcpy(t.q_map, info->q_thread_map[info->idx], sizeof(t.q_map));
|
||||
|
||||
ret = ublk_thread_init(&t, info->extra_flags);
|
||||
if (ret) {
|
||||
ublk_err("ublk dev %d thread %u init failed\n",
|
||||
|
|
@ -911,8 +1012,14 @@ static __attribute__((noinline)) int __ublk_io_handler_fn(struct ublk_thread_inf
|
|||
ublk_dbg(UBLK_DBG_THREAD, "tid %d: ublk dev %d thread %u started\n",
|
||||
gettid(), dev_id, t.idx);
|
||||
|
||||
/* submit all io commands to ublk driver */
|
||||
ublk_submit_fetch_commands(&t);
|
||||
if (!ublk_thread_batch_io(&t)) {
|
||||
/* submit all io commands to ublk driver */
|
||||
ublk_submit_fetch_commands(&t);
|
||||
} else {
|
||||
ublk_batch_setup_queues(&t);
|
||||
ublk_batch_start_fetch(&t);
|
||||
}
|
||||
|
||||
do {
|
||||
if (ublk_process_io(&t) < 0)
|
||||
break;
|
||||
|
|
@ -984,6 +1091,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
struct ublk_thread_info *tinfo;
|
||||
unsigned long long extra_flags = 0;
|
||||
cpu_set_t *affinity_buf;
|
||||
unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
|
||||
void *thread_ret;
|
||||
sem_t ready;
|
||||
int ret, i;
|
||||
|
|
@ -1003,6 +1111,16 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (ublk_dev_batch_io(dev)) {
|
||||
q_thread_map = calloc(dev->nthreads, sizeof(*q_thread_map));
|
||||
if (!q_thread_map) {
|
||||
ret = -ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
ublk_batch_setup_map(q_thread_map, dev->nthreads,
|
||||
dinfo->nr_hw_queues);
|
||||
}
|
||||
|
||||
if (ctx->auto_zc_fallback)
|
||||
extra_flags = UBLKS_Q_AUTO_BUF_REG_FALLBACK;
|
||||
if (ctx->no_ublk_fixed_fd)
|
||||
|
|
@ -1012,7 +1130,8 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
dev->q[i].dev = dev;
|
||||
dev->q[i].q_id = i;
|
||||
|
||||
ret = ublk_queue_init(&dev->q[i], extra_flags);
|
||||
ret = ublk_queue_init(&dev->q[i], extra_flags,
|
||||
ctx->metadata_size);
|
||||
if (ret) {
|
||||
ublk_err("ublk dev %d queue %d init queue failed\n",
|
||||
dinfo->dev_id, i);
|
||||
|
|
@ -1025,6 +1144,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
tinfo[i].idx = i;
|
||||
tinfo[i].ready = &ready;
|
||||
tinfo[i].extra_flags = extra_flags;
|
||||
tinfo[i].q_thread_map = q_thread_map;
|
||||
|
||||
/*
|
||||
* If threads are not tied 1:1 to queues, setting thread
|
||||
|
|
@ -1044,6 +1164,7 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
for (i = 0; i < dev->nthreads; i++)
|
||||
sem_wait(&ready);
|
||||
free(affinity_buf);
|
||||
free(q_thread_map);
|
||||
|
||||
/* everything is fine now, start us */
|
||||
if (ctx->recovery)
|
||||
|
|
@ -1214,7 +1335,8 @@ static int __cmd_dev_add(const struct dev_ctx *ctx)
|
|||
goto fail;
|
||||
}
|
||||
|
||||
if (nthreads != nr_queues && !ctx->per_io_tasks) {
|
||||
if (nthreads != nr_queues && (!ctx->per_io_tasks &&
|
||||
!(ctx->flags & UBLK_F_BATCH_IO))) {
|
||||
ublk_err("%s: threads %u must be same as queues %u if "
|
||||
"not using per_io_tasks\n",
|
||||
__func__, nthreads, nr_queues);
|
||||
|
|
@ -1394,6 +1516,42 @@ static int cmd_dev_del(struct dev_ctx *ctx)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static int cmd_dev_stop(struct dev_ctx *ctx)
|
||||
{
|
||||
int number = ctx->dev_id;
|
||||
struct ublk_dev *dev;
|
||||
int ret;
|
||||
|
||||
if (number < 0) {
|
||||
ublk_err("%s: device id is required\n", __func__);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dev = ublk_ctrl_init();
|
||||
dev->dev_info.dev_id = number;
|
||||
|
||||
ret = ublk_ctrl_get_info(dev);
|
||||
if (ret < 0)
|
||||
goto fail;
|
||||
|
||||
if (ctx->safe_stop) {
|
||||
ret = ublk_ctrl_try_stop_dev(dev);
|
||||
if (ret < 0)
|
||||
ublk_err("%s: try_stop dev %d failed ret %d\n",
|
||||
__func__, number, ret);
|
||||
} else {
|
||||
ret = ublk_ctrl_stop_dev(dev);
|
||||
if (ret < 0)
|
||||
ublk_err("%s: stop dev %d failed ret %d\n",
|
||||
__func__, number, ret);
|
||||
}
|
||||
|
||||
fail:
|
||||
ublk_ctrl_deinit(dev);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __cmd_dev_list(struct dev_ctx *ctx)
|
||||
{
|
||||
struct ublk_dev *dev = ublk_ctrl_init();
|
||||
|
|
@ -1456,6 +1614,10 @@ static int cmd_dev_get_features(void)
|
|||
FEAT_NAME(UBLK_F_QUIESCE),
|
||||
FEAT_NAME(UBLK_F_PER_IO_DAEMON),
|
||||
FEAT_NAME(UBLK_F_BUF_REG_OFF_DAEMON),
|
||||
FEAT_NAME(UBLK_F_INTEGRITY),
|
||||
FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
|
||||
FEAT_NAME(UBLK_F_BATCH_IO),
|
||||
FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
|
||||
};
|
||||
struct ublk_dev *dev;
|
||||
__u64 features = 0;
|
||||
|
|
@ -1551,6 +1713,9 @@ static void __cmd_create_help(char *exe, bool recovery)
|
|||
printf("\t[--foreground] [--quiet] [-z] [--auto_zc] [--auto_zc_fallback] [--debug_mask mask] [-r 0|1] [-g] [-u]\n");
|
||||
printf("\t[-e 0|1 ] [-i 0|1] [--no_ublk_fixed_fd]\n");
|
||||
printf("\t[--nthreads threads] [--per_io_tasks]\n");
|
||||
printf("\t[--integrity_capable] [--integrity_reftag] [--metadata_size SIZE] "
|
||||
"[--pi_offset OFFSET] [--csum_type ip|t10dif|nvme] [--tag_size SIZE]\n");
|
||||
printf("\t[--batch|-b] [--no_auto_part_scan]\n");
|
||||
printf("\t[target options] [backfile1] [backfile2] ...\n");
|
||||
printf("\tdefault: nr_queues=2(max 32), depth=128(max 1024), dev_id=-1(auto allocation)\n");
|
||||
printf("\tdefault: nthreads=nr_queues");
|
||||
|
|
@ -1583,6 +1748,8 @@ static int cmd_dev_help(char *exe)
|
|||
|
||||
printf("%s del [-n dev_id] -a \n", exe);
|
||||
printf("\t -a delete all devices -n delete specified device\n\n");
|
||||
printf("%s stop -n dev_id [--safe]\n", exe);
|
||||
printf("\t --safe only stop if device has no active openers\n\n");
|
||||
printf("%s list [-n dev_id] -a \n", exe);
|
||||
printf("\t -a list all devices, -n list specified device, default -a \n\n");
|
||||
printf("%s features\n", exe);
|
||||
|
|
@ -1614,6 +1781,15 @@ int main(int argc, char *argv[])
|
|||
{ "nthreads", 1, NULL, 0 },
|
||||
{ "per_io_tasks", 0, NULL, 0 },
|
||||
{ "no_ublk_fixed_fd", 0, NULL, 0 },
|
||||
{ "integrity_capable", 0, NULL, 0 },
|
||||
{ "integrity_reftag", 0, NULL, 0 },
|
||||
{ "metadata_size", 1, NULL, 0 },
|
||||
{ "pi_offset", 1, NULL, 0 },
|
||||
{ "csum_type", 1, NULL, 0 },
|
||||
{ "tag_size", 1, NULL, 0 },
|
||||
{ "safe", 0, NULL, 0 },
|
||||
{ "batch", 0, NULL, 'b'},
|
||||
{ "no_auto_part_scan", 0, NULL, 0 },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
const struct ublk_tgt_ops *ops = NULL;
|
||||
|
|
@ -1625,6 +1801,7 @@ int main(int argc, char *argv[])
|
|||
.nr_hw_queues = 2,
|
||||
.dev_id = -1,
|
||||
.tgt_type = "unknown",
|
||||
.csum_type = LBMD_PI_CSUM_NONE,
|
||||
};
|
||||
int ret = -EINVAL, i;
|
||||
int tgt_argc = 1;
|
||||
|
|
@ -1636,12 +1813,15 @@ int main(int argc, char *argv[])
|
|||
|
||||
opterr = 0;
|
||||
optind = 2;
|
||||
while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazu",
|
||||
while ((opt = getopt_long(argc, argv, "t:n:d:q:r:e:i:s:gazub",
|
||||
longopts, &option_idx)) != -1) {
|
||||
switch (opt) {
|
||||
case 'a':
|
||||
ctx.all = 1;
|
||||
break;
|
||||
case 'b':
|
||||
ctx.flags |= UBLK_F_BATCH_IO;
|
||||
break;
|
||||
case 'n':
|
||||
ctx.dev_id = strtol(optarg, NULL, 10);
|
||||
break;
|
||||
|
|
@ -1699,6 +1879,32 @@ int main(int argc, char *argv[])
|
|||
ctx.per_io_tasks = 1;
|
||||
if (!strcmp(longopts[option_idx].name, "no_ublk_fixed_fd"))
|
||||
ctx.no_ublk_fixed_fd = 1;
|
||||
if (!strcmp(longopts[option_idx].name, "integrity_capable"))
|
||||
ctx.integrity_flags |= LBMD_PI_CAP_INTEGRITY;
|
||||
if (!strcmp(longopts[option_idx].name, "integrity_reftag"))
|
||||
ctx.integrity_flags |= LBMD_PI_CAP_REFTAG;
|
||||
if (!strcmp(longopts[option_idx].name, "metadata_size"))
|
||||
ctx.metadata_size = strtoul(optarg, NULL, 0);
|
||||
if (!strcmp(longopts[option_idx].name, "pi_offset"))
|
||||
ctx.pi_offset = strtoul(optarg, NULL, 0);
|
||||
if (!strcmp(longopts[option_idx].name, "csum_type")) {
|
||||
if (!strcmp(optarg, "ip")) {
|
||||
ctx.csum_type = LBMD_PI_CSUM_IP;
|
||||
} else if (!strcmp(optarg, "t10dif")) {
|
||||
ctx.csum_type = LBMD_PI_CSUM_CRC16_T10DIF;
|
||||
} else if (!strcmp(optarg, "nvme")) {
|
||||
ctx.csum_type = LBMD_PI_CSUM_CRC64_NVME;
|
||||
} else {
|
||||
ublk_err("invalid csum_type: %s\n", optarg);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
if (!strcmp(longopts[option_idx].name, "tag_size"))
|
||||
ctx.tag_size = strtoul(optarg, NULL, 0);
|
||||
if (!strcmp(longopts[option_idx].name, "safe"))
|
||||
ctx.safe_stop = 1;
|
||||
if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
|
||||
ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
|
||||
break;
|
||||
case '?':
|
||||
/*
|
||||
|
|
@ -1722,6 +1928,11 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
|
||||
if (ctx.per_io_tasks && (ctx.flags & UBLK_F_BATCH_IO)) {
|
||||
ublk_err("per_io_task and F_BATCH_IO conflict\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* auto_zc_fallback depends on F_AUTO_BUF_REG & F_SUPPORT_ZERO_COPY */
|
||||
if (ctx.auto_zc_fallback &&
|
||||
!((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
|
||||
|
|
@ -1741,6 +1952,28 @@ int main(int argc, char *argv[])
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (ctx.metadata_size) {
|
||||
if (!(ctx.flags & UBLK_F_USER_COPY)) {
|
||||
ublk_err("integrity requires user_copy\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ctx.flags |= UBLK_F_INTEGRITY;
|
||||
} else if (ctx.integrity_flags ||
|
||||
ctx.pi_offset ||
|
||||
ctx.csum_type != LBMD_PI_CSUM_NONE ||
|
||||
ctx.tag_size) {
|
||||
ublk_err("integrity parameters require metadata_size\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if ((ctx.flags & UBLK_F_AUTO_BUF_REG) &&
|
||||
(ctx.flags & UBLK_F_BATCH_IO) &&
|
||||
(ctx.nthreads > ctx.nr_hw_queues)) {
|
||||
ublk_err("too many threads for F_AUTO_BUF_REG & F_BATCH_IO\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
i = optind;
|
||||
while (i < argc && ctx.nr_files < MAX_BACK_FILES) {
|
||||
ctx.files[ctx.nr_files++] = argv[i++];
|
||||
|
|
@ -1766,6 +1999,8 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
} else if (!strcmp(cmd, "del"))
|
||||
ret = cmd_dev_del(&ctx);
|
||||
else if (!strcmp(cmd, "stop"))
|
||||
ret = cmd_dev_stop(&ctx);
|
||||
else if (!strcmp(cmd, "list")) {
|
||||
ctx.all = 1;
|
||||
ret = cmd_dev_list(&ctx);
|
||||
|
|
|
|||
|
|
@ -78,6 +78,13 @@ struct dev_ctx {
|
|||
unsigned int auto_zc_fallback:1;
|
||||
unsigned int per_io_tasks:1;
|
||||
unsigned int no_ublk_fixed_fd:1;
|
||||
unsigned int safe_stop:1;
|
||||
unsigned int no_auto_part_scan:1;
|
||||
__u32 integrity_flags;
|
||||
__u8 metadata_size;
|
||||
__u8 pi_offset;
|
||||
__u8 csum_type;
|
||||
__u8 tag_size;
|
||||
|
||||
int _evtfd;
|
||||
int _shmid;
|
||||
|
|
@ -107,6 +114,7 @@ struct ublk_ctrl_cmd_data {
|
|||
|
||||
struct ublk_io {
|
||||
char *buf_addr;
|
||||
void *integrity_buf;
|
||||
|
||||
#define UBLKS_IO_NEED_FETCH_RQ (1UL << 0)
|
||||
#define UBLKS_IO_NEED_COMMIT_RQ_COMP (1UL << 1)
|
||||
|
|
@ -143,7 +151,8 @@ struct ublk_tgt_ops {
|
|||
void (*usage)(const struct ublk_tgt_ops *ops);
|
||||
|
||||
/* return buffer index for UBLK_F_AUTO_BUF_REG */
|
||||
unsigned short (*buf_index)(const struct ublk_queue *, int tag);
|
||||
unsigned short (*buf_index)(const struct ublk_thread *t,
|
||||
const struct ublk_queue *, int tag);
|
||||
};
|
||||
|
||||
struct ublk_tgt {
|
||||
|
|
@ -165,23 +174,76 @@ struct ublk_queue {
|
|||
const struct ublk_tgt_ops *tgt_ops;
|
||||
struct ublksrv_io_desc *io_cmd_buf;
|
||||
|
||||
/* borrow one bit of ublk uapi flags, which may never be used */
|
||||
/* borrow three bit of ublk uapi flags, which may never be used */
|
||||
#define UBLKS_Q_AUTO_BUF_REG_FALLBACK (1ULL << 63)
|
||||
#define UBLKS_Q_NO_UBLK_FIXED_FD (1ULL << 62)
|
||||
#define UBLKS_Q_PREPARED (1ULL << 61)
|
||||
__u64 flags;
|
||||
int ublk_fd; /* cached ublk char device fd */
|
||||
__u8 metadata_size;
|
||||
struct ublk_io ios[UBLK_QUEUE_DEPTH];
|
||||
|
||||
/* used for prep io commands */
|
||||
pthread_spinlock_t lock;
|
||||
};
|
||||
|
||||
/* align with `ublk_elem_header` */
|
||||
struct ublk_batch_elem {
|
||||
__u16 tag;
|
||||
__u16 buf_index;
|
||||
__s32 result;
|
||||
__u64 buf_addr;
|
||||
};
|
||||
|
||||
struct batch_commit_buf {
|
||||
unsigned short q_id;
|
||||
unsigned short buf_idx;
|
||||
void *elem;
|
||||
unsigned short done;
|
||||
unsigned short count;
|
||||
};
|
||||
|
||||
struct batch_fetch_buf {
|
||||
struct io_uring_buf_ring *br;
|
||||
void *fetch_buf;
|
||||
unsigned int fetch_buf_size;
|
||||
unsigned int fetch_buf_off;
|
||||
};
|
||||
|
||||
struct ublk_thread {
|
||||
/* Thread-local copy of queue-to-thread mapping for this thread */
|
||||
unsigned char q_map[UBLK_MAX_QUEUES];
|
||||
|
||||
struct ublk_dev *dev;
|
||||
unsigned idx;
|
||||
unsigned short idx;
|
||||
unsigned short nr_queues;
|
||||
|
||||
#define UBLKS_T_STOPPING (1U << 0)
|
||||
#define UBLKS_T_IDLE (1U << 1)
|
||||
#define UBLKS_T_BATCH_IO (1U << 31) /* readonly */
|
||||
unsigned state;
|
||||
unsigned int cmd_inflight;
|
||||
unsigned int io_inflight;
|
||||
|
||||
unsigned short nr_bufs;
|
||||
|
||||
/* followings are for BATCH_IO */
|
||||
unsigned short commit_buf_start;
|
||||
unsigned char commit_buf_elem_size;
|
||||
/*
|
||||
* We just support single device, so pre-calculate commit/prep flags
|
||||
*/
|
||||
unsigned short cmd_flags;
|
||||
unsigned int nr_commit_buf;
|
||||
unsigned int commit_buf_size;
|
||||
void *commit_buf;
|
||||
#define UBLKS_T_COMMIT_BUF_INV_IDX ((unsigned short)-1)
|
||||
struct allocator commit_buf_alloc;
|
||||
struct batch_commit_buf *commit;
|
||||
/* FETCH_IO_CMDS buffer */
|
||||
unsigned short nr_fetch_bufs;
|
||||
struct batch_fetch_buf *fetch;
|
||||
|
||||
struct io_uring ring;
|
||||
};
|
||||
|
||||
|
|
@ -202,6 +264,55 @@ struct ublk_dev {
|
|||
|
||||
extern int ublk_queue_io_cmd(struct ublk_thread *t, struct ublk_io *io);
|
||||
|
||||
static inline int __ublk_use_batch_io(__u64 flags)
|
||||
{
|
||||
return flags & UBLK_F_BATCH_IO;
|
||||
}
|
||||
|
||||
static inline int ublk_queue_batch_io(const struct ublk_queue *q)
|
||||
{
|
||||
return __ublk_use_batch_io(q->flags);
|
||||
}
|
||||
|
||||
static inline int ublk_dev_batch_io(const struct ublk_dev *dev)
|
||||
{
|
||||
return __ublk_use_batch_io(dev->dev_info.flags);
|
||||
}
|
||||
|
||||
/* only work for handle single device in this pthread context */
|
||||
static inline int ublk_thread_batch_io(const struct ublk_thread *t)
|
||||
{
|
||||
return t->state & UBLKS_T_BATCH_IO;
|
||||
}
|
||||
|
||||
static inline void ublk_set_integrity_params(const struct dev_ctx *ctx,
|
||||
struct ublk_params *params)
|
||||
{
|
||||
if (!ctx->metadata_size)
|
||||
return;
|
||||
|
||||
params->types |= UBLK_PARAM_TYPE_INTEGRITY;
|
||||
params->integrity = (struct ublk_param_integrity) {
|
||||
.flags = ctx->integrity_flags,
|
||||
.interval_exp = params->basic.logical_bs_shift,
|
||||
.metadata_size = ctx->metadata_size,
|
||||
.pi_offset = ctx->pi_offset,
|
||||
.csum_type = ctx->csum_type,
|
||||
.tag_size = ctx->tag_size,
|
||||
};
|
||||
}
|
||||
|
||||
static inline size_t ublk_integrity_len(const struct ublk_queue *q, size_t len)
|
||||
{
|
||||
/* All targets currently use interval_exp = logical_bs_shift = 9 */
|
||||
return (len >> 9) * q->metadata_size;
|
||||
}
|
||||
|
||||
static inline size_t
|
||||
ublk_integrity_data_len(const struct ublk_queue *q, size_t integrity_len)
|
||||
{
|
||||
return (integrity_len / q->metadata_size) << 9;
|
||||
}
|
||||
|
||||
static inline int ublk_io_auto_zc_fallback(const struct ublksrv_io_desc *iod)
|
||||
{
|
||||
|
|
@ -224,9 +335,9 @@ static inline __u64 build_user_data(unsigned tag, unsigned op,
|
|||
{
|
||||
/* we only have 7 bits to encode q_id */
|
||||
_Static_assert(UBLK_MAX_QUEUES_SHIFT <= 7, "UBLK_MAX_QUEUES_SHIFT must be <= 7");
|
||||
assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
|
||||
ublk_assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16) && !(q_id >> 7));
|
||||
|
||||
return tag | (op << 16) | (tgt_data << 24) |
|
||||
return tag | ((__u64)op << 16) | ((__u64)tgt_data << 24) |
|
||||
(__u64)q_id << 56 | (__u64)is_target_io << 63;
|
||||
}
|
||||
|
||||
|
|
@ -357,35 +468,24 @@ static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op)
|
|||
addr[1] = 0;
|
||||
}
|
||||
|
||||
static inline unsigned short ublk_batch_io_buf_idx(
|
||||
const struct ublk_thread *t, const struct ublk_queue *q,
|
||||
unsigned tag);
|
||||
|
||||
static inline unsigned short ublk_io_buf_idx(const struct ublk_thread *t,
|
||||
const struct ublk_queue *q,
|
||||
unsigned tag)
|
||||
{
|
||||
if (ublk_queue_batch_io(q))
|
||||
return ublk_batch_io_buf_idx(t, q, tag);
|
||||
return q->ios[tag].buf_index;
|
||||
}
|
||||
|
||||
static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag)
|
||||
{
|
||||
return &q->ios[tag];
|
||||
}
|
||||
|
||||
static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int res)
|
||||
{
|
||||
struct ublk_io *io = &q->ios[tag];
|
||||
|
||||
ublk_mark_io_done(io, res);
|
||||
|
||||
return ublk_queue_io_cmd(t, io);
|
||||
}
|
||||
|
||||
static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int queued)
|
||||
{
|
||||
if (queued < 0)
|
||||
ublk_complete_io(t, q, tag, queued);
|
||||
else {
|
||||
struct ublk_io *io = ublk_get_io(q, tag);
|
||||
|
||||
t->io_inflight += queued;
|
||||
io->tgt_ios = queued;
|
||||
io->result = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int ublk_completed_tgt_io(struct ublk_thread *t,
|
||||
struct ublk_queue *q, unsigned tag)
|
||||
{
|
||||
|
|
@ -421,12 +521,90 @@ static inline int ublk_queue_no_buf(const struct ublk_queue *q)
|
|||
return ublk_queue_use_zc(q) || ublk_queue_use_auto_zc(q);
|
||||
}
|
||||
|
||||
static inline int ublk_batch_commit_prepared(struct batch_commit_buf *cb)
|
||||
{
|
||||
return cb->buf_idx != UBLKS_T_COMMIT_BUF_INV_IDX;
|
||||
}
|
||||
|
||||
static inline unsigned ublk_queue_idx_in_thread(const struct ublk_thread *t,
|
||||
const struct ublk_queue *q)
|
||||
{
|
||||
unsigned char idx;
|
||||
|
||||
idx = t->q_map[q->q_id];
|
||||
ublk_assert(idx != 0);
|
||||
return idx - 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Each IO's buffer index has to be calculated by this helper for
|
||||
* UBLKS_T_BATCH_IO
|
||||
*/
|
||||
static inline unsigned short ublk_batch_io_buf_idx(
|
||||
const struct ublk_thread *t, const struct ublk_queue *q,
|
||||
unsigned tag)
|
||||
{
|
||||
return ublk_queue_idx_in_thread(t, q) * q->q_depth + tag;
|
||||
}
|
||||
|
||||
/* Queue UBLK_U_IO_PREP_IO_CMDS for a specific queue with batch elements */
|
||||
int ublk_batch_queue_prep_io_cmds(struct ublk_thread *t, struct ublk_queue *q);
|
||||
/* Start fetching I/O commands using multishot UBLK_U_IO_FETCH_IO_CMDS */
|
||||
void ublk_batch_start_fetch(struct ublk_thread *t);
|
||||
/* Handle completion of batch I/O commands (prep/commit) */
|
||||
void ublk_batch_compl_cmd(struct ublk_thread *t,
|
||||
const struct io_uring_cqe *cqe);
|
||||
/* Initialize batch I/O state and calculate buffer parameters */
|
||||
void ublk_batch_prepare(struct ublk_thread *t);
|
||||
/* Allocate and register commit buffers for batch operations */
|
||||
int ublk_batch_alloc_buf(struct ublk_thread *t);
|
||||
/* Free commit buffers and cleanup batch allocator */
|
||||
void ublk_batch_free_buf(struct ublk_thread *t);
|
||||
|
||||
/* Prepare a new commit buffer for batching completed I/O operations */
|
||||
void ublk_batch_prep_commit(struct ublk_thread *t);
|
||||
/* Submit UBLK_U_IO_COMMIT_IO_CMDS with batched completed I/O operations */
|
||||
void ublk_batch_commit_io_cmds(struct ublk_thread *t);
|
||||
/* Add a completed I/O operation to the current batch commit buffer */
|
||||
void ublk_batch_complete_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int res);
|
||||
void ublk_batch_setup_map(unsigned char (*q_thread_map)[UBLK_MAX_QUEUES],
|
||||
int nthreads, int queues);
|
||||
|
||||
static inline int ublk_complete_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int res)
|
||||
{
|
||||
if (ublk_queue_batch_io(q)) {
|
||||
ublk_batch_complete_io(t, q, tag, res);
|
||||
return 0;
|
||||
} else {
|
||||
struct ublk_io *io = &q->ios[tag];
|
||||
|
||||
ublk_mark_io_done(io, res);
|
||||
return ublk_queue_io_cmd(t, io);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue *q,
|
||||
unsigned tag, int queued)
|
||||
{
|
||||
if (queued < 0)
|
||||
ublk_complete_io(t, q, tag, queued);
|
||||
else {
|
||||
struct ublk_io *io = ublk_get_io(q, tag);
|
||||
|
||||
t->io_inflight += queued;
|
||||
io->tgt_ios = queued;
|
||||
io->result = 0;
|
||||
}
|
||||
}
|
||||
|
||||
extern const struct ublk_tgt_ops null_tgt_ops;
|
||||
extern const struct ublk_tgt_ops loop_tgt_ops;
|
||||
extern const struct ublk_tgt_ops stripe_tgt_ops;
|
||||
extern const struct ublk_tgt_ops fault_inject_tgt_ops;
|
||||
|
||||
void backing_file_tgt_deinit(struct ublk_dev *dev);
|
||||
int backing_file_tgt_init(struct ublk_dev *dev);
|
||||
int backing_file_tgt_init(struct ublk_dev *dev, unsigned int nr_direct);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
36
tools/testing/selftests/ublk/metadata_size.c
Normal file
36
tools/testing/selftests/ublk/metadata_size.c
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include <fcntl.h>
|
||||
#include <linux/fs.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/ioctl.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct logical_block_metadata_cap cap = {};
|
||||
const char *filename;
|
||||
int fd;
|
||||
int result;
|
||||
|
||||
if (argc != 2) {
|
||||
fprintf(stderr, "Usage: %s BLOCK_DEVICE\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
filename = argv[1];
|
||||
fd = open(filename, O_RDONLY);
|
||||
if (fd < 0) {
|
||||
perror(filename);
|
||||
return 1;
|
||||
}
|
||||
|
||||
result = ioctl(fd, FS_IOC_GETLBMD_CAP, &cap);
|
||||
if (result < 0) {
|
||||
perror("ioctl");
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("metadata_size: %u\n", cap.lbmd_size);
|
||||
printf("pi_offset: %u\n", cap.lbmd_pi_offset);
|
||||
printf("pi_tuple_size: %u\n", cap.lbmd_pi_size);
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -36,6 +36,7 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
.max_segments = 32,
|
||||
},
|
||||
};
|
||||
ublk_set_integrity_params(ctx, &dev->tgt.params);
|
||||
|
||||
if (info->flags & UBLK_F_SUPPORT_ZERO_COPY)
|
||||
dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth;
|
||||
|
|
@ -43,12 +44,12 @@ static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev)
|
|||
}
|
||||
|
||||
static void __setup_nop_io(int tag, const struct ublksrv_io_desc *iod,
|
||||
struct io_uring_sqe *sqe, int q_id)
|
||||
struct io_uring_sqe *sqe, int q_id, unsigned buf_idx)
|
||||
{
|
||||
unsigned ublk_op = ublksrv_get_op(iod);
|
||||
|
||||
io_uring_prep_nop(sqe);
|
||||
sqe->buf_index = tag;
|
||||
sqe->buf_index = buf_idx;
|
||||
sqe->flags |= IOSQE_FIXED_FILE;
|
||||
sqe->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT;
|
||||
sqe->len = iod->nr_sectors << 9; /* injected result */
|
||||
|
|
@ -60,18 +61,19 @@ static int null_queue_zc_io(struct ublk_thread *t, struct ublk_queue *q,
|
|||
{
|
||||
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
|
||||
struct io_uring_sqe *sqe[3];
|
||||
unsigned short buf_idx = ublk_io_buf_idx(t, q, tag);
|
||||
|
||||
ublk_io_alloc_sqes(t, sqe, 3);
|
||||
|
||||
io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
|
||||
io_uring_prep_buf_register(sqe[0], q, tag, q->q_id, buf_idx);
|
||||
sqe[0]->user_data = build_user_data(tag,
|
||||
ublk_cmd_op_nr(sqe[0]->cmd_op), 0, q->q_id, 1);
|
||||
sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK;
|
||||
|
||||
__setup_nop_io(tag, iod, sqe[1], q->q_id);
|
||||
__setup_nop_io(tag, iod, sqe[1], q->q_id, buf_idx);
|
||||
sqe[1]->flags |= IOSQE_IO_HARDLINK;
|
||||
|
||||
io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, ublk_get_io(q, tag)->buf_index);
|
||||
io_uring_prep_buf_unregister(sqe[2], q, tag, q->q_id, buf_idx);
|
||||
sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, q->q_id, 1);
|
||||
|
||||
// buf register is marked as IOSQE_CQE_SKIP_SUCCESS
|
||||
|
|
@ -85,7 +87,7 @@ static int null_queue_auto_zc_io(struct ublk_thread *t, struct ublk_queue *q,
|
|||
struct io_uring_sqe *sqe[1];
|
||||
|
||||
ublk_io_alloc_sqes(t, sqe, 1);
|
||||
__setup_nop_io(tag, iod, sqe[0], q->q_id);
|
||||
__setup_nop_io(tag, iod, sqe[0], q->q_id, ublk_io_buf_idx(t, q, tag));
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -136,11 +138,12 @@ static int ublk_null_queue_io(struct ublk_thread *t, struct ublk_queue *q,
|
|||
* return invalid buffer index for triggering auto buffer register failure,
|
||||
* then UBLK_IO_RES_NEED_REG_BUF handling is covered
|
||||
*/
|
||||
static unsigned short ublk_null_buf_index(const struct ublk_queue *q, int tag)
|
||||
static unsigned short ublk_null_buf_index(const struct ublk_thread *t,
|
||||
const struct ublk_queue *q, int tag)
|
||||
{
|
||||
if (ublk_queue_auto_zc_fallback(q))
|
||||
return (unsigned short)-1;
|
||||
return q->ios[tag].buf_index;
|
||||
return ublk_io_buf_idx(t, q, tag);
|
||||
}
|
||||
|
||||
const struct ublk_tgt_ops null_tgt_ops = {
|
||||
|
|
|
|||
1
tools/testing/selftests/ublk/settings
Normal file
1
tools/testing/selftests/ublk/settings
Normal file
|
|
@ -0,0 +1 @@
|
|||
timeout=150
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue