Fix netfslib such that when it's making an unbuffered or DIO write, it sends
each subrequest strictly sequentially, waiting until the previous one is
'committed' before sending the next, so that we don't have pieces landing out
of order and potentially leaving a hole if an error occurs (ENOSPC, for
example).
This is done by copying in just those bits of issuing, collecting and
retrying subrequests that are necessary to do one subrequest at a time.
Retrying, in particular, is simpler because if the current subrequest needs
retrying, the source iterator can just be copied again and the subrequest
prepped and issued again without needing to be concerned about whether it
needs merging with the previous or next in the sequence.
Note that the issuing loop waits for a subrequest to complete right after
issuing it, but this wait could be moved elsewhere allowing preparatory
steps to be performed whilst the subrequest is in progress. In particular,
once content encryption is available in netfslib, that could be done whilst
waiting, as could cleanup of buffers that have been completed.
Fixes: 153a9961b5 ("netfs: Implement unbuffered/DIO write support")
Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://patch.msgid.link/58526.1772112753@warthog.procyon.org.uk
Tested-by: Steve French <sfrench@samba.org>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
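
For illustration, here is a minimal sketch of the strictly sequential
issue-and-wait loop the message above describes. It is not the patch itself:
the helper sketch_issue_one_and_wait() and the constant SKETCH_MAX_PART are
hypothetical stand-ins for netfslib's actual subrequest issuing and waiting
machinery; only wreq->start and wreq->len are real fields.

/* Sketch only: helper names below are hypothetical, not the netfslib API. */
static ssize_t sketch_issue_dio_write(struct netfs_io_request *wreq,
				      struct iov_iter *source)
{
	unsigned long long start = wreq->start;
	size_t remain = wreq->len;

	while (remain) {
		/* Copy the iterator so a retry can simply re-slice from this
		 * point, with no merging with the previous or next subreq.
		 */
		struct iov_iter part_iter = *source;
		size_t part = min_t(size_t, remain, SKETCH_MAX_PART);
		ssize_t done;

		/* Issue one subrequest and wait for it to be committed
		 * before the next is sent, so nothing lands out of order.
		 */
		done = sketch_issue_one_and_wait(wreq, &part_iter, start, part);
		if (done < 0)
			return done;	/* e.g. ENOSPC: stop; no hole is left */
		if (done == 0)
			return -EIO;	/* defensive: avoid stalling the sketch */

		/* A short transfer just loops again: the remainder is
		 * re-prepped and reissued from the advanced iterator.
		 */
		iov_iter_advance(source, done);
		start += done;
		remain -= done;
	}
	return wreq->len;
}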
510 lines
15 KiB
C
// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem write subrequest result collection, assessment
 * and retrying.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include "internal.h"

/* Notes made in the collector */
#define HIT_PENDING	0x01	/* A front op was still pending */
#define NEED_REASSESS	0x02	/* Need to loop round and reassess */
#define MADE_PROGRESS	0x04	/* Made progress cleaning up a stream or the folio set */
#define NEED_UNLOCK	0x08	/* The pagecache needs unlocking */
#define NEED_RETRY	0x10	/* A front op requests retrying */
#define SAW_FAILURE	0x20	/* One stream or hit a permanent failure */

static void netfs_dump_request(const struct netfs_io_request *rreq)
{
	pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
	       rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
	       rreq->origin, rreq->error);
	pr_err(" st=%llx tsl=%zx/%llx/%llx\n",
	       rreq->start, rreq->transferred, rreq->submitted, rreq->len);
	pr_err(" cci=%llx/%llx/%llx\n",
	       rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
	pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write);
	for (int i = 0; i < NR_IO_STREAMS; i++) {
		const struct netfs_io_subrequest *sreq;
		const struct netfs_io_stream *s = &rreq->io_streams[i];

		pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
		       s->stream_nr, s->source, s->error,
		       s->avail, s->active, s->need_retry, s->failed);
		pr_err(" str[%x] ct=%llx t=%zx\n",
		       s->stream_nr, s->collected_to, s->transferred);
		list_for_each_entry(sreq, &s->subrequests, rreq_link) {
			pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
			       sreq->stream_nr, sreq->debug_index, sreq->source,
			       sreq->start, sreq->transferred, sreq->len,
			       refcount_read(&sreq->ref), sreq->flags);
		}
	}
}

/*
 * Successful completion of write of a folio to the server and/or cache.  Note
 * that we are not allowed to lock the folio here on pain of deadlocking with
 * truncate.
 */
int netfs_folio_written_back(struct folio *folio)
{
	enum netfs_folio_trace why = netfs_folio_trace_clear;
	struct netfs_inode *ictx = netfs_inode(folio->mapping->host);
	struct netfs_folio *finfo;
	struct netfs_group *group = NULL;
	int gcount = 0;

	if ((finfo = netfs_folio_info(folio))) {
		/* Streaming writes cannot be redirtied whilst under writeback,
		 * so discard the streaming record.
		 */
		unsigned long long fend;

		fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len;
		if (fend > ictx->zero_point)
			ictx->zero_point = fend;

		folio_detach_private(folio);
		group = finfo->netfs_group;
		gcount++;
		kfree(finfo);
		why = netfs_folio_trace_clear_s;
		goto end_wb;
	}

	if ((group = netfs_folio_group(folio))) {
		if (group == NETFS_FOLIO_COPY_TO_CACHE) {
			why = netfs_folio_trace_clear_cc;
			folio_detach_private(folio);
			goto end_wb;
		}

		/* Need to detach the group pointer if the page didn't get
		 * redirtied.  If it has been redirtied, then it must be within
		 * the same group.
		 */
		why = netfs_folio_trace_redirtied;
		if (!folio_test_dirty(folio)) {
			folio_detach_private(folio);
			gcount++;
			why = netfs_folio_trace_clear_g;
		}
	}

end_wb:
	trace_netfs_folio(folio, why);
	folio_end_writeback(folio);
	return gcount;
}

/*
 * Unlock any folios we've finished with.
 */
static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
					  unsigned int *notes)
{
	struct folio_queue *folioq = wreq->buffer.tail;
	unsigned long long collected_to = wreq->collected_to;
	unsigned int slot = wreq->buffer.first_tail_slot;

	if (WARN_ON_ONCE(!folioq)) {
		pr_err("[!] Writeback unlock found empty rolling buffer!\n");
		netfs_dump_request(wreq);
		return;
	}

	if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
		if (netfs_pgpriv2_unlock_copied_folios(wreq))
			*notes |= MADE_PROGRESS;
		return;
	}

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = rolling_buffer_delete_spent(&wreq->buffer);
		if (!folioq)
			return;
		slot = 0;
	}

	for (;;) {
		struct folio *folio;
		struct netfs_folio *finfo;
		unsigned long long fpos, fend;
		size_t fsize, flen;

		folio = folioq_folio(folioq, slot);
		if (WARN_ONCE(!folio_test_writeback(folio),
			      "R=%08x: folio %lx is not under writeback\n",
			      wreq->debug_id, folio->index))
			trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);

		fpos = folio_pos(folio);
		fsize = folio_size(folio);
		finfo = netfs_folio_info(folio);
		flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;

		fend = min_t(unsigned long long, fpos + flen, wreq->i_size);

		trace_netfs_collect_folio(wreq, folio, fend, collected_to);

		/* Unlock any folio we've transferred all of. */
		if (collected_to < fend)
			break;

		wreq->nr_group_rel += netfs_folio_written_back(folio);
		wreq->cleaned_to = fpos + fsize;
		*notes |= MADE_PROGRESS;

		/* Clean up the head folioq.  If we clear an entire folioq, then
		 * we can get rid of it provided it's not also the tail folioq
		 * being filled by the issuer.
		 */
		folioq_clear(folioq, slot);
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			folioq = rolling_buffer_delete_spent(&wreq->buffer);
			if (!folioq)
				goto done;
			slot = 0;
		}

		if (fpos + fsize >= collected_to)
			break;
	}

	wreq->buffer.tail = folioq;
done:
	wreq->buffer.first_tail_slot = slot;
}

/*
 * Collect and assess the results of various write subrequests.  We may need to
 * retry some of the results - or even do an RMW cycle for content crypto.
 *
 * Note that we have a number of parallel, overlapping lists of subrequests,
 * one to the server and one to the local cache for example, which may not be
 * the same size or starting position and may not even correspond in boundary
 * alignment.
 */
static void netfs_collect_write_results(struct netfs_io_request *wreq)
{
	struct netfs_io_subrequest *front, *remove;
	struct netfs_io_stream *stream;
	unsigned long long collected_to, issued_to;
	unsigned int notes;
	int s;

	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
	trace_netfs_collect(wreq);
	trace_netfs_rreq(wreq, netfs_rreq_trace_collect);

reassess_streams:
	issued_to = atomic64_read(&wreq->issued_to);
	smp_rmb();
	collected_to = ULLONG_MAX;
	if (wreq->origin == NETFS_WRITEBACK ||
	    wreq->origin == NETFS_WRITETHROUGH ||
	    wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
		notes = NEED_UNLOCK;
	else
		notes = 0;

	/* Remove completed subrequests from the front of the streams and
	 * advance the completion point on each stream.  We stop when we hit
	 * something that's in progress.  The issuer thread may be adding stuff
	 * to the tail whilst we're doing this.
	 */
	for (s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		/* Read active flag before list pointers */
		if (!smp_load_acquire(&stream->active))
			continue;

		front = stream->front;
		while (front) {
			trace_netfs_collect_sreq(wreq, front);
			//_debug("sreq [%x] %llx %zx/%zx",
			//       front->debug_index, front->start, front->transferred, front->len);

			if (stream->collected_to < front->start) {
				trace_netfs_collect_gap(wreq, stream, issued_to, 'F');
				stream->collected_to = front->start;
			}

			/* Stall if the front is still undergoing I/O. */
			if (netfs_check_subreq_in_progress(front)) {
				notes |= HIT_PENDING;
				break;
			}
			smp_rmb(); /* Read counters after I-P flag. */

			if (stream->failed) {
				stream->collected_to = front->start + front->len;
				notes |= MADE_PROGRESS | SAW_FAILURE;
				goto cancel;
			}
			if (front->start + front->transferred > stream->collected_to) {
				stream->collected_to = front->start + front->transferred;
				stream->transferred = stream->collected_to - wreq->start;
				stream->transferred_valid = true;
				notes |= MADE_PROGRESS;
			}
			if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
				stream->failed = true;
				stream->error = front->error;
				if (stream->source == NETFS_UPLOAD_TO_SERVER)
					mapping_set_error(wreq->mapping, front->error);
				notes |= NEED_REASSESS | SAW_FAILURE;
				break;
			}
			if (front->transferred < front->len) {
				stream->need_retry = true;
				notes |= NEED_RETRY | MADE_PROGRESS;
				break;
			}

		cancel:
			/* Remove if completely consumed. */
			spin_lock(&wreq->lock);

			remove = front;
			list_del_init(&front->rreq_link);
			front = list_first_entry_or_null(&stream->subrequests,
							 struct netfs_io_subrequest, rreq_link);
			stream->front = front;
			spin_unlock(&wreq->lock);
			netfs_put_subrequest(remove,
					     notes & SAW_FAILURE ?
					     netfs_sreq_trace_put_cancel :
					     netfs_sreq_trace_put_done);
		}

		/* If we have an empty stream, we need to jump it forward
		 * otherwise the collection point will never advance.
		 */
		if (!front && issued_to > stream->collected_to) {
			trace_netfs_collect_gap(wreq, stream, issued_to, 'E');
			stream->collected_to = issued_to;
		}

		if (stream->collected_to < collected_to)
			collected_to = stream->collected_to;
	}

	if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
		wreq->collected_to = collected_to;

	for (s = 0; s < NR_IO_STREAMS; s++) {
		stream = &wreq->io_streams[s];
		if (stream->active)
			trace_netfs_collect_stream(wreq, stream);
	}

	trace_netfs_collect_state(wreq, wreq->collected_to, notes);

	/* Unlock any folios that we have now finished with. */
	if (notes & NEED_UNLOCK) {
		if (wreq->cleaned_to < wreq->collected_to)
			netfs_writeback_unlock_folios(wreq, &notes);
	} else {
		wreq->cleaned_to = wreq->collected_to;
	}

	// TODO: Discard encryption buffers

	if (notes & NEED_RETRY)
		goto need_retry;

	if (notes & MADE_PROGRESS) {
		netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
		//cond_resched();
		goto reassess_streams;
	}

	if (notes & NEED_REASSESS) {
		//cond_resched();
		goto reassess_streams;
	}

out:
	netfs_put_group_many(wreq->group, wreq->nr_group_rel);
	wreq->nr_group_rel = 0;
	_leave(" = %x", notes);
	return;

need_retry:
	/* Okay... We're going to have to retry one or both streams.  Note
	 * that any partially completed op will have had any wholly transferred
	 * folios removed from it.
	 */
	_debug("retry");
	netfs_retry_writes(wreq);
	goto out;
}

/*
 * Perform the collection of subrequests, folios and encryption buffers.
 */
bool netfs_write_collection(struct netfs_io_request *wreq)
{
	struct netfs_inode *ictx = netfs_inode(wreq->inode);
	size_t transferred;
	bool transferred_valid = false;
	int s;

	_enter("R=%x", wreq->debug_id);

	netfs_collect_write_results(wreq);

	/* We're done when the app thread has finished posting subreqs and all
	 * the queues in all the streams are empty.
	 */
	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
		return false;
	smp_rmb(); /* Read ALL_QUEUED before lists. */

	transferred = LONG_MAX;
	for (s = 0; s < NR_IO_STREAMS; s++) {
		struct netfs_io_stream *stream = &wreq->io_streams[s];
		if (!stream->active)
			continue;
		if (!list_empty(&stream->subrequests))
			return false;
		if (stream->transferred_valid &&
		    stream->transferred < transferred) {
			transferred = stream->transferred;
			transferred_valid = true;
		}
	}

	/* Okay, declare that all I/O is complete. */
	if (transferred_valid)
		wreq->transferred = transferred;
	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);

	if (wreq->io_streams[1].active &&
	    wreq->io_streams[1].failed &&
	    ictx->ops->invalidate_cache) {
		/* Cache write failure doesn't prevent writeback completion
		 * unless we're in disconnected mode.
		 */
		ictx->ops->invalidate_cache(wreq);
	}

	_debug("finished");
	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */

	if (wreq->iocb) {
		size_t written = min(wreq->transferred, wreq->len);

		wreq->iocb->ki_pos += written;
		if (wreq->iocb->ki_complete) {
			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
			wreq->iocb->ki_complete(
				wreq->iocb, wreq->error ? wreq->error : written);
		}
		wreq->iocb = VFS_PTR_POISON;
	}

	netfs_clear_subrequests(wreq);
	return true;
}

void netfs_write_collection_worker(struct work_struct *work)
{
	struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);

	netfs_see_request(rreq, netfs_rreq_trace_see_work);
	if (netfs_check_rreq_in_progress(rreq)) {
		if (netfs_write_collection(rreq))
			/* Drop the ref from the IN_PROGRESS flag. */
			netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
		else
			netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
	}
}

/**
 * netfs_write_subrequest_terminated - Note the termination of a write operation.
 * @_op: The I/O request that has terminated.
 * @transferred_or_error: The amount of data transferred or an error code.
 *
 * This tells the library that a contributory write I/O operation has
 * terminated, one way or another, and that it should collect the results.
 *
 * The caller indicates in @transferred_or_error the outcome of the operation,
 * supplying a positive value to indicate the number of bytes transferred or a
 * negative error code.  The library will look after reissuing I/O operations
 * as appropriate and writing downloaded data to the cache.
 *
 * When this is called, ownership of the subrequest is transferred back to the
 * library, along with a ref.
 *
 * Note that %_op is a void* so that the function can be passed to
 * kiocb::term_func without the need for a casting wrapper.
 */
void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
{
	struct netfs_io_subrequest *subreq = _op;
	struct netfs_io_request *wreq = subreq->rreq;

	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);

	switch (subreq->source) {
	case NETFS_UPLOAD_TO_SERVER:
		netfs_stat(&netfs_n_wh_upload_done);
		break;
	case NETFS_WRITE_TO_CACHE:
		netfs_stat(&netfs_n_wh_write_done);
		break;
	default:
		BUG();
	}

	if (IS_ERR_VALUE(transferred_or_error)) {
		subreq->error = transferred_or_error;
		/* if need retry is set, error should not matter */
		if (!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
			set_bit(NETFS_SREQ_FAILED, &subreq->flags);
			trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
		}

		switch (subreq->source) {
		case NETFS_WRITE_TO_CACHE:
			netfs_stat(&netfs_n_wh_write_failed);
			break;
		case NETFS_UPLOAD_TO_SERVER:
			netfs_stat(&netfs_n_wh_upload_failed);
			break;
		default:
			break;
		}
		trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
		set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
	} else {
		if (WARN(transferred_or_error > subreq->len - subreq->transferred,
			 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
			 wreq->debug_id, subreq->debug_index,
			 transferred_or_error, subreq->len, subreq->transferred))
			transferred_or_error = subreq->len - subreq->transferred;

		subreq->error = 0;
		subreq->transferred += transferred_or_error;

		if (subreq->transferred < subreq->len)
			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
	}

	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
	netfs_subreq_clear_in_progress(subreq);
	netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_write_subrequest_terminated);
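
As a usage illustration (not part of the kernel tree): a network filesystem's
write-completion path would report each subrequest's outcome back to netfslib
like this, passing either the byte count or a negative errno as described in
the kernel-doc above. The reply structure and handler name are hypothetical.

/* Sketch of a hypothetical network filesystem's write-reply handler;
 * examplefs_reply and its fields are illustrative only.
 */
static void examplefs_write_reply(struct examplefs_reply *reply)
{
	struct netfs_io_subrequest *subreq = reply->subreq;

	if (reply->error)
		/* Negative errno: netfslib marks the subrequest failed
		 * (unless a retry is already flagged) and pauses issuing.
		 */
		netfs_write_subrequest_terminated(subreq, reply->error);
	else
		/* Byte count: a short transfer (< subreq->len) makes
		 * netfslib set NETFS_SREQ_NEED_RETRY, as seen above.
		 */
		netfs_write_subrequest_terminated(subreq, reply->bytes_written);
}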