From 6df164e29bd4e6505c5a2e0e5f1e1f6957a16a42 Mon Sep 17 00:00:00 2001 From: Lei Lu Date: Mon, 11 Aug 2025 21:58:48 +0800 Subject: [PATCH 01/42] sunrpc: fix null pointer dereference on zero-length checksum In xdr_stream_decode_opaque_auth(), zero-length checksum.len causes checksum.data to be set to NULL. This triggers a NPD when accessing checksum.data in gss_krb5_verify_mic_v2(). This patch ensures that the value of checksum.len is not less than XDR_UNIT. Fixes: 0653028e8f1c ("SUNRPC: Convert gss_verify_header() to use xdr_stream") Cc: stable@kernel.org Signed-off-by: Lei Lu Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e82212f6b562..a8ec30759a18 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -724,7 +724,7 @@ svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } - if (flavor != RPC_AUTH_GSS) { + if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } From c926f0298d3cdd25f1bfa019f5b74ed48796cef7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 2 Jul 2025 19:33:44 -0400 Subject: [PATCH 02/42] NFSD: Relocate the fh_want_write() and fh_drop_write() helpers Clean up: these helpers are part of the NFSD file handle API. Relocate them to fs/nfsd/nfsfh.h. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.h | 37 +++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.h | 20 -------------------- 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 1cf979722521..6f5255d1c190 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -14,6 +14,8 @@ #include #include +#include "export.h" + /* * The file handle starts with a sequence of four-byte words. 
* The first word contains a version number (1) and three descriptor bytes @@ -271,6 +273,41 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, return true; } +/** + * fh_want_write - Get write access to an export + * @fhp: File handle of file to be written + * + * Caller must invoke fh_drop_write() when its write operation + * is complete. + * + * Returns 0 if the file handle's export can be written to. Otherwise + * the export is not prepared for updates, and the returned negative + * errno value reflects the reason for the failure. + */ +static inline int fh_want_write(struct svc_fh *fhp) +{ + int ret; + + if (fhp->fh_want_write) + return 0; + ret = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (!ret) + fhp->fh_want_write = true; + return ret; +} + +/** + * fh_drop_write - Release write access on an export + * @fhp: File handle of file on which fh_want_write() was previously called + */ +static inline void fh_drop_write(struct svc_fh *fhp) +{ + if (fhp->fh_want_write) { + fhp->fh_want_write = false; + mnt_drop_write(fhp->fh_export->ex_path.mnt); + } +} + /** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..4007dcbbbfef 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -160,26 +160,6 @@ __be32 nfsd_permission(struct svc_cred *cred, struct svc_export *exp, void nfsd_filp_close(struct file *fp); -static inline int fh_want_write(struct svc_fh *fh) -{ - int ret; - - if (fh->fh_want_write) - return 0; - ret = mnt_want_write(fh->fh_export->ex_path.mnt); - if (!ret) - fh->fh_want_write = true; - return ret; -} - -static inline void fh_drop_write(struct svc_fh *fh) -{ - if (fh->fh_want_write) { - fh->fh_want_write = false; - mnt_drop_write(fh->fh_export->ex_path.mnt); - } -} - static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) { u32 request_mask = STATX_BASIC_STATS; From c1f203e46c55ac063791bb893e30e5d14cabe1f6 Mon Sep 17 
00:00:00 2001 From: Chuck Lever Date: Wed, 2 Jul 2025 19:33:45 -0400 Subject: [PATCH 03/42] NFSD: Move the fh_getattr() helper Clean up: The fh_getattr() function is part of NFSD's file handle API, so relocate it. I've made it an un-inlined function so that trace points and new functionality can easily be introduced. That increases the size of nfsd.ko by about a page on my x86_64 system (out of 26MB; compiled with -O2). Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 23 +++++++++++++++++++++++ fs/nfsd/nfsfh.h | 1 + fs/nfsd/vfs.h | 13 ------------- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 74cf1f4de174..f4c2fb3dd5d0 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -662,6 +662,29 @@ out_negative: return nfserr_serverfault; } +/** + * fh_getattr - Retrieve attributes on a local file + * @fhp: File handle of target file + * @stat: Caller-supplied kstat buffer to be filled in + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. 
+ */ +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat) +{ + struct path p = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + u32 request_mask = STATX_BASIC_STATS; + + if (fhp->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, + AT_STATX_SYNC_AS_STAT)); +} + /** * fh_fill_pre_attrs - Fill in pre-op attributes * @fhp: file handle to be updated diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 6f5255d1c190..5ef7191f8ad8 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -222,6 +222,7 @@ extern char * SVCFH_fmt(struct svc_fh *fhp); __be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); __be32 fh_verify_local(struct net *, struct svc_cred *, struct auth_domain *, struct svc_fh *, umode_t, int); +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat); __be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 4007dcbbbfef..0c0292611c6d 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -160,17 +160,4 @@ __be32 nfsd_permission(struct svc_cred *cred, struct svc_export *exp, void nfsd_filp_close(struct file *fp); -static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) -{ - u32 request_mask = STATX_BASIC_STATS; - struct path p = {.mnt = fh->fh_export->ex_path.mnt, - .dentry = fh->fh_dentry}; - - if (fh->fh_maxsize == NFS4_FHSIZE) - request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); - - return nfserrno(vfs_getattr(&p, stat, request_mask, - AT_STATX_SYNC_AS_STAT)); -} - #endif /* LINUX_NFSD_VFS_H */ From d9adbb6e10bf7d4223d3d521ede1b2052903bc5e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 8 Jul 2025 14:14:53 -0400 Subject: [PATCH 04/42] sunrpc: delay pc_release callback until after the reply is sent The server-side sunrpc code currently calls 
pc_release before sending the reply. Change svc_process and svc_process_bc to call pc_release after sending the reply instead. Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..fc70e13b1cb9 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1426,8 +1426,6 @@ svc_process_common(struct svc_rqst *rqstp) /* Call the function that processes the request. */ rc = process.dispatch(rqstp); - if (procp->pc_release) - procp->pc_release(rqstp); xdr_finish_decode(xdr); if (!rc) @@ -1526,6 +1524,14 @@ static void svc_drop(struct svc_rqst *rqstp) trace_svc_drop(rqstp); } +static void svc_release_rqst(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + if (procp && procp->pc_release) + procp->pc_release(rqstp); +} + /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context @@ -1565,9 +1571,12 @@ void svc_process(struct svc_rqst *rqstp) if (unlikely(*p != rpc_call)) goto out_baddir; - if (!svc_process_common(rqstp)) + if (!svc_process_common(rqstp)) { + svc_release_rqst(rqstp); goto out_drop; + } svc_send(rqstp); + svc_release_rqst(rqstp); return; out_baddir: @@ -1635,6 +1644,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); + svc_release_rqst(rqstp); return; } /* Finally, send the reply synchronously */ @@ -1648,6 +1658,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); + svc_release_rqst(rqstp); if (IS_ERR(task)) return; From 2ee3a75e42081db3d951c0893f5d654f16d1c0e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 18 Jul 2025 11:26:15 +1000 Subject: [PATCH 
05/42] nfsd: discard nfsd_file_get_local() This interface was deprecated by commit e6f7e1487ab5 ("nfs_localio: simplify interface to nfsd for getting nfsd_file") and is now unused. So let's remove it. Signed-off-by: NeilBrown Reviewed-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 21 --------------------- fs/nfsd/filecache.h | 1 - fs/nfsd/localio.c | 1 - include/linux/nfslocalio.h | 1 - 4 files changed, 24 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 732abf6b92a5..75bc48031c07 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -391,27 +391,6 @@ nfsd_file_put_local(struct nfsd_file __rcu **pnf) return net; } -/** - * nfsd_file_get_local - get nfsd_file reference and reference to net - * @nf: nfsd_file of which to put the reference - * - * Get reference to both the nfsd_file and nf->nf_net. - */ -struct nfsd_file * -nfsd_file_get_local(struct nfsd_file *nf) -{ - struct net *net = nf->nf_net; - - if (nfsd_net_try_get(net)) { - nf = nfsd_file_get(nf); - if (!nf) - nfsd_net_put(net); - } else { - nf = NULL; - } - return nf; -} - /** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. 
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..24ddf60e8434 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -63,7 +63,6 @@ int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); -struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..269fa9391dc4 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -122,7 +122,6 @@ static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_put = nfsd_net_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, }; diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 5c7c92659e73..59ea90bd136b 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -63,7 +63,6 @@ struct nfsd_localio_operations { struct nfsd_file __rcu **pnf, const fmode_t); struct net *(*nfsd_file_put_local)(struct nfsd_file __rcu **); - struct nfsd_file *(*nfsd_file_get_local)(struct nfsd_file *); struct file *(*nfsd_file_file)(struct nfsd_file *); } ____cacheline_aligned; From c97b737ef8f10f28424822c139e3b22b9e9bcc2b Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Fri, 18 Jul 2025 11:09:56 +0300 Subject: [PATCH 06/42] sunrpc: Change ret code of xdr_stream_decode_opaque_fixed Since the opaque is fixed in size, the caller already knows how many bytes were decoded, on success. Thus, xdr_stream_decode_opaque_fixed() doesn't need to return that value. And, xdr_stream_decode_u32 and _u64 both return zero on success. 
This patch simplifies the caller's error checking to avoid potential integer promotion issues. Suggested-by: Dan Carpenter Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 4 ++-- .../xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8a9ec617cf66..8d354015d762 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -721,7 +721,7 @@ xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr) * @len: size of buffer pointed to by @ptr * * Return values: - * On success, returns size of object stored in @ptr + * %0 on success * %-EBADMSG on XDR buffer overflow */ static inline ssize_t @@ -732,7 +732,7 @@ xdr_stream_decode_opaque_fixed(struct xdr_stream *xdr, void *ptr, size_t len) if (unlikely(!p)) return -EBADMSG; xdr_decode_opaque_fixed(p, ptr, len); - return len; + return 0; } /** diff --git a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 index 8b4ff08c49e5..bdc7bd24ffb1 100644 --- a/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 +++ b/tools/net/sunrpc/xdrgen/templates/C/typedef/decoder/fixed_length_opaque.j2 @@ -13,5 +13,5 @@ xdrgen_decode_{{ name }}(struct xdr_stream *xdr, {{ classifier }}{{ name }} *ptr {% if annotate %} /* (fixed-length opaque) */ {% endif %} - return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) >= 0; + return xdr_stream_decode_opaque_fixed(xdr, ptr, {{ size }}) == 0; }; From 832738e4b325b742940761e10487403f9aad13e8 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Mon, 21 Jul 2025 17:48:55 +0300 Subject: [PATCH 07/42] NFSD: Rework encoding and decoding of nfsd4_deviceid Compilers may optimize the layout of C structures, so we should not rely on sizeof struct and memcpy to encode and decode XDR structures. 
The byte order of the fields should also be taken into account. This patch adds the correct functions to handle the deviceid4 structure and removes the pad field, which is currently not used by NFSD, from the runtime state. The server's byte order is preserved because the deviceid4 blob on the wire is only used as a cookie by the client. Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- fs/nfsd/blocklayoutxdr.c | 7 ++----- fs/nfsd/flexfilelayoutxdr.c | 3 +-- fs/nfsd/nfs4layouts.c | 1 - fs/nfsd/nfs4xdr.c | 14 +------------- fs/nfsd/xdr4.h | 36 +++++++++++++++++++++++++++++++++++- 5 files changed, 39 insertions(+), 22 deletions(-) diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index bcf21fde9120..18de37ff2891 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -29,8 +29,7 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr, *p++ = cpu_to_be32(len); *p++ = cpu_to_be32(1); /* we always return a single extent */ - p = xdr_encode_opaque_fixed(p, &b->vol_id, - sizeof(struct nfsd4_deviceid)); + p = svcxdr_encode_deviceid4(p, &b->vol_id); p = xdr_encode_hyper(p, b->foff); p = xdr_encode_hyper(p, b->len); p = xdr_encode_hyper(p, b->soff); @@ -156,9 +155,7 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); - + p = svcxdr_decode_deviceid4(p, &bex.vol_id); p = xdr_decode_hyper(p, &bex.foff); if (bex.foff & (block_size - 1)) { goto fail; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index aeb71c10ff1b..f9f7e38cba13 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -54,8 +54,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, *p++ = cpu_to_be32(1); /* single mirror */ *p++ = cpu_to_be32(1); /* single data server */ - p = xdr_encode_opaque_fixed(p, &fl->deviceid, - sizeof(struct 
nfsd4_deviceid)); + p = svcxdr_encode_deviceid4(p, &fl->deviceid); *p++ = cpu_to_be32(1); /* efficiency */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index aea905fcaf87..683bd1130afe 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -120,7 +120,6 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, id->fsid_idx = fhp->fh_export->ex_devid_map->idx; id->generation = device_generation; - id->pad = 0; return 0; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ea91bad4eee2..2acc9abee668 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -587,18 +587,6 @@ nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp, } #ifdef CONFIG_NFSD_PNFS -static __be32 -nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp, - struct nfsd4_deviceid *devid) -{ - __be32 *p; - - p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE); - if (!p) - return nfserr_bad_xdr; - memcpy(devid, p, sizeof(*devid)); - return nfs_ok; -} static __be32 nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, @@ -1783,7 +1771,7 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, __be32 status; memset(gdev, 0, sizeof(*gdev)); - status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); + status = nfsd4_decode_deviceid4(argp->xdr, &gdev->gd_devid); if (status) return status; if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a23bc56051ca..e65b552bf5f5 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -595,9 +595,43 @@ struct nfsd4_reclaim_complete { struct nfsd4_deviceid { u64 fsid_idx; u32 generation; - u32 pad; }; +static inline __be32 * +svcxdr_encode_deviceid4(__be32 *p, const struct nfsd4_deviceid *devid) +{ + __be64 *q = (__be64 *)p; + + *q = (__force __be64)devid->fsid_idx; + p += 2; + *p++ = (__force __be32)devid->generation; + *p++ = xdr_zero; + return p; +} + +static inline __be32 * +svcxdr_decode_deviceid4(__be32 *p, struct nfsd4_deviceid *devid) +{ + 
__be64 *q = (__be64 *)p; + + devid->fsid_idx = (__force u64)(*q); + p += 2; + devid->generation = (__force u32)(*p++); + p++; /* NFSD does not use the remaining octets */ + return p; +} + +static inline __be32 +nfsd4_decode_deviceid4(struct xdr_stream *xdr, struct nfsd4_deviceid *devid) +{ + __be32 *p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + + if (unlikely(!p)) + return nfserr_bad_xdr; + svcxdr_decode_deviceid4(p, devid); + return nfs_ok; +} + struct nfsd4_layout_seg { u32 iomode; u64 offset; From 274365a51d88658fb51cca637ba579034e90a799 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Tue, 15 Jul 2025 18:32:18 +0300 Subject: [PATCH 08/42] NFSD: Minor cleanup in layoutcommit processing Remove dprintk in nfsd4_layoutcommit. These are not needed in day to day usage, and the information is also available in Wireshark when capturing NFS traffic. Reviewed-by: Christoph Hellwig Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 71b428efcbb5..04b8856d0615 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2492,18 +2492,12 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, inode = d_inode(current_fh->fh_dentry); nfserr = nfserr_inval; - if (new_size <= seg->offset) { - dprintk("pnfsd: last write before layout segment\n"); + if (new_size <= seg->offset) goto out; - } - if (new_size > seg->offset + seg->length) { - dprintk("pnfsd: last write beyond layout segment\n"); + if (new_size > seg->offset + seg->length) goto out; - } - if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { - dprintk("pnfsd: layoutcommit beyond EOF\n"); + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) goto out; - } nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, false, lcp->lc_layout_type, From 6bf1be3399e2635805074954011cb55745569788 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Tue, 15 
Jul 2025 18:32:20 +0300 Subject: [PATCH 09/42] NFSD: Minor cleanup in layoutcommit decoding Use the appropriate xdr function to decode the lc_newoffset field, which is a boolean value. See RFC 8881, section 18.42.1. Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2acc9abee668..cbbb61fcdd49 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1802,7 +1802,7 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, status = nfsd4_decode_stateid4(argp, &lcp->lc_sid); if (status) return status; - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0) + if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_newoffset) < 0) return nfserr_bad_xdr; if (lcp->lc_newoffset) { if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0) From f963cf2b91a30b5614c514f3ad53ca124cb65280 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Mon, 21 Jul 2025 21:40:55 +0300 Subject: [PATCH 10/42] NFSD: Implement large extent array support in pNFS When pNFS client in the block or scsi layout mode sends layoutcommit to MDS, a variable length array of modified extents is supplied within the request. This patch allows the server to accept such extent arrays if they do not fit within single memory page. The issue can be reproduced when writing to a 1GB file using FIO with O_DIRECT, 4K block and large I/O depth without preallocation of the file. In this case, the server returns NFSERR_BADXDR to the client. 
Co-developed-by: Konstantin Evtushenko Signed-off-by: Konstantin Evtushenko Signed-off-by: Sergey Bashirov Reviewed-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 20 ++++++---- fs/nfsd/blocklayoutxdr.c | 83 +++++++++++++++++++++++++++------------- fs/nfsd/blocklayoutxdr.h | 4 +- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4xdr.c | 11 +++--- fs/nfsd/pnfs.h | 1 + fs/nfsd/xdr4.h | 3 +- 7 files changed, 78 insertions(+), 46 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 19078a043e85..4c936132eb44 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -173,16 +173,18 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, } static __be32 -nfsd4_block_proc_layoutcommit(struct inode *inode, +nfsd4_block_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; __be32 nfserr; - nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, &nr_iomaps, - i_blocksize(inode)); + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); + + nfserr = nfsd4_block_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); if (nfserr != nfs_ok) return nfserr; @@ -313,16 +315,18 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } static __be32 -nfsd4_scsi_proc_layoutcommit(struct inode *inode, +nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; __be32 nfserr; - nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, &nr_iomaps, - i_blocksize(inode)); + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); + + nfserr = nfsd4_scsi_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); if (nfserr != nfs_ok) return nfserr; diff 
--git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 18de37ff2891..e50afe340737 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -113,8 +113,7 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, /** * nfsd4_block_decode_layoutupdate - decode the block layout extent array - * @p: pointer to the xdr data - * @len: number of bytes to decode + * @xdr: subbuf set to the encoded array * @iomapp: pointer to store the decoded extent array * @nr_iomapsp: pointer to store the number of extents * @block_size: alignment of extent offset and length @@ -127,25 +126,24 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, * * Return values: * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid - * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid * %nfserr_inval: An unaligned extent found * %nfserr_delay: Failed to allocate memory for @iomapp */ __be32 -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, +nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, i; + u32 nr_iomaps, expected, len, i; + __be32 nfserr; - if (len < sizeof(u32)) - return nfserr_bad_xdr; - len -= sizeof(u32); - if (len % PNFS_BLOCK_EXTENT_SIZE) + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) + len = sizeof(__be32) + xdr_stream_remaining(xdr); + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; + if (len != expected) return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); @@ -155,21 +153,44 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; - p = svcxdr_decode_deviceid4(p, &bex.vol_id); - p = xdr_decode_hyper(p, &bex.foff); + if 
(nfsd4_decode_deviceid4(xdr, &bex.vol_id)) { + nfserr = nfserr_bad_xdr; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.foff)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (bex.foff & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.len)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.len); if (bex.len & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.soff)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.soff); if (bex.soff & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u32(xdr, &bex.es)) { + nfserr = nfserr_bad_xdr; goto fail; } - bex.es = be32_to_cpup(p++); if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + nfserr = nfserr_inval; goto fail; } @@ -182,13 +203,12 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, return nfs_ok; fail: kfree(iomaps); - return nfserr_inval; + return nfserr; } /** * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array - * @p: pointer to the xdr data - * @len: number of bytes to decode + * @xdr: subbuf set to the encoded array * @iomapp: pointer to store the decoded extent array * @nr_iomapsp: pointer to store the number of extents * @block_size: alignment of extent offset and length @@ -200,21 +220,22 @@ fail: * * Return values: * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid - * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid * %nfserr_inval: An unaligned extent found * %nfserr_delay: Failed to allocate memory for @iomapp */ __be32 -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, +nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, expected, i; + u32 nr_iomaps, expected, len, i; + 
__be32 nfserr; - if (len < sizeof(u32)) + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); + len = sizeof(__be32) + xdr_stream_remaining(xdr); expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; if (len != expected) return nfserr_bad_xdr; @@ -226,14 +247,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { u64 val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { + nfserr = nfserr_inval; goto fail; } iomaps[i].offset = val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { + nfserr = nfserr_inval; goto fail; } iomaps[i].length = val; @@ -244,5 +273,5 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, return nfs_ok; fail: kfree(iomaps); - return nfserr_inval; + return nfserr; } diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 15b3569f3d9a..7d25ef689671 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp); -__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, +__be32 nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size); -__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, +__be32 nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 04b8856d0615..656b2e7d8840 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2520,7 +2520,7 @@ 
nfsd4_layoutcommit(struct svc_rqst *rqstp, lcp->lc_size_chg = false; } - nfserr = ops->proc_layoutcommit(inode, lcp); + nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); nfs4_put_stid(&ls->ls_stid); out: return nfserr; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index cbbb61fcdd49..8b68f74a8cf0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -592,6 +592,8 @@ static __be32 nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, struct nfsd4_layoutcommit *lcp) { + u32 len; + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) return nfserr_bad_xdr; if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) @@ -599,13 +601,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) return nfserr_bad_xdr; - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) return nfserr_bad_xdr; - if (lcp->lc_up_len > 0) { - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); - if (!lcp->lc_up_layout) - return nfserr_bad_xdr; - } return nfs_ok; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 925817f66917..dfd411d1f363 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -35,6 +35,7 @@ struct nfsd4_layout_ops { const struct nfsd4_layoutget *lgp); __be32 (*proc_layoutcommit)(struct inode *inode, + struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); void (*fence_client)(struct nfs4_layout_stateid *ls, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index e65b552bf5f5..d4b48602b2b0 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -664,8 +664,7 @@ struct nfsd4_layoutcommit { u64 lc_last_wr; /* request */ struct timespec64 lc_mtime; /* request */ u32 lc_layout_type; /* request */ - u32 lc_up_len; /* layout length */ - void *lc_up_layout; /* decoded by callback */ + struct xdr_buf lc_up_layout; /* decoded by callback */ bool lc_size_chg; /* response 
*/ u64 lc_newsize; /* response */ }; From d68886bae76a4b9b3484d23e5b7df086f940fa38 Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Mon, 21 Jul 2025 21:40:56 +0300 Subject: [PATCH 11/42] NFSD: Fix last write offset handling in layoutcommit The data type of loca_last_write_offset is newoffset4 and is switched on a boolean value, no_newoffset, that indicates if a previous write occurred or not. If no_newoffset is FALSE, an offset is not given. This means that client does not try to update the file size. Thus, server should not try to calculate new file size and check if it fits into the segment range. See RFC 8881, section 12.5.4.2. Sometimes the current incorrect logic may cause clients to hang when trying to sync an inode. If layoutcommit fails, the client marks the inode as dirty again. Fixes: 9cf514ccfacb ("nfsd: implement pNFS operations") Cc: stable@vger.kernel.org Co-developed-by: Konstantin Evtushenko Signed-off-by: Konstantin Evtushenko Signed-off-by: Sergey Bashirov Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 5 ++--- fs/nfsd/nfs4proc.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 4c936132eb44..0822d8a119c6 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -118,7 +118,6 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, struct iomap *iomaps, int nr_iomaps) { struct timespec64 mtime = inode_get_mtime(inode); - loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; int error; @@ -128,9 +127,9 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; - if (new_size > i_size_read(inode)) { + if (lcp->lc_size_chg) { iattr.ia_valid |= ATTR_SIZE; - iattr.ia_size = new_size; + 
iattr.ia_size = lcp->lc_newsize; } error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 656b2e7d8840..7043fc475458 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2475,7 +2475,6 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, const struct nfsd4_layout_seg *seg = &lcp->lc_seg; struct svc_fh *current_fh = &cstate->current_fh; const struct nfsd4_layout_ops *ops; - loff_t new_size = lcp->lc_last_wr + 1; struct inode *inode; struct nfs4_layout_stateid *ls; __be32 nfserr; @@ -2491,13 +2490,21 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; inode = d_inode(current_fh->fh_dentry); - nfserr = nfserr_inval; - if (new_size <= seg->offset) - goto out; - if (new_size > seg->offset + seg->length) - goto out; - if (!lcp->lc_newoffset && new_size > i_size_read(inode)) - goto out; + lcp->lc_size_chg = false; + if (lcp->lc_newoffset) { + loff_t new_size = lcp->lc_last_wr + 1; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) + goto out; + if (new_size > seg->offset + seg->length) + goto out; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = true; + lcp->lc_newsize = new_size; + } + } nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, false, lcp->lc_layout_type, @@ -2513,13 +2520,6 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, /* LAYOUTCOMMIT does not require any serialization */ mutex_unlock(&ls->ls_mutex); - if (new_size > i_size_read(inode)) { - lcp->lc_size_chg = true; - lcp->lc_newsize = new_size; - } else { - lcp->lc_size_chg = false; - } - nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); nfs4_put_stid(&ls->ls_stid); out: From 2990b5a47984c27873d165de9e88099deee95c8d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:30 -0400 Subject: [PATCH 12/42] nfsd: fix assignment of ia_ctime.tv_nsec on delegated mtime update The ia_ctime.tv_nsec field should be set to modify.nseconds. 
Fixes: 7e13f4f8d27d ("nfsd: handle delegated timestamps in SETATTR") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8b68f74a8cf0..52033e2d603e 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -538,7 +538,7 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, iattr->ia_mtime.tv_sec = modify.seconds; iattr->ia_mtime.tv_nsec = modify.nseconds; iattr->ia_ctime.tv_sec = modify.seconds; - iattr->ia_ctime.tv_nsec = modify.seconds; + iattr->ia_ctime.tv_nsec = modify.nseconds; iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; } From 5affb498e70bba3053b835c478a199bf92c99c4d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:31 -0400 Subject: [PATCH 13/42] nfsd: ignore ATTR_DELEG when checking ia_valid before notify_change() If the only flag left is ATTR_DELEG, then there are no changes to be made. Fixes: 7e13f4f8d27d ("nfsd: handle delegated timestamps in SETATTR") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index edf050766e57..3cd3b9e069f4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -467,7 +467,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) return 0; } - if (!iap->ia_valid) + if ((iap->ia_valid & ~ATTR_DELEG) == 0) return 0; /* From afc5b36e29b95fbd31a60b9630d148857e5e513d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:32 -0400 Subject: [PATCH 14/42] vfs: add ATTR_CTIME_SET flag When ATTR_ATIME_SET and ATTR_MTIME_SET are set in the ia_valid mask, the notify_change() logic takes that to mean that the request should set those values explicitly, and not override them with "now". With the advent of delegated timestamps, similar functionality is needed for the ctime. 
Add a ATTR_CTIME_SET flag, and use that to indicate that the ctime should be accepted as-is. Also, clean up the if statements to eliminate the extra negatives. In setattr_copy() and setattr_copy_mgtime() use inode_set_ctime_deleg() when ATTR_CTIME_SET is set, instead of basing the decision on ATTR_DELEG. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 44 +++++++++++++++++++------------------------- include/linux/fs.h | 1 + 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/fs/attr.c b/fs/attr.c index 5425c1dbbff9..795f231d00e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -286,20 +286,12 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct timespec64 now; - if (ia_valid & ATTR_CTIME) { - /* - * In the case of an update for a write delegation, we must respect - * the value in ia_ctime and not use the current time. - */ - if (ia_valid & ATTR_DELEG) - now = inode_set_ctime_deleg(inode, attr->ia_ctime); - else - now = inode_set_ctime_current(inode); - } else { - /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. 
*/ - WARN_ON_ONCE(ia_valid & ATTR_MTIME); + if (ia_valid & ATTR_CTIME_SET) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + now = inode_set_ctime_current(inode); + else now = current_time(inode); - } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); @@ -359,12 +351,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) { - if (ia_valid & ATTR_DELEG) - inode_set_ctime_deleg(inode, attr->ia_ctime); - else - inode_set_ctime_to_ts(inode, attr->ia_ctime); - } + + if (ia_valid & ATTR_CTIME_SET) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); @@ -463,15 +454,18 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, now = current_time(inode); - attr->ia_ctime = now; - if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime = now; - else + if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); - if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime = now; else + attr->ia_atime = now; + if (ia_valid & ATTR_CTIME_SET) + attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); + else + attr->ia_ctime = now; + if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + else + attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..74f2bfc51926 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -238,6 +238,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_CTIME_SET (1 << 10) #define 
ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) From c066ff58e5d6e5d7400e5fda0c33f95b8c37dd02 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:33 -0400 Subject: [PATCH 15/42] nfsd: use ATTR_CTIME_SET for delegated ctime updates Ensure that notify_change() doesn't clobber a delegated ctime update with current_time() by setting ATTR_CTIME_SET for those updates. Don't bother setting the timestamps in cb_getattr_update_times() in the non-delegated case. notify_change() will do that itself. Fixes: 7e13f4f8d27d ("nfsd: handle delegated timestamps in SETATTR") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 6 +++--- fs/nfsd/nfs4xdr.c | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 88c347957da5..77eea2ad93cc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -9167,7 +9167,6 @@ static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation *dp) { struct inode *inode = d_inode(dentry); - struct timespec64 now = current_time(inode); struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; struct iattr attrs = { }; int ret; @@ -9175,6 +9174,7 @@ static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation if (deleg_attrs_deleg(dp->dl_type)) { struct timespec64 atime = inode_get_atime(inode); struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now = current_time(inode); attrs.ia_atime = ncf->ncf_cb_atime; attrs.ia_mtime = ncf->ncf_cb_mtime; @@ -9183,12 +9183,12 @@ static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; if (set_cb_time(&attrs.ia_mtime, &mtime, &now)) { - attrs.ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET; + attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET; 
attrs.ia_ctime = attrs.ia_mtime; } } else { attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; - attrs.ia_mtime = attrs.ia_ctime = now; } if (!attrs.ia_valid) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 52033e2d603e..c0a3c6a7c8bb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -539,7 +539,8 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, iattr->ia_mtime.tv_nsec = modify.nseconds; iattr->ia_ctime.tv_sec = modify.seconds; iattr->ia_ctime.tv_nsec = modify.nseconds; - iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; + iattr->ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; } /* request sanity: did attrlist4 contain the expected number of words? */ From 7663e963a51122792811811c8119fd55c9ab254a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:34 -0400 Subject: [PATCH 16/42] nfsd: track original timestamps in nfs4_delegation As Trond points out [1], the "original time" mentioned in RFC 9754 refers to the timestamps on the files at the time that the delegation was granted, and not the current timestamp of the file on the server. Store the current timestamps for the file in the nfs4_delegation when granting one. Add STATX_ATIME and STATX_MTIME to the request mask in nfs4_delegation_stat(). When granting OPEN_DELEGATE_READ_ATTRS_DELEG, do a nfs4_delegation_stat() and save the correct atime. If the stat() fails for any reason, fall back to granting a normal read deleg. 
[1]: https://lore.kernel.org/linux-nfs/47a4e40310e797f21b5137e847b06bb203d99e66.camel@kernel.org/ Fixes: 7e13f4f8d27d ("nfsd: handle delegated timestamps in SETATTR") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 11 ++++++++--- fs/nfsd/state.h | 5 +++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 77eea2ad93cc..8737b721daf3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6157,7 +6157,8 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, path.dentry = file_dentry(nf->nf_file); rc = vfs_getattr(&path, stat, - (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + STATX_MODE | STATX_SIZE | STATX_ATIME | + STATX_MTIME | STATX_CTIME | STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); nfsd_file_put(nf); @@ -6274,10 +6275,14 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); + dp->dl_atime = stat.atime; + dp->dl_ctime = stat.ctime; + dp->dl_mtime = stat.mtime; trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { - open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : - OPEN_DELEGATE_READ; + open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? 
+ OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; + dp->dl_atime = stat.atime; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8adc2550129e..ce7c0d129ba3 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -224,6 +224,11 @@ struct nfs4_delegation { /* for CB_GETATTR */ struct nfs4_cb_fattr dl_cb_fattr; + + /* For delegated timestamps */ + struct timespec64 dl_atime; + struct timespec64 dl_mtime; + struct timespec64 dl_ctime; }; static inline bool deleg_is_read(u32 dl_type) From 3952f1cbcbc454b2cb639ddbf165c07068e90371 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:35 -0400 Subject: [PATCH 17/42] nfsd: fix SETATTR updates for delegated timestamps SETATTRs containing delegated timestamp updates are currently not being vetted properly. Since we no longer need to compare the timestamps vs. the current timestamps, move the vetting of delegated timestamps wholly into nfsd. Rename the set_cb_time() helper to nfsd4_vet_deleg_time(), and make it non-static. Add a new vet_deleg_attrs() helper that is called from nfsd4_setattr that uses nfsd4_vet_deleg_time() to properly validate all the timestamps. If the validation indicates that the update should be skipped, unset the appropriate flags in ia_valid. Fixes: 7e13f4f8d27d ("nfsd: handle delegated timestamps in SETATTR") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 31 ++++++++++++++++++++++++++++++- fs/nfsd/nfs4state.c | 24 +++++++++++------------- fs/nfsd/state.h | 3 +++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 7043fc475458..aacd912a5fbe 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1133,6 +1133,33 @@ nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) exp_put(u->secinfo_no_name.sin_exp); } +/* + * Validate that the requested timestamps are within the acceptable range.
If + * timestamp appears to be in the future, then it will be clamped to + * current_time(). + */ +static void +vet_deleg_attrs(struct nfsd4_setattr *setattr, struct nfs4_delegation *dp) +{ + struct timespec64 now = current_time(dp->dl_stid.sc_file->fi_inode); + struct iattr *iattr = &setattr->sa_iattr; + + if ((setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) && + !nfsd4_vet_deleg_time(&iattr->ia_atime, &dp->dl_atime, &now)) + iattr->ia_valid &= ~(ATTR_ATIME | ATTR_ATIME_SET); + + if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + if (nfsd4_vet_deleg_time(&iattr->ia_mtime, &dp->dl_mtime, &now)) { + iattr->ia_ctime = iattr->ia_mtime; + if (!nfsd4_vet_deleg_time(&iattr->ia_ctime, &dp->dl_ctime, &now)) + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET); + } else { + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET); + } + } +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1170,8 +1197,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_delegation *dp = delegstateid(st); /* Only for *_ATTRS_DELEG flavors */ - if (deleg_attrs_deleg(dp->dl_type)) + if (deleg_attrs_deleg(dp->dl_type)) { + vet_deleg_attrs(setattr, dp); status = nfs_ok; + } } } if (st) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8737b721daf3..f2fd0cbe256b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -9135,25 +9135,25 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, } /** - * set_cb_time - vet and set the timespec for a cb_getattr update - * @cb: timestamp from the CB_GETATTR response + * nfsd4_vet_deleg_time - vet and set the timespec for a delegated timestamp update + * @req: timestamp from the client * @orig: original timestamp in the inode * @now: current time * - * Given a timestamp in a CB_GETATTR response, check it against the + * Given a timestamp from the client response, check it against the 
* current timestamp in the inode and the current time. Returns true * if the inode's timestamp needs to be updated, and false otherwise. - * @cb may also be changed if the timestamp needs to be clamped. + * @req may also be changed if the timestamp needs to be clamped. */ -static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, - const struct timespec64 *now) +bool nfsd4_vet_deleg_time(struct timespec64 *req, const struct timespec64 *orig, + const struct timespec64 *now) { /* * "When the time presented is before the original time, then the * update is ignored." Also no need to update if there is no change. */ - if (timespec64_compare(cb, orig) <= 0) + if (timespec64_compare(req, orig) <= 0) return false; /* @@ -9161,10 +9161,8 @@ static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, * clamp the new time to the current time, or it may * return NFS4ERR_DELAY to the client, allowing it to retry." */ - if (timespec64_compare(cb, now) > 0) { - /* clamp it */ - *cb = *now; - } + if (timespec64_compare(req, now) > 0) + *req = *now; return true; } @@ -9184,10 +9182,10 @@ static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation attrs.ia_atime = ncf->ncf_cb_atime; attrs.ia_mtime = ncf->ncf_cb_mtime; - if (set_cb_time(&attrs.ia_atime, &atime, &now)) + if (nfsd4_vet_deleg_time(&attrs.ia_atime, &atime, &now)) attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - if (set_cb_time(&attrs.ia_mtime, &mtime, &now)) { + if (nfsd4_vet_deleg_time(&attrs.ia_mtime, &mtime, &now)) { attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | ATTR_MTIME | ATTR_MTIME_SET; attrs.ia_ctime = attrs.ia_mtime; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ce7c0d129ba3..bf9436cdb93c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -247,6 +247,9 @@ static inline bool deleg_attrs_deleg(u32 dl_type) dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG; } +bool nfsd4_vet_deleg_time(struct timespec64 *cb, const struct timespec64 *orig, + const 
struct timespec64 *now); + #define cb_to_delegation(cb) \ container_of(cb, struct nfs4_delegation, dl_recall) From b40b1ba37ad5b6099c426765c4bc327c08b390b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:36 -0400 Subject: [PATCH 18/42] nfsd: fix timestamp updates in CB_GETATTR When updating the local timestamps from CB_GETATTR, the updated values are not being properly vetted. Compare the update times vs. the saved times in the delegation rather than the current times in the inode. Also, ensure that the ctime is properly vetted vs. its original value. Fixes: 6ae30d6eb26b ("nfsd: add support for delegated timestamps") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f2fd0cbe256b..205ee8cc6fa2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -9175,20 +9175,19 @@ static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation int ret; if (deleg_attrs_deleg(dp->dl_type)) { - struct timespec64 atime = inode_get_atime(inode); - struct timespec64 mtime = inode_get_mtime(inode); struct timespec64 now = current_time(inode); attrs.ia_atime = ncf->ncf_cb_atime; attrs.ia_mtime = ncf->ncf_cb_mtime; - if (nfsd4_vet_deleg_time(&attrs.ia_atime, &atime, &now)) + if (nfsd4_vet_deleg_time(&attrs.ia_atime, &dp->dl_atime, &now)) attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - if (nfsd4_vet_deleg_time(&attrs.ia_mtime, &mtime, &now)) { - attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | - ATTR_MTIME | ATTR_MTIME_SET; + if (nfsd4_vet_deleg_time(&attrs.ia_mtime, &dp->dl_mtime, &now)) { + attrs.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; attrs.ia_ctime = attrs.ia_mtime; + if (nfsd4_vet_deleg_time(&attrs.ia_ctime, &dp->dl_ctime, &now)) + attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET; } } else { attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; From e5e9b24ab8fa9e899d6627123d7d5ba0c317d267 Mon Sep 
17 00:00:00 2001 From: Jeff Layton Date: Wed, 30 Jul 2025 09:24:37 -0400 Subject: [PATCH 19/42] nfsd: freeze c/mtime updates with outstanding WRITE_ATTRS delegation Instead of allowing the ctime to roll backward with a WRITE_ATTRS delegation, set FMODE_NOCMTIME on the file and have it skip mtime and ctime updates. It is possible that the client will never send a SETATTR to set the times before returning the delegation. Add two new bools to struct nfs4_delegation: dl_written: tracks whether the file has been written since the delegation was granted. This is set in the WRITE and LAYOUTCOMMIT handlers. dl_setattr: tracks whether the client has sent at least one valid mtime that can also update the ctime in a SETATTR. When unlocking the lease for the delegation, clear FMODE_NOCMTIME. If the file has been written, but no setattr for the delegated mtime and ctime has been done, update the timestamps to current_time(). Suggested-by: NeilBrown Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 26 ++++++++++++++++++++++++-- fs/nfsd/nfs4state.c | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/state.h | 4 +++- 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index aacd912a5fbe..bfebe6e25638 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1151,7 +1151,9 @@ vet_deleg_attrs(struct nfsd4_setattr *setattr, struct nfs4_delegation *dp) if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { if (nfsd4_vet_deleg_time(&iattr->ia_mtime, &dp->dl_mtime, &now)) { iattr->ia_ctime = iattr->ia_mtime; - if (!nfsd4_vet_deleg_time(&iattr->ia_ctime, &dp->dl_ctime, &now)) + if (nfsd4_vet_deleg_time(&iattr->ia_ctime, &dp->dl_ctime, &now)) + dp->dl_setattr = true; + else iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET); } else { iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET | @@ -1238,12 +1240,26 @@ out: return status; } +static void nfsd4_file_mark_deleg_written(struct nfs4_file *fi) +{ + 
spin_lock(&fi->fi_lock); + if (!list_empty(&fi->fi_delegations)) { + struct nfs4_delegation *dp = list_first_entry(&fi->fi_delegations, + struct nfs4_delegation, dl_perfile); + + if (dp->dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG) + dp->dl_written = true; + } + spin_unlock(&fi->fi_lock); +} + static __be32 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; + struct nfs4_stid *stid = NULL; struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; @@ -1256,10 +1272,15 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &nf, NULL); + stateid, WR_STATE, &nf, &stid); if (status) return status; + if (stid) { + nfsd4_file_mark_deleg_written(stid->sc_file); + nfs4_put_stid(stid); + } + write->wr_how_written = write->wr_stable_how; status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, write->wr_offset, &write->wr_payload, @@ -2550,6 +2571,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, mutex_unlock(&ls->ls_mutex); nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); + nfsd4_file_mark_deleg_written(ls->ls_stid.sc_file); nfs4_put_stid(&ls->ls_stid); out: return nfserr; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 205ee8cc6fa2..81fa7cc6c77b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1222,6 +1222,42 @@ static void put_deleg_file(struct nfs4_file *fp) nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ); } +static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) +{ + struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME }; + struct inode *inode = file_inode(f); + int ret; + + /* don't do anything if FMODE_NOCMTIME isn't set */ + if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 
0) + return; + + spin_lock(&f->f_lock); + f->f_mode &= ~FMODE_NOCMTIME; + spin_unlock(&f->f_lock); + + /* was it never written? */ + if (!dp->dl_written) + return; + + /* did it get a setattr for the timestamps at some point? */ + if (dp->dl_setattr) + return; + + /* Stamp everything to "now" */ + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); + inode_unlock(inode); + if (ret) { + struct inode *inode = file_inode(f); + + pr_notice_ratelimited("Unable to update timestamps on inode %02x:%02x:%lu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); + } +} + static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1229,6 +1265,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) WARN_ON_ONCE(!fp->fi_delegees); + nfsd4_finalize_deleg_timestamps(dp, nf->nf_file); kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -6265,6 +6302,8 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + struct file *f = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) || !nfs4_delegation_stat(dp, currentfh, &stat)) { nfs4_put_stid(&dp->dl_stid); @@ -6278,6 +6317,9 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, dp->dl_atime = stat.atime; dp->dl_ctime = stat.ctime; dp->dl_mtime = stat.mtime; + spin_lock(&f->f_lock); + f->f_mode |= FMODE_NOCMTIME; + spin_unlock(&f->f_lock); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index bf9436cdb93c..b6ac0f37e9cd 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -217,10 +217,12 @@ struct nfs4_delegation { struct nfs4_clnt_odstate *dl_clnt_odstate; time64_t dl_time; u32 dl_type; -/* For recall: */ + /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + bool dl_written; + bool dl_setattr; /* for CB_GETATTR */ struct nfs4_cb_fattr dl_cb_fattr; From 6ecdfd7aa8e30e16193d4ee07bcb3f1216dbc358 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 29 Jul 2025 14:07:09 +0100 Subject: [PATCH 20/42] lockd: Remove space before newline There is an extraneous space before a newline in a dprintk message. Remove the space. Signed-off-by: Colin Ian King Signed-off-by: Chuck Lever --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c1315df4b350..a31dc9588eb8 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -980,7 +980,7 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) struct file_lock *fl; int error; - dprintk("grant_reply: looking for cookie %x, s=%d \n", + dprintk("grant_reply: looking for cookie %x, s=%d\n", *(unsigned int *)(cookie->data), status); if (!(block = nlmsvc_find_block(cookie))) return; From 17695d72d0b192bb471a699483dd6c6c2576c57d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Aug 2025 22:46:59 +0000 Subject: [PATCH 21/42] nfsd: Replace open-coded conversion of bytes to hex Since the Linux kernel's sprintf() has conversion to hex built-in via "%*phN", delete md5_to_hex() and just use that. Also add an explicit array bound to the dname parameter of nfs4_make_rec_clidname() to make its size clear. No functional change. 
Reviewed-by: Jeff Layton Signed-off-by: Eric Biggers Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 2231192ec33f..54f5e5392ef9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -92,22 +92,8 @@ nfs4_reset_creds(const struct cred *original) put_cred(revert_creds(original)); } -static void -md5_to_hex(char *out, char *md5) -{ - int i; - - for (i=0; i<16; i++) { - unsigned char c = md5[i]; - - *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); - } - *out = '\0'; -} - static int -nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) +nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) { struct xdr_netobj cksum; struct crypto_shash *tfm; @@ -133,7 +119,7 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) if (status) goto out; - md5_to_hex(dname, cksum.data); + sprintf(dname, "%*phN", 16, cksum.data); status = 0; out: From 9ebcd022a34388bd3c37c6a11c1a9d49d5394eb2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 4 Aug 2025 22:47:00 +0000 Subject: [PATCH 22/42] nfsd: Eliminate an allocation in nfs4_make_rec_clidname() Since MD5 digests are fixed-size, make nfs4_make_rec_clidname() store the digest in a stack buffer instead of a dynamically allocated buffer. Use MD5_DIGEST_SIZE instead of a hard-coded value, both in nfs4_make_rec_clidname() and in the definition of HEXDIR_LEN. 
Signed-off-by: Eric Biggers Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 15 ++++----------- fs/nfsd/state.h | 4 +++- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 54f5e5392ef9..e2b9472e5c78 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -95,7 +95,7 @@ nfs4_reset_creds(const struct cred *original) static int nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) { - struct xdr_netobj cksum; + u8 digest[MD5_DIGEST_SIZE]; struct crypto_shash *tfm; int status; @@ -107,23 +107,16 @@ nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) goto out_no_tfm; } - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - status = -ENOMEM; - goto out; - } - status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, - cksum.data); + digest); if (status) goto out; - sprintf(dname, "%*phN", 16, cksum.data); + static_assert(HEXDIR_LEN == 2 * MD5_DIGEST_SIZE + 1); + sprintf(dname, "%*phN", MD5_DIGEST_SIZE, digest); status = 0; out: - kfree(cksum.data); crypto_free_shash(tfm); out_no_tfm: return status; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b6ac0f37e9cd..1e736f402426 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include #include #include #include @@ -391,7 +392,8 @@ struct nfsd4_sessionid { u32 reserved; }; -#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ +/* Length of MD5 digest as hex, plus terminating '\0' */ +#define HEXDIR_LEN (2 * MD5_DIGEST_SIZE + 1) /* * State Meaning Where set From ab1c282c010c4f327bd7addc3c0035fd8e3c1721 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 6 Aug 2025 03:10:01 +0200 Subject: [PATCH 23/42] NFSD: Fix destination buffer size in nfsd4_ssc_setup_dul() Commit 5304877936c0 ("NFSD: Fix strncpy() 
fortify warning") replaced strncpy(,, sizeof(..)) with strlcpy(,, sizeof(..) - 1), but strlcpy() already guaranteed NUL-termination of the destination buffer and subtracting one byte potentially truncated the source string. The incorrect size was then carried over in commit 72f78ae00a8e ("NFSD: move from strlcpy with unused retval to strscpy") when switching from strlcpy() to strscpy(). Fix this off-by-one error by using the full size of the destination buffer again. Cc: stable@vger.kernel.org Fixes: 5304877936c0 ("NFSD: Fix strncpy() fortify warning") Signed-off-by: Thorsten Blum Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index bfebe6e25638..d7c58aa64f06 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1519,7 +1519,7 @@ try_again: return 0; } if (work) { - strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); + strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr)); refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); From e4f574ca9c6dfa66695bb054ff5df43ecea873ec Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Wed, 6 Aug 2025 15:15:43 -0400 Subject: [PATCH 24/42] nfsd: decouple the xprtsec policy check from check_nfsd_access() A while back I had reported that an NFSv3 client could successfully mount using '-o xprtsec=none' an export that had been exported with 'xprtsec=tls:mtls'. By "successfully" I mean that the mount command would succeed and the mount would show up in /proc/mounts. Attempting to do anything further with the mount would be met with NFS3ERR_ACCES. This was fixed (albeit accidentally) by commit bb4f07f2409c ("nfsd: Fix NFSD_MAY_BYPASS_GSS and NFSD_MAY_BYPASS_GSS_ON_ROOT") and was subsequently re-broken by commit 0813c5f01249 ("nfsd: fix access checking for NLM under XPRTSEC policies").
Transport Layer Security isn't an RPC security flavor or pseudo-flavor, so we shouldn't be conflating them when determining whether the access checks can be bypassed. Split check_nfsd_access() into two helpers, and have __fh_verify() call the helpers directly since __fh_verify() has logic that allows one or both of the checks to be skipped. All other sites will continue to call check_nfsd_access(). Link: https://lore.kernel.org/linux-nfs/ZjO3Qwf_G87yNXb2@aion/ Fixes: 9280c5774314 ("NFSD: Handle new xprtsec= export option") Cc: stable@vger.kernel.org Signed-off-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/export.c | 82 +++++++++++++++++++++++++++++++++--------------- fs/nfsd/export.h | 3 ++ fs/nfsd/nfsfh.c | 24 +++++++++++++- 3 files changed, 83 insertions(+), 26 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cadfc2bae60e..95b5681152c4 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1082,50 +1082,62 @@ static struct svc_export *exp_find(struct cache_detail *cd, } /** - * check_nfsd_access - check if access to export is allowed. + * check_xprtsec_policy - check if access to export is allowed by the + * xprtsec policy * @exp: svc_export that is being accessed. - * @rqstp: svc_rqst attempting to access @exp (will be NULL for LOCALIO). - * @may_bypass_gss: reduce strictness of authorization check + * @rqstp: svc_rqst attempting to access @exp. + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. 
* * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, - bool may_bypass_gss) +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; - struct svc_xprt *xprt; - - /* - * If rqstp is NULL, this is a LOCALIO request which will only - * ever use a filehandle/credential pair for which access has - * been affirmed (by ACCESS or OPEN NFS requests) over the - * wire. So there is no need for further checks here. - */ - if (!rqstp) - return nfs_ok; - - xprt = rqstp->rq_xprt; + struct svc_xprt *xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } - if (!may_bypass_gss) - goto denied; + return nfserr_wrongsec; +} + +/** + * check_security_flavor - check if access to export is allowed by the + * security flavor + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. 
+ * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; -ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return nfs_ok; @@ -1167,10 +1179,30 @@ ok: } } -denied: return nfserr_wrongsec; } +/** + * check_nfsd_access - check if access to export is allowed. + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + __be32 status; + + status = check_xprtsec_policy(exp, rqstp); + if (status != nfs_ok) + return status; + return check_security_flavor(exp, rqstp, may_bypass_gss); +} + /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index b9c0adb3ce09..ef5581911d5b 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -101,6 +101,9 @@ struct svc_expkey { struct svc_cred; int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp); +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp); +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss); __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, bool may_bypass_gss); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f4c2fb3dd5d0..062cfc18d8c6 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -364,10 +364,30 @@ __fh_verify(struct svc_rqst *rqstp, if (error) goto out; + /* + * If rqstp is NULL, this is a LOCALIO request which 
will only + * ever use a filehandle/credential pair for which access has + * been affirmed (by ACCESS or OPEN NFS requests) over the + * wire. Skip both the xprtsec policy and the security flavor + * checks. + */ + if (!rqstp) + goto check_permissions; + if ((access & NFSD_MAY_NLM) && (exp->ex_flags & NFSEXP_NOAUTHNLM)) /* NLM is allowed to fully bypass authentication */ goto out; + /* + * NLM is allowed to bypass the xprtsec policy check because lockd + * doesn't support xprtsec. + */ + if (!(access & NFSD_MAY_NLM)) { + error = check_xprtsec_policy(exp, rqstp); + if (error) + goto out; + } + if (access & NFSD_MAY_BYPASS_GSS) may_bypass_gss = true; /* @@ -379,13 +399,15 @@ __fh_verify(struct svc_rqst *rqstp, && exp->ex_path.dentry == dentry) may_bypass_gss = true; - error = check_nfsd_access(exp, rqstp, may_bypass_gss); + error = check_security_flavor(exp, rqstp, may_bypass_gss); if (error) goto out; + /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ if (rqstp) svc_xprt_set_valid(rqstp->rq_xprt); +check_permissions: /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); out: From a9a15ba23efc4d6d34127e8d175ae63a95434f58 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2025 10:37:07 -0400 Subject: [PATCH 25/42] sunrpc: fix pr_notice in svc_tcp_sendto() to show correct length This pr_notice() is confusing since it only prints xdr->len, which doesn't include the 4-byte record marker. That can make it sometimes look like the socket sent more than was requested if it's short by just a few bytes. Add sizeof(marker) to the size and fix the format accordingly. 
Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e2c5e0e626f9..1afaeb45d6a3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1293,10 +1293,10 @@ out_notconn: mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: - pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len); + (err < 0) ? err : sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; From 7569065fb123f8428cb9d29939dd16d43d4b50c4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2025 10:37:08 -0400 Subject: [PATCH 26/42] sunrpc: eliminate return pointer in svc_tcp_sendmsg() Return a positive value if something was sent, or a negative error code. Eliminate the "err" variable in the only caller as well. Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 1afaeb45d6a3..c0d5a27ba674 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1224,7 +1224,7 @@ err_noclose: * that the pages backing @xdr are unchanging. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, - rpc_fraghdr marker, int *sentp) + rpc_fraghdr marker) { struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, @@ -1233,8 +1233,6 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, void *buf; int ret; - *sentp = 0; - /* The stream record marker is copied into a temporary page * fragment buffer so that it can be included in rq_bvec. 
*/ @@ -1252,10 +1250,7 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); - if (ret < 0) - return ret; - *sentp += ret; - return 0; + return ret; } /** @@ -1274,7 +1269,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - int sent, err; + int sent; svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; @@ -1282,9 +1277,9 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); - trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); - if (err < 0 || sent != (xdr->len + sizeof(marker))) + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; mutex_unlock(&xprt->xpt_mutex); return sent; @@ -1295,8 +1290,8 @@ out_notconn: out_close: pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, - (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len + sizeof(marker)); + (sent < 0) ? "got error" : "sent", + sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; From f64397e04b4d094319a8f72bb1b82e4d2e3672ae Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Mon, 18 Aug 2025 18:03:20 +0800 Subject: [PATCH 27/42] NFSD: Drop redundant conversion to bool The result of integer comparison already evaluates to bool. No need for explicit conversion. 
Signed-off-by: Xichao Zhao Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 84b0c8b559dc..f07d790d56aa 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -26,7 +26,7 @@ static int nfsd_dsr_get(void *data, u64 *val) static int nfsd_dsr_set(void *data, u64 val) { - nfsd_disable_splice_read = (val > 0) ? true : false; + nfsd_disable_splice_read = (val > 0); return 0; } From 898374fdd7f06fa4c4a66e8be3135efeae6128d5 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 19 Aug 2025 14:04:02 -0400 Subject: [PATCH 28/42] nfsd: unregister with rpcbind when deleting a transport When a listener is added, a part of creation of transport also registers program/port with rpcbind. However, when the listener is removed, while transport goes away, rpcbind still has the entry for that port/type. When deleting the transport, unregister with rpcbind when appropriate. ---v2 created a new xpt_flag XPT_RPCB_UNREG to mark TCP and UDP transport and at xprt destroy send rpcbind unregister if flag set. Suggested-by: Chuck Lever Fixes: d093c9089260 ("nfsd: fix management of listener transports") Cc: stable@vger.kernel.org Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_xprt.h | 3 +++ net/sunrpc/svc_xprt.c | 13 +++++++++++++ net/sunrpc/svcsock.c | 2 ++ 3 files changed, 18 insertions(+) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 369a89aea186..2b886f7eb295 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -104,6 +104,9 @@ enum { * it has access to. It is NOT counted * in ->sv_tmpcnt. 
*/ + XPT_RPCB_UNREG, /* transport that needs unregistering + * with rpcbind (TCP, UDP) on destroy + */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..b800d704d807 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. + */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c0d5a27ba674..7b90abc5cf0e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -836,6 +836,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -1350,6 +1351,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { From dd9adfa0da2b0dac4c0abdae4bdb88c366bd83d5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:27 -0400 Subject: [PATCH 29/42] NFS: Remove rpcbind cleanup for NFSv4.0 callback The NFS 
client's NFSv4.0 callback listeners are created with SVC_SOCK_ANONYMOUS, therefore svc_setup_socket() does not register them with the client's rpcbind service. And, note that nfs_callback_down_net() does not call svc_rpcb_cleanup() at all when shutting down the callback server. Even if svc_setup_socket() were to attempt to register or unregister these sockets, the callback service has vs_hidden set, which shunts the rpcbind upcalls. The svc_rpcb_cleanup() error flow was introduced by commit c946556b8749 ("NFS: move per-net callback thread initialization to nfs_callback_up_net()"). It doesn't appear in the code that was relocated by that commit. Therefore, there is no need to call svc_rpcb_cleanup() when listener creation fails during callback server start-up. Signed-off-by: Chuck Lever --- fs/nfs/callback.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86bdc7d23fb9..511f80878809 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); From d73d06dac604043b94a5f18ebb6a69da1b867702 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 20 Aug 2025 10:27:28 -0400 Subject: [PATCH 30/42] SUNRPC: Move the svc_rpcb_cleanup() call sites Clean up: because svc_rpcb_cleanup() and svc_xprt_destroy_all() are always invoked in pairs, we can deduplicate code by moving the svc_rpcb_cleanup() call 
sites into svc_xprt_destroy_all(). Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/lockd/svc.c | 6 ++---- fs/nfs/callback.c | 2 +- fs/nfsd/nfsctl.c | 2 +- fs/nfsd/nfssvc.c | 7 ++----- include/linux/sunrpc/svc_xprt.h | 3 ++- net/sunrpc/svc.c | 1 - net/sunrpc/svc_xprt.c | 7 ++++++- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! net=%x\n", diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 511f80878809..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..63d52edcad72 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1993,7 +1993,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. 
*/ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. */ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2b886f7eb295..da2a2531e110 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -168,7 +168,8 @@ int svc_xprt_create(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags, const struct cred *cred); -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net); +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister); void svc_xprt_received(struct svc_xprt *xprt); void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index fc70e13b1cb9..cb4010e2dc0c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 
b800d704d807..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1115,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1127,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1137,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); From fb340bfd48bcc3a51d35be1fe2a2db290092d4ea Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 21 Aug 2025 10:53:20 -0400 Subject: [PATCH 31/42] NFSD: Delay adding new entries to LRU Neil Brown observes: > I would not include RC_INPROG entries in the lru at all - they are > always ignored, and will be added when they are switched to > RCU_DONE. I also removed a stale comment. Suggested-by: NeilBrown Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ba9d326b3de6..d929c8c63bd9 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -237,10 +237,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) } -/* - * Move cache entry to end of LRU list, and queue the cleaner to run if it's - * not already scheduled. 
- */ static void lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { @@ -272,13 +268,6 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, /* The bucket LRU is ordered oldest-first. */ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { - /* - * Don't free entries attached to calls that are still - * in-progress, but do keep scanning the list. - */ - if (rp->c_state == RC_INPROG) - continue; - if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && time_before(expiry, rp->c_timestamp)) break; @@ -453,8 +442,6 @@ out: nn->longest_chain_cachesize, atomic_read(&nn->num_drc_entries)); } - - lru_put_end(b, ret); return ret; } From 8ddd06be9a9e2b9f4af9b337150af834862331ef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 21 Aug 2025 10:53:21 -0400 Subject: [PATCH 32/42] NFSD: Reduce DRC bucket size The common case is that a DRC lookup will not find the XID in the bucket. Reduce the amount of pointer chasing during the lookup by keeping fewer entries in each hash bucket. Changing the bucket size constant forces the size of the DRC hash table to increase, and the height of each bucket r-b tree to be reduced. Signed-off-by: Chuck Lever --- fs/nfsd/nfscache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d929c8c63bd9..ab13ee9c7fd8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -27,7 +27,7 @@ * cache size, the idea being that when the cache is at its maximum number * of entries, then this should be the average number of entries per bucket. 
*/ -#define TARGET_BUCKET_SIZE 64 +#define TARGET_BUCKET_SIZE 8 struct nfsd_drc_bucket { struct rb_root rb_head; From a082e4b4d08a4a0e656d90c2c05da85f23e6d0c9 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 21 Aug 2025 16:31:46 -0400 Subject: [PATCH 33/42] nfsd: nfserr_jukebox in nlm_fopen should lead to a retry When a v3 NLM request finds a conflicting delegation, it triggers a delegation recall and nfsd_open fails with EAGAIN. nfsd_open then translates EAGAIN into nfserr_jukebox. In nlm_fopen, instead of returning nlm_failed when there is a conflicting delegation, drop this NLM request so that the client retries. Once the delegation is recalled and if a local lock is claimed, a retry would lead to nfsd returning a nlm_lck_blocked error or a successful nlm lock. Fixes: d343fce148a4 ("[PATCH] knfsd: Allow lockd to drop replies as appropriate") Cc: stable@vger.kernel.org # v6.6 Signed-off-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- fs/nfsd/lockd.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index edc9f75dc75c..6b042218668b 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -57,6 +57,21 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, switch (nfserr) { case nfs_ok: return 0; + case nfserr_jukebox: + /* this error can indicate a presence of a conflicting + * delegation to an NLM lock request. Options are: + * (1) For now, drop this request and make the client + * retry. When delegation is returned, client's lock retry + * will complete. + * (2) NLM4_DENIED as per "spec" signals to the client + * that the lock is unavailable now but client can retry. + * Linux client implementation does not. It treats + * NLM4_DENIED same as NLM4_FAILED and errors the request. + * (3) For the future, treat this as blocked lock and try + * to callback when the delegation is returned but might + * not have a proper lock request to block on.
+ */ + fallthrough; case nfserr_dropit: return nlm_drop_reply; case nfserr_stale: From 13289ed501bad7a37ccbfa8581961d2e9dd4aea3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 3 Aug 2025 14:21:30 -0700 Subject: [PATCH 34/42] nfsd: Don't force CRYPTO_LIB_SHA256 to be built-in Now that nfsd is accessing SHA-256 via the library API instead of via crypto_shash, there is a direct symbol dependency on the SHA-256 code and there is no benefit to be gained from forcing it to be built-in. Therefore, select CRYPTO_LIB_SHA256 from NFSD (conditional on NFSD_V4) instead of from NFSD_V4, so that it can be 'm' if NFSD is 'm'. Signed-off-by: Eric Biggers Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 879e0b104d1c..e134dce45e35 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -5,6 +5,7 @@ config NFSD depends on FILE_LOCKING depends on FSNOTIFY select CRC32 + select CRYPTO_LIB_SHA256 if NFSD_V4 select LOCKD select SUNRPC select EXPORTFS @@ -77,7 +78,6 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO - select CRYPTO_LIB_SHA256 select CRYPTO_MD5 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 From 6c15463c4511d26f2a820f63f5b76624a71afc44 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 12 Aug 2025 19:33:59 +0800 Subject: [PATCH 35/42] sunrpc: fix "occurence"->"occurrence" Trivial fix to spelling mistake in comment text. 
Signed-off-by: Xichao Zhao Reviewed-by: Joe Damato Signed-off-by: Chuck Lever --- net/sunrpc/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 09434e1143c5..8b01b7ae2690 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -389,7 +389,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); - /* buf_len is the len until the first occurence of either + /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); From db155b7c7c85b5f14edec21e164001a168581ffb Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Mon, 25 Aug 2025 16:11:02 +0300 Subject: [PATCH 36/42] NFSD: Disallow layoutget during grace period When the server is recovering from a reboot and is in a grace period, any operation that may result in deletion or reallocation of block extents should not be allowed. See RFC 8881, section 18.43.3. If multiple clients write data to the same file, rebooting the server during writing may result in file corruption. In the worst case, the exported XFS may also become corrupted. Observed this behavior while testing pNFS block volume setup. 
Co-developed-by: Konstantin Evtushenko Signed-off-by: Konstantin Evtushenko Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d7c58aa64f06..2dc8910f8f72 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2435,6 +2435,7 @@ static __be32 nfsd4_layoutget(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { + struct net *net = SVC_NET(rqstp); struct nfsd4_layoutget *lgp = &u->layoutget; struct svc_fh *current_fh = &cstate->current_fh; const struct nfsd4_layout_ops *ops; @@ -2486,6 +2487,10 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (lgp->lg_seg.length == 0) goto out; + nfserr = nfserr_grace; + if (locks_in_grace(net)) + goto out; + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, true, lgp->lg_layout_type, &ls); if (nfserr) { From e0963ce53b0097a115ad35669b02cf5b87607ebf Mon Sep 17 00:00:00 2001 From: Sergey Bashirov Date: Thu, 4 Sep 2025 18:48:44 +0300 Subject: [PATCH 37/42] NFSD: Allow layoutcommit during grace period If the loca_reclaim field is set to TRUE, this indicates that the client is attempting to commit changes to a layout after the restart of the metadata server during the metadata server's recovery grace period. This type of request may be necessary when the client has uncommitted writes to provisionally allocated byte-ranges of a file that were sent to the storage devices before the restart of the metadata server. See RFC 8881, section 18.42.3. Without this, the client is not able to increase the file size and commit preallocated extents when the block/scsi layout server is restarted during a write and is in a grace period. And when the grace period ends, the client also cannot perform layoutcommit because the old layout state becomes invalid, resulting in file corruption. 
Co-developed-by: Konstantin Evtushenko Signed-off-by: Konstantin Evtushenko Signed-off-by: Sergey Bashirov Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2dc8910f8f72..33c21666fa7f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2526,6 +2526,7 @@ static __be32 nfsd4_layoutcommit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { + struct net *net = SVC_NET(rqstp); struct nfsd4_layoutcommit *lcp = &u->layoutcommit; const struct nfsd4_layout_seg *seg = &lcp->lc_seg; struct svc_fh *current_fh = &cstate->current_fh; @@ -2561,23 +2562,34 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, } } - nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, - false, lcp->lc_layout_type, - &ls); - if (nfserr) { - trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); - /* fixup error code as per RFC5661 */ - if (nfserr == nfserr_bad_stateid) - nfserr = nfserr_badlayout; + nfserr = nfserr_grace; + if (locks_in_grace(net) && !lcp->lc_reclaim) goto out; + nfserr = nfserr_no_grace; + if (!locks_in_grace(net) && lcp->lc_reclaim) + goto out; + + if (!lcp->lc_reclaim) { + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, + &lcp->lc_sid, false, lcp->lc_layout_type, &ls); + if (nfserr) { + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); } - /* LAYOUTCOMMIT does not require any serialization */ - mutex_unlock(&ls->ls_mutex); - nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); - nfsd4_file_mark_deleg_written(ls->ls_stid.sc_file); - nfs4_put_stid(&ls->ls_stid); + + if (!lcp->lc_reclaim) { + nfsd4_file_mark_deleg_written(ls->ls_stid.sc_file); + 
nfs4_put_stid(&ls->ls_stid); + } out: return nfserr; } From eafdd7e949bb412bb6daa1f8c71b61d11c23ca5f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Sep 2025 21:59:30 +0300 Subject: [PATCH 38/42] nfsd: delete unnecessary NULL check in __fh_verify() In commit 4a0de50a44bb ("nfsd: decouple the xprtsec policy check from check_nfsd_access()") we added a NULL check on "rqstp" earlier in the function. The later check is no longer required, so delete it. Signed-off-by: Dan Carpenter Reviewed-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 062cfc18d8c6..3edccc38db42 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -403,9 +403,7 @@ __fh_verify(struct svc_rqst *rqstp, if (error) goto out; - /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ - if (rqstp) - svc_xprt_set_valid(rqstp->rq_xprt); + svc_xprt_set_valid(rqstp->rq_xprt); check_permissions: /* Finally, check access permissions. */ From d6e80d48f9c83fc766c1418c584dbba5a0bc9e8a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 5 Sep 2025 10:30:37 -0400 Subject: [PATCH 39/42] NFSD: Do the grace period check in ->proc_layoutget RFC 8881 Section 18.43.3 states: > If the metadata server is in a grace period, and does not persist > layouts and device ID to device address mappings, then it MUST > return NFS4ERR_GRACE (see Section 8.4.2.1). Jeff observed that this suggests the grace period check is better done by the individual layout type implementations, because checking for the server grace period is unnecessary for some layout types.
Suggested-by: Jeff Layton Link: https://lore.kernel.org/linux-nfs/7h5p5ktyptyt37u6jhpbjfd5u6tg44lriqkdc7iz7czeeabrvo@ijgxz27dw4sg/T/#t Signed-off-by: Chuck Lever --- fs/nfsd/blocklayout.c | 7 +++++-- fs/nfsd/flexfilelayout.c | 4 ++-- fs/nfsd/nfs4proc.c | 7 +------ fs/nfsd/pnfs.h | 4 ++-- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 0822d8a119c6..fde5539cf6a6 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -18,8 +18,8 @@ static __be32 -nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; @@ -29,6 +29,9 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, u32 device_generation = 0; int error; + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + if (seg->offset & (block_size - 1)) { dprintk("pnfsd: I/O misaligned\n"); goto out_layoutunavailable; diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 3ca5304440ff..c318cf74e388 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -20,8 +20,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PNFS static __be32 -nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_ff_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; u32 device_generation = 0; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 33c21666fa7f..e466cf52d7d7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2435,7 +2435,6 @@ static __be32 nfsd4_layoutget(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - struct net *net = 
SVC_NET(rqstp); struct nfsd4_layoutget *lgp = &u->layoutget; struct svc_fh *current_fh = &cstate->current_fh; const struct nfsd4_layout_ops *ops; @@ -2487,10 +2486,6 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (lgp->lg_seg.length == 0) goto out; - nfserr = nfserr_grace; - if (locks_in_grace(net)) - goto out; - nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, true, lgp->lg_layout_type, &ls); if (nfserr) { @@ -2502,7 +2497,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), + nfserr = ops->proc_layoutget(rqstp, d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index dfd411d1f363..db9af780438b 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -29,8 +29,8 @@ struct nfsd4_layout_ops { __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, const struct nfsd4_getdeviceinfo *gdevp); - __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, - struct nfsd4_layoutget *lgp); + __be32 (*proc_layoutget)(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *lgp); __be32 (*encode_layoutget)(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp); From 6304affe45648294229d18cab2b4ba6d40045570 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 6 Sep 2025 17:25:11 -0400 Subject: [PATCH 40/42] NFSD: Add io_cache_{read,write} controls to debugfs Add 'io_cache_read' to NFSD's debugfs interface so that any data read by NFSD will either be: - cached using page cache (NFSD_IO_BUFFERED=0) - cached but removed from the page cache upon completion (NFSD_IO_DONTCACHE=1). 
io_cache_read may be set by writing to: /sys/kernel/debug/nfsd/io_cache_read Add 'io_cache_write' to NFSD's debugfs interface so that any data written by NFSD will either be: - cached using page cache (NFSD_IO_BUFFERED=0) - cached but removed from the page cache upon completion (NFSD_IO_DONTCACHE=1). io_cache_write may be set by writing to: /sys/kernel/debug/nfsd/io_cache_write The default value for both settings is NFSD_IO_BUFFERED, which is NFSD's existing behavior for both read and write. Changes to these settings take immediate effect for all exports and NFS versions. Currently only xfs and ext4 implement RWF_DONTCACHE. For file systems that do not implement RWF_DONTCACHE, NFSD uses only buffered I/O when the io_cache setting is NFSD_IO_DONTCACHE. Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/debugfs.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfsd.h | 9 +++++ fs/nfsd/vfs.c | 21 +++++++++++ 3 files changed, 123 insertions(+) diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index f07d790d56aa..ed2b9e066206 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -27,11 +27,98 @@ static int nfsd_dsr_get(void *data, u64 *val) static int nfsd_dsr_set(void *data, u64 val) { nfsd_disable_splice_read = (val > 0); + if (!nfsd_disable_splice_read) { + /* + * Must use buffered I/O if splice_read is enabled. + */ + nfsd_io_cache_read = NFSD_IO_BUFFERED; + } return 0; } DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); +/* + * /sys/kernel/debug/nfsd/io_cache_read + * + * Contents: + * %0: NFS READ will use buffered IO + * %1: NFS READ will use dontcache (buffered IO w/ dropbehind) + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces.
+ */ + +static int nfsd_io_cache_read_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_read; + return 0; +} + +static int nfsd_io_cache_read_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + nfsd_io_cache_read = NFSD_IO_BUFFERED; + break; + case NFSD_IO_DONTCACHE: + /* + * Must disable splice_read when enabling + * NFSD_IO_DONTCACHE. + */ + nfsd_disable_splice_read = true; + nfsd_io_cache_read = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get, + nfsd_io_cache_read_set, "%llu\n"); + +/* + * /sys/kernel/debug/nfsd/io_cache_write + * + * Contents: + * %0: NFS WRITE will use buffered IO + * %1: NFS WRITE will use dontcache (buffered IO w/ dropbehind) + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces. + */ + +static int nfsd_io_cache_write_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_write; + return 0; +} + +static int nfsd_io_cache_write_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + case NFSD_IO_DONTCACHE: + nfsd_io_cache_write = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_write_fops, nfsd_io_cache_write_get, + nfsd_io_cache_write_set, "%llu\n"); + void nfsd_debugfs_exit(void) { debugfs_remove_recursive(nfsd_top_dir); @@ -44,4 +131,10 @@ void nfsd_debugfs_init(void) debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, nfsd_top_dir, NULL, &nfsd_dsr_fops); + + debugfs_create_file("io_cache_read", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_read_fops); + + debugfs_create_file("io_cache_write", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_write_fops); } diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1cd0bed57bc2..809729d41e08 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -153,6 +153,15 @@ static inline void nfsd_debugfs_exit(void) {} 
extern bool nfsd_disable_splice_read __read_mostly; +enum { + /* Any new NFSD_IO enum value must be added at the end */ + NFSD_IO_BUFFERED, + NFSD_IO_DONTCACHE, +}; + +extern u64 nfsd_io_cache_read __read_mostly; +extern u64 nfsd_io_cache_write __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3cd3b9e069f4..714777c221ed 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP bool nfsd_disable_splice_read __read_mostly; +u64 nfsd_io_cache_read __read_mostly = NFSD_IO_BUFFERED; +u64 nfsd_io_cache_write __read_mostly = NFSD_IO_BUFFERED; /** * nfserrno - Map Linux errnos to NFS errnos @@ -1099,6 +1101,16 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, size_t len; init_sync_kiocb(&kiocb, file); + + switch (nfsd_io_cache_read) { + case NFSD_IO_BUFFERED: + break; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags = IOCB_DONTCACHE; + break; + } + kiocb.ki_pos = offset; v = 0; @@ -1224,6 +1236,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); + + switch (nfsd_io_cache_write) { + case NFSD_IO_BUFFERED: + break; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags |= IOCB_DONTCACHE; + break; + } host_err = vfs_iocb_iter_write(file, &kiocb, &iter); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); From d8e97cc476e33037ac69c5b09b351f5cc8d0589d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 6 Sep 2025 16:00:19 -0700 Subject: [PATCH 41/42] SUNRPC: Make RPCSEC_GSS_KRB5 select CRYPTO instead of depending on it Make RPCSEC_GSS_KRB5 select CRYPTO instead of depending on it. 
This unblocks the eventual removal of the selection of CRYPTO from NFSD_V4, which will no longer be needed by nfsd itself due to switching to the crypto library functions. But NFSD_V4 selects RPCSEC_GSS_KRB5, which still needs CRYPTO. It makes more sense for RPCSEC_GSS_KRB5 to select CRYPTO itself, like most other kconfig options that need CRYPTO do. Signed-off-by: Eric Biggers Acked-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..33aafdc8392e 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,9 +18,10 @@ config SUNRPC_SWAP config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" - depends on SUNRPC && CRYPTO + depends on SUNRPC default y select SUNRPC_GSS + select CRYPTO select CRYPTO_SKCIPHER select CRYPTO_HASH help From 73cc6ec1a89a6c443a77b9b93ddcea63b7cea223 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 8 Sep 2025 11:37:25 +1000 Subject: [PATCH 42/42] nfsd: discard nfserr_dropit nfserr_dropit hasn't been used for over a decade, since rq_dropme and the RQ_DROPME flag were introduced. Time to get rid of it completely. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/lockd.c | 2 -- fs/nfsd/nfsd.h | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 6b042218668b..c774ce9aa296 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -71,8 +71,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, * to callback when the delegation is returned but might * not have a proper lock request to block on.
*/ - fallthrough; - case nfserr_dropit: return nlm_drop_reply; case nfserr_stale: return nlm_stale_fh; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 809729d41e08..ea87b42894dd 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -344,14 +344,8 @@ void nfsd_lockd_shutdown(void); * cannot conflict with any existing be32 nfserr value. */ enum { - NFSERR_DROPIT = NFS4ERR_FIRST_FREE, -/* if a request fails due to kmalloc failure, it gets dropped. - * Client should resend eventually - */ -#define nfserr_dropit cpu_to_be32(NFSERR_DROPIT) - /* end-of-file indicator in readdir */ - NFSERR_EOF, + NFSERR_EOF = NFS4ERR_FIRST_FREE, #define nfserr_eof cpu_to_be32(NFSERR_EOF) /* replay detected */