From 19bfef0178c64a6281a44687380b082e69215e06 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Mon, 8 Dec 2025 17:31:38 +0800 Subject: [PATCH 01/31] erofs: Use %pe format specifier for error pointers %pe will print a symbolic error name (e.g., -ENOMEM), as opposed to the raw errno (e.g., -12) produced by PTR_ERR(). Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3d31f7840ca0..70e1597dec8a 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1324,8 +1324,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, bool eio) GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); if (IS_ERR(reason)) { - erofs_err(be->sb, "failed to decompress (%s) %ld @ pa %llu size %u => %u", - alg->name, PTR_ERR(reason), pcl->pos, + erofs_err(be->sb, "failed to decompress (%s) %pe @ pa %llu size %u => %u", + alg->name, reason, pcl->pos, pcl->pclustersize, pcl->length); err = PTR_ERR(reason); } else if (unlikely(reason)) { From 0cc7d0c926cc80151bdf803f3cb6f7476f648d73 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Tue, 16 Dec 2025 16:21:41 +0800 Subject: [PATCH 02/31] erofs: make z_erofs_crypto[] static Reduce the scope of 'z_erofs_crypto[]' that is not used outside of 'decompressor_crypto.c'. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512102025.4mWeBSsf-lkp@intel.com/ Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor_crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c index 5ef6f71d3b7f..77c6bd535df3 100644 --- a/fs/erofs/decompressor_crypto.c +++ b/fs/erofs/decompressor_crypto.c @@ -62,7 +62,7 @@ struct z_erofs_crypto_engine { struct crypto_acomp *tfm; }; -struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = { +static struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = { [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) { {}, }, From 43ac93b5432c4aa826a19be95737af53c0f5c1e1 Mon Sep 17 00:00:00 2001 From: Yuwen Chen Date: Thu, 18 Dec 2025 12:19:52 +0800 Subject: [PATCH 03/31] erofs: simplify the code using for_each_set_bit When mounting the EROFS file system, it is necessary to check the available compression algorithms. At this time, the for_each_set_bit function can be used to simplify the code logic. 
Signed-off-by: Yuwen Chen Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index d5d090276391..15b464a58993 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -452,7 +452,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int algs, alg; + unsigned long algs, alg; erofs_off_t offset; int size, ret = 0; @@ -461,33 +461,30 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } - sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); - if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { - erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", - sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + algs = le16_to_cpu(dsb->u1.available_compr_algs); + sbi->available_compr_algs = algs; + if (algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "unidentified algorithms %lx, please upgrade kernel", + algs & ~Z_EROFS_ALL_COMPR_ALGS); return -EOPNOTSUPP; } (void)erofs_init_metabuf(&buf, sb, false); offset = EROFS_SUPER_OFFSET + sbi->sb_size; - alg = 0; - for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + for_each_set_bit(alg, &algs, Z_EROFS_COMPRESSION_MAX) { const struct z_erofs_decompressor *dec = z_erofs_decomp[alg]; void *data; - if (!(algs & 1)) - continue; - data = erofs_read_metadata(sb, &buf, &offset, &size); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } - if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) { + if (dec && dec->config) { ret = dec->config(sb, dsb, data, size); } else { - erofs_err(sb, "algorithm %d isn't enabled on this kernel", + erofs_err(sb, "algorithm %ld isn't enabled on this 
kernel", alg); ret = -EOPNOTSUPP; } From 48df6d1bc9d5e8d2d778c39d952c3d6cc39e5c73 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 19 Dec 2025 14:43:36 +0800 Subject: [PATCH 04/31] erofs: improve LZ4 error strings Just like what was done for other algorithms, let's propagate detailed error reasons for LZ4 instead of just -EFSCORRUPTED to users: "corrupted compressed data": the compressed data is malformed or destination buffer is not large enough "unexpected end of stream": the compressed stream ends normally, but without producing enough decompressed data. "compressed data start not found": can be returned by z_erofs_fixup_insize(). Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 42 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 15b464a58993..e9d799a03a91 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -195,26 +195,25 @@ const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, return NULL; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst) +static const char *__z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + u8 *dst) { - bool support_0padding = false, may_inplace = false; + bool zeropadded = erofs_sb_has_zero_padding(EROFS_SB(rq->sb)); + bool may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; const char *reason; int ret, maptype; - DBG_BUGON(*rq->in == NULL); headpage = kmap_local_page(*rq->in); - /* LZ4 decompression inplace is only safe if zero_padding is enabled */ - if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { - support_0padding = true; + if (zeropadded) { reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (reason) { kunmap_local(headpage); - return IS_ERR(reason) ? 
PTR_ERR(reason) : -EFSCORRUPTED; + return reason; } may_inplace = !((rq->pageofs_in + rq->inputsize) & (rq->sb->s_blocksize - 1)); @@ -224,26 +223,24 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) - return PTR_ERR(src); + return ERR_CAST(src); out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ - if (rq->partial_decoding || !support_0padding) + if (rq->partial_decoding || !zeropadded) ret = LZ4_decompress_safe_partial(src + inputmargin, out, rq->inputsize, rq->outputsize, rq->outputsize); else ret = LZ4_decompress_safe(src + inputmargin, out, rq->inputsize, rq->outputsize); + if (ret == rq->outputsize) + reason = NULL; + else if (ret < 0) + reason = "corrupted compressed data"; + else + reason = "unexpected end of stream"; - if (ret != rq->outputsize) { - if (ret >= 0) - memset(out + ret, 0, rq->outputsize - ret); - ret = -EFSCORRUPTED; - } else { - ret = 0; - } - - if (maptype == 0) { + if (!maptype) { kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, rq->inpages); @@ -251,15 +248,16 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst z_erofs_put_gbuf(src); } else if (maptype != 3) { DBG_BUGON(1); - return -EFAULT; + return ERR_PTR(-EFAULT); } - return ret; + return reason; } static const char *z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { unsigned int dst_maptype; + const char *reason; void *dst; int ret; @@ -283,12 +281,12 @@ static const char *z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dst_maptype = 2; } } - ret = z_erofs_lz4_decompress_mem(rq, dst); + reason = __z_erofs_lz4_decompress(rq, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, rq->outpages); - return ERR_PTR(ret); + return reason; } static const char *z_erofs_transform_plain(struct 
z_erofs_decompress_req *rq, From 9aa64b62a73cbca226c0144dcf3cdf97294e0641 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 26 Dec 2025 14:09:45 +0800 Subject: [PATCH 05/31] erofs: avoid noisy messages for transient -ENOMEM EROFS may allocate temporary pages using GFP_NOWAIT | __GFP_NORETRY when pcl->besteffort is off (e.g., for readahead requests). If the allocation fails, the original request will fall back to synchronous read, so the failure is transient. Such fallback can frequently happen in low memory scenarios, but since these failures are expected and temporary, avoid printing error messages like below: [ 7425.184264] erofs (device sr0): failed to decompress (lz4) -ENOMEM @ pa 148447232 size 28672 => 26788 [ 7426.244267] erofs (device sr0): failed to decompress (lz4) -ENOMEM @ pa 149422080 size 28672 => 15903 [ 7426.245508] erofs (device sr0): failed to decompress (lz4) -ENOMEM @ pa 138440704 size 28672 => 39294 ... [ 7504.258373] erofs (device sr0): failed to decompress (lz4) -ENOMEM @ pa 93581312 size 20480 => 47366 Fixes: 831faabed812 ("erofs: improve decompression error reporting") Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 70e1597dec8a..c62908f1ce47 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1324,9 +1324,10 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, bool eio) GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); if (IS_ERR(reason)) { - erofs_err(be->sb, "failed to decompress (%s) %pe @ pa %llu size %u => %u", - alg->name, reason, pcl->pos, - pcl->pclustersize, pcl->length); + if (pcl->besteffort || reason != ERR_PTR(-ENOMEM)) + erofs_err(be->sb, "failed to decompress (%s) %pe @ pa %llu size %u => %u", + alg->name, reason, pcl->pos, + pcl->pclustersize, pcl->length); err = PTR_ERR(reason); } else if (unlikely(reason)) { erofs_err(be->sb, "failed to decompress (%s) %s @ pa %llu size %u 
=> %u", From 643575d5a4f24b23b0c54aa20aa74a4abed8ff5e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Dec 2025 17:29:46 +0800 Subject: [PATCH 06/31] erofs: fix incorrect early exits for invalid metabox-enabled images Crafted EROFS images with metadata compression enabled can trigger incorrect early returns, leading to folio reference leaks. However, this does not cause system crashes or other severe issues. Fixes: 414091322c63 ("erofs: implement metadata compression") Cc: stable@kernel.org Reviewed-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 5136cda5972a..b54083128e0f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -330,12 +330,13 @@ static int erofs_read_superblock(struct super_block *sb) } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); if (erofs_sb_has_metabox(sbi)) { + ret = -EFSCORRUPTED; if (sbi->sb_size <= offsetof(struct erofs_super_block, metabox_nid)) - return -EFSCORRUPTED; + goto out; sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid); if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT)) - return -EFSCORRUPTED; /* self-loop detection */ + goto out; /* self-loop detection */ } sbi->inos = le64_to_cpu(dsb->inos); From 3afa4da38802a4cba1c23848a32284e7e57b831b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Dec 2025 17:29:47 +0800 Subject: [PATCH 07/31] erofs: fix incorrect early exits in volume label handling Crafted EROFS images containing valid volume labels can trigger incorrect early returns, leading to folio reference leaks. However, this does not cause system crashes or other severe issues. 
Fixes: 1cf12c717741 ("erofs: Add support for FS_IOC_GETFSLABEL") Cc: stable@kernel.org Reviewed-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index b54083128e0f..ee37628ec99f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -347,8 +347,10 @@ static int erofs_read_superblock(struct super_block *sb) if (dsb->volume_name[0]) { sbi->volume_name = kstrndup(dsb->volume_name, sizeof(dsb->volume_name), GFP_KERNEL); - if (!sbi->volume_name) - return -ENOMEM; + if (!sbi->volume_name) { + ret = -ENOMEM; + goto out; + } } /* parse on-disk compression configurations */ From 09225312f2dbba4f084a16e544662f33f4dd035b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Dec 2025 17:29:48 +0800 Subject: [PATCH 08/31] erofs: unexport erofs_getxattr() No external users other than those in xattr.c. Reviewed-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 108 +++++++++++++++++++++++------------------------ fs/erofs/xattr.h | 7 --- 2 files changed, 54 insertions(+), 61 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 396536d9a862..972941ecb71c 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -125,58 +125,6 @@ out_unlock: return ret; } -static bool erofs_xattr_user_list(struct dentry *dentry) -{ - return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); -} - -static bool erofs_xattr_trusted_list(struct dentry *dentry) -{ - return capable(CAP_SYS_ADMIN); -} - -static int erofs_xattr_generic_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - if (handler->flags == EROFS_XATTR_INDEX_USER && - !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) - return -EOPNOTSUPP; - - return erofs_getxattr(inode, handler->flags, name, buffer, size); -} - -const struct xattr_handler erofs_xattr_user_handler = { - 
.prefix = XATTR_USER_PREFIX, - .flags = EROFS_XATTR_INDEX_USER, - .list = erofs_xattr_user_list, - .get = erofs_xattr_generic_get, -}; - -const struct xattr_handler erofs_xattr_trusted_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .flags = EROFS_XATTR_INDEX_TRUSTED, - .list = erofs_xattr_trusted_list, - .get = erofs_xattr_generic_get, -}; - -#ifdef CONFIG_EROFS_FS_SECURITY -const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .flags = EROFS_XATTR_INDEX_SECURITY, - .get = erofs_xattr_generic_get, -}; -#endif - -const struct xattr_handler * const erofs_xattr_handlers[] = { - &erofs_xattr_user_handler, - &erofs_xattr_trusted_handler, -#ifdef CONFIG_EROFS_FS_SECURITY - &erofs_xattr_security_handler, -#endif - NULL, -}; - static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, unsigned int len) { @@ -391,8 +339,8 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, return i ? ret : -ENODATA; } -int erofs_getxattr(struct inode *inode, int index, const char *name, - void *buffer, size_t buffer_size) +static int erofs_getxattr(struct inode *inode, int index, const char *name, + void *buffer, size_t buffer_size) { int ret; unsigned int hashbit; @@ -462,6 +410,58 @@ ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return ret ? 
ret : it.buffer_ofs; } +static bool erofs_xattr_user_list(struct dentry *dentry) +{ + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); +} + +static bool erofs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static int erofs_xattr_generic_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) +{ + if (handler->flags == EROFS_XATTR_INDEX_USER && + !test_opt(&EROFS_I_SB(inode)->opt, XATTR_USER)) + return -EOPNOTSUPP; + + return erofs_getxattr(inode, handler->flags, name, buffer, size); +} + +const struct xattr_handler erofs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = EROFS_XATTR_INDEX_USER, + .list = erofs_xattr_user_list, + .get = erofs_xattr_generic_get, +}; + +const struct xattr_handler erofs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = EROFS_XATTR_INDEX_TRUSTED, + .list = erofs_xattr_trusted_list, + .get = erofs_xattr_generic_get, +}; + +#ifdef CONFIG_EROFS_FS_SECURITY +const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = EROFS_XATTR_INDEX_SECURITY, + .get = erofs_xattr_generic_get, +}; +#endif + +const struct xattr_handler * const erofs_xattr_handlers[] = { + &erofs_xattr_user_handler, + &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + &erofs_xattr_security_handler, +#endif + NULL, +}; + void erofs_xattr_prefixes_cleanup(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 6317caa8413e..ee1d8c310d97 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -45,17 +45,10 @@ extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); -int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t 
erofs_listxattr(struct dentry *, char *, size_t); #else static inline int erofs_xattr_prefixes_init(struct super_block *sb) { return 0; } static inline void erofs_xattr_prefixes_cleanup(struct super_block *sb) {} -static inline int erofs_getxattr(struct inode *inode, int index, - const char *name, void *buffer, - size_t buffer_size) -{ - return -EOPNOTSUPP; -} #define erofs_listxattr (NULL) #define erofs_xattr_handlers (NULL) From 7ed7a713f1267d14e737a935b662bffa4c667dc9 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 31 Dec 2025 12:57:36 +0800 Subject: [PATCH 09/31] erofs: unexport erofs_xattr_prefix() It can be simply in xattr.c due to no external users. Reviewed-by: Hongbo Li Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 31 ++++++++++++++++++++++++++++--- fs/erofs/xattr.h | 30 ------------------------------ 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 972941ecb71c..f8668157162f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -25,6 +25,8 @@ struct erofs_xattr_iter { struct dentry *dentry; }; +static const char *erofs_xattr_prefix(unsigned int idx, struct dentry *dentry); + static int erofs_init_inode_xattrs(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -431,14 +433,14 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return erofs_getxattr(inode, handler->flags, name, buffer, size); } -const struct xattr_handler erofs_xattr_user_handler = { +static const struct xattr_handler erofs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = EROFS_XATTR_INDEX_USER, .list = erofs_xattr_user_list, .get = erofs_xattr_generic_get, }; -const struct xattr_handler erofs_xattr_trusted_handler = { +static const struct xattr_handler erofs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = EROFS_XATTR_INDEX_TRUSTED, .list = erofs_xattr_trusted_list, @@ -446,7 +448,7 @@ const struct xattr_handler 
erofs_xattr_trusted_handler = { }; #ifdef CONFIG_EROFS_FS_SECURITY -const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { +static const struct xattr_handler erofs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = EROFS_XATTR_INDEX_SECURITY, .get = erofs_xattr_generic_get, @@ -462,6 +464,29 @@ const struct xattr_handler * const erofs_xattr_handlers[] = { NULL, }; +static const char *erofs_xattr_prefix(unsigned int idx, struct dentry *dentry) +{ + static const struct xattr_handler * const xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, +#ifdef CONFIG_EROFS_FS_POSIX_ACL + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, +#endif + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, +#ifdef CONFIG_EROFS_FS_SECURITY + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, +#endif + }; + const struct xattr_handler *handler = NULL; + + if (idx && idx < ARRAY_SIZE(xattr_handler_map)) { + handler = xattr_handler_map[idx]; + if (xattr_handler_can_list(handler, dentry)) + return xattr_prefix(handler); + } + return NULL; +} + void erofs_xattr_prefixes_cleanup(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index ee1d8c310d97..36f2667afc2d 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -11,36 +11,6 @@ #include #ifdef CONFIG_EROFS_FS_XATTR -extern const struct xattr_handler erofs_xattr_user_handler; -extern const struct xattr_handler erofs_xattr_trusted_handler; -extern const struct xattr_handler erofs_xattr_security_handler; - -static inline const char *erofs_xattr_prefix(unsigned int idx, - struct dentry *dentry) -{ - const struct xattr_handler *handler = NULL; - - static const struct xattr_handler * const xattr_handler_map[] = { - [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, -#ifdef CONFIG_EROFS_FS_POSIX_ACL - 
[EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, -#endif - [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, -#ifdef CONFIG_EROFS_FS_SECURITY - [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, -#endif - }; - - if (idx && idx < ARRAY_SIZE(xattr_handler_map)) - handler = xattr_handler_map[idx]; - - if (!xattr_handler_can_list(handler, dentry)) - return NULL; - - return xattr_prefix(handler); -} - extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); From 06e5c340941fba1c7a2da95ee64d9cae6154bd6e Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Mon, 29 Dec 2025 18:05:15 +0800 Subject: [PATCH 10/31] erofs: remove useless src in erofs_xattr_copy_to_buffer() Use it->kaddr directly. Signed-off-by: Ferry Meng Reviewed-by: Hongbo Li Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index f8668157162f..dad076ce0108 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -132,17 +132,15 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, { unsigned int slice, processed; struct super_block *sb = it->sb; - void *src; for (processed = 0; processed < len; processed += slice) { it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); - src = it->kaddr; slice = min_t(unsigned int, sb->s_blocksize - erofs_blkoff(sb, it->pos), len - processed); - memcpy(it->buffer + it->buffer_ofs, src, slice); + memcpy(it->buffer + it->buffer_ofs, it->kaddr, slice); it->buffer_ofs += slice; it->pos += slice; } From cc831ab33644088c1eef78936de24701014d520a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 12 Jan 2026 11:43:30 +0800 Subject: [PATCH 11/31] erofs: tidy up synchronous decompression - Get rid of 
`sbi->opt.max_sync_decompress_pages` since it's fixed as 3 all the time; - Add Z_EROFS_MAX_SYNC_DECOMPRESS_BYTES in bytes instead of in pages, since for non-4K pages, 3-page limitation makes no sense; - Move `sync_decompress` to sbi to avoid unexpected remount impact; - Fold z_erofs_is_sync_decompress() into its caller; - Better description of sysfs entry `sync_decompress`. Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- Documentation/ABI/testing/sysfs-fs-erofs | 14 ++++++---- fs/erofs/internal.h | 5 +--- fs/erofs/super.c | 3 +- fs/erofs/sysfs.c | 2 +- fs/erofs/zdata.c | 35 +++++++++--------------- 5 files changed, 25 insertions(+), 34 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs index 76d9808ed581..b9243c7f28d7 100644 --- a/Documentation/ABI/testing/sysfs-fs-erofs +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -10,12 +10,16 @@ Description: Shows all enabled kernel features. What: /sys/fs/erofs//sync_decompress Date: November 2021 Contact: "Huang Jianan" -Description: Control strategy of sync decompression: +Description: Control strategy of synchronous decompression. Synchronous + decompression tries to decompress in the reader thread for + synchronous reads and small asynchronous reads (<= 12 KiB): - - 0 (default, auto): enable for readpage, and enable for - readahead on atomic contexts only. - - 1 (force on): enable for readpage and readahead. - - 2 (force off): disable for all situations. + - 0 (auto, default): apply to synchronous reads only, but will + switch to 1 (force on) if any decompression + request is detected in atomic contexts; + - 1 (force on): apply to synchronous reads and small + asynchronous reads; + - 2 (force off): disable synchronous decompression completely. 
What: /sys/fs/erofs//drop_caches Date: November 2024 diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index f7f622836198..87edbb4366d1 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -59,10 +59,6 @@ enum { struct erofs_mount_opts { /* current strategy of how to use managed cache */ unsigned char cache_strategy; - /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */ - unsigned int sync_decompress; - /* threshold for decompression synchronously */ - unsigned int max_sync_decompress_pages; unsigned int mount_opt; }; @@ -116,6 +112,7 @@ struct erofs_sb_info { /* managed XArray arranged in physical block number */ struct xarray managed_pslots; + unsigned int sync_decompress; /* strategy for sync decompression */ unsigned int shrinker_run_no; u16 available_compr_algs; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ee37628ec99f..984afbda1c18 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -375,8 +375,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi) { #ifdef CONFIG_EROFS_FS_ZIP sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; - sbi->opt.max_sync_decompress_pages = 3; - sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; + sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(&sbi->opt, XATTR_USER); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 1e0658a1d95b..86b22b9f0c19 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -59,7 +59,7 @@ static struct erofs_attr erofs_attr_##_name = { \ #define ATTR_LIST(name) (&erofs_attr_##name.attr) #ifdef CONFIG_EROFS_FS_ZIP -EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); +EROFS_ATTR_RW_UI(sync_decompress, erofs_sb_info); EROFS_ATTR_FUNC(drop_caches, 0200); #endif #ifdef CONFIG_EROFS_FS_ZIP_ACCEL diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c62908f1ce47..5860ea6dbc60 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -9,6 +9,7 @@ #include #include +#define 
Z_EROFS_MAX_SYNC_DECOMPRESS_BYTES 12288 #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 @@ -1095,21 +1096,6 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, return err; } -static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, - unsigned int readahead_pages) -{ - /* auto: enable for read_folio, disable for readahead */ - if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && - !readahead_pages) - return true; - - if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && - (readahead_pages <= sbi->opt.max_sync_decompress_pages)) - return true; - - return false; -} - static bool z_erofs_page_is_invalidated(struct page *page) { return !page_folio(page)->mapping && !z_erofs_is_shortlived_page(page); @@ -1484,9 +1470,9 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, #else queue_work(z_erofs_workqueue, &io->u.work); #endif - /* enable sync decompression for readahead */ - if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) - sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + /* See `sync_decompress` in sysfs-fs-erofs for more details */ + if (sbi->sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -1803,16 +1789,21 @@ drain_io: z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } -static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rapages) +static int z_erofs_runqueue(struct z_erofs_frontend *f, unsigned int rabytes) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; struct erofs_sb_info *sbi = EROFS_I_SB(f->inode); - bool force_fg = z_erofs_is_sync_decompress(sbi, rapages); + int syncmode = sbi->sync_decompress; + bool force_fg; int err; + force_fg = (syncmode == EROFS_SYNC_DECOMPRESS_AUTO && !rabytes) || + (syncmode == EROFS_SYNC_DECOMPRESS_FORCE_ON && + (rabytes <= 
Z_EROFS_MAX_SYNC_DECOMPRESS_BYTES)); + if (f->head == Z_EROFS_PCLUSTER_TAIL) return 0; - z_erofs_submit_queue(f, io, &force_fg, !!rapages); + z_erofs_submit_queue(f, io, &force_fg, !!rabytes); /* handle bypass queue (no i/o pclusters) immediately */ err = z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); @@ -1933,7 +1924,7 @@ static void z_erofs_readahead(struct readahead_control *rac) z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); - (void)z_erofs_runqueue(&f, nrpages); + (void)z_erofs_runqueue(&f, nrpages << PAGE_SHIFT); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } From a221a737406501c9036d00667095d61317d50d7f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 22 Jan 2026 10:52:52 +0800 Subject: [PATCH 12/31] erofs: add missing documentation about `directio` mount option Document the `directio` mount option for file-backed mounts, because recent users need this and this mount option has been available since commit 6422cde1b0d5 ("erofs: use buffered I/O for file-backed mounts by default") without proper documentation. Reported-by: Yuxuan Liu Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 08194f194b94..96101c3fe53a 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -125,6 +125,8 @@ dax={always,never} Use direct access (no page cache). See Documentation/filesystems/dax.rst. dax A legacy option which is an alias for ``dax=always``. device=%s Specify a path to an extra device to be used together. +directio (For file-backed mounts) Use direct I/O to access backing + files, and asynchronous I/O will be enabled if supported. fsid=%s Specify a filesystem image ID for Fscache back-end. domain_id=%s Specify a domain ID in fscache mode so that different images with the same blobs under a given domain ID can share storage. 
From 58d081ea4eab924b9e5a55cbb151bde847068c8d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 22 Jan 2026 22:36:33 +0800 Subject: [PATCH 13/31] erofs: tidy up erofs_init_inode_xattrs() Mainly get rid of the use of `struct erofs_xattr_iter`, as it is no longer needed now that meta buffers are used. This also simplifies the code and uses an early return when there are no xattrs. Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 62 +++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index dad076ce0108..512b998bdfff 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -29,13 +29,18 @@ static const char *erofs_xattr_prefix(unsigned int idx, struct dentry *dentry); static int erofs_init_inode_xattrs(struct inode *inode) { - struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_xattr_iter it; - unsigned int i; - struct erofs_xattr_ibody_header *ih; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + struct erofs_inode *vi = EROFS_I(inode); struct super_block *sb = inode->i_sb; + const struct erofs_xattr_ibody_header *ih; + __le32 *xattr_id; + erofs_off_t pos; + unsigned int i; int ret = 0; + if (!vi->xattr_isize) + return -ENODATA; + /* the most case is that xattrs of this inode are initialized. */ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { /* @@ -45,7 +50,6 @@ static int erofs_init_inode_xattrs(struct inode *inode) smp_mb(); return 0; } - if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; @@ -62,66 +66,50 @@ static int erofs_init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). 
*/ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(sb, - "xattr_isize %d of nid %llu is not supported yet", + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { - if (vi->xattr_isize) { - erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); - DBG_BUGON(1); - ret = -EFSCORRUPTED; - goto out_unlock; /* xattr ondisk layout error */ - } - ret = -ENODATA; + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); + DBG_BUGON(1); + ret = -EFSCORRUPTED; goto out_unlock; } - it.buf = __EROFS_BUF_INITIALIZER; - ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode)); - if (ret) - goto out_unlock; - it.pos = erofs_iloc(inode) + vi->inode_isize; - - /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, it.pos, true); - if (IS_ERR(it.kaddr)) { - ret = PTR_ERR(it.kaddr); + pos = erofs_iloc(inode) + vi->inode_isize; + ih = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode)); + if (IS_ERR(ih)) { + ret = PTR_ERR(ih); goto out_unlock; } - - ih = it.kaddr; vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); if (!vi->xattr_shared_xattrs) { - erofs_put_metabuf(&it.buf); + erofs_put_metabuf(&buf); ret = -ENOMEM; goto out_unlock; } - /* let's skip ibody header */ - it.pos += sizeof(struct erofs_xattr_ibody_header); - + /* skip the ibody header and read the shared xattr array */ + pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, it.pos, true); - if (IS_ERR(it.kaddr)) { + xattr_id = erofs_bread(&buf, pos + i * sizeof(__le32), true); + if (IS_ERR(xattr_id)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; - 
ret = PTR_ERR(it.kaddr); + ret = PTR_ERR(xattr_id); goto out_unlock; } - vi->xattr_shared_xattrs[i] = le32_to_cpu(*(__le32 *)it.kaddr); - it.pos += sizeof(__le32); + vi->xattr_shared_xattrs[i] = le32_to_cpu(*xattr_id); } - erofs_put_metabuf(&it.buf); + erofs_put_metabuf(&buf); /* paired with smp_mb() at the beginning of the function. */ smp_mb(); set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); - out_unlock: clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); return ret; From 37364497048c5081d3bfa424638cc91a7a7644e2 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 23 Jan 2026 01:31:23 +0000 Subject: [PATCH 14/31] fs: Export alloc_empty_backing_file There is no need to open nonexistent real files if backing files couldn't be backed by real files (e.g., EROFS page cache sharing doesn't need typical real files to open again). Therefore, we export the alloc_empty_backing_file() helper, allowing filesystems to dynamically set the backing file without real file open. This is particularly useful for obtaining the correct @path and @inode when calling file_user_path() and file_user_inode(). 
Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Acked-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Gao Xiang --- fs/file_table.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/file_table.c b/fs/file_table.c index cd4a3db4659a..476edfe7d8f5 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -308,6 +308,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT; return &ff->file; } +EXPORT_SYMBOL_GPL(alloc_empty_backing_file); /** * file_init_path - initialize a 'struct file' based on path From 4340ca47c35b2be0b0945a3c2c9c9ee058520b96 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 23 Jan 2026 01:31:24 +0000 Subject: [PATCH 15/31] erofs: decouple `struct erofs_anon_fs_type` - Move the `struct erofs_anon_fs_type` to super.c and expose it in preparation for the upcoming page cache share feature; - Remove the `.owner` field, as they are all internal mounts and fully managed by EROFS. Retaining `.owner` would unnecessarily increment module reference counts, preventing the EROFS kernel module from being unloaded. Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 13 ------------- fs/erofs/internal.h | 2 ++ fs/erofs/super.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 7a346e20f7b7..f4937b025038 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -3,7 +3,6 @@ * Copyright (C) 2022, Alibaba Cloud * Copyright (C) 2022, Bytedance Inc. All rights reserved. */ -#include #include #include "internal.h" @@ -13,18 +12,6 @@ static LIST_HEAD(erofs_domain_list); static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; -static int erofs_anon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type erofs_anon_fs_type = { - .owner = THIS_MODULE, - .name = "pseudo_erofs", - .init_fs_context = erofs_anon_init_fs_context, - .kill_sb = kill_anon_super, -}; - struct erofs_fscache_io { struct netfs_cache_resources cres; struct iov_iter iter; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 87edbb4366d1..c508c96ce142 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -185,6 +185,8 @@ static inline bool erofs_is_fileio_mode(struct erofs_sb_info *sbi) return IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && sbi->dif0.file; } +extern struct file_system_type erofs_anon_fs_type; + static inline bool erofs_is_fscache_mode(struct super_block *sb) { return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 984afbda1c18..8940e8ff158a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "xattr.h" #define CREATE_TRACE_POINTS @@ -945,6 +946,19 @@ static struct file_system_type erofs_fs_type = { }; MODULE_ALIAS_FS("erofs"); +#if defined(CONFIG_EROFS_FS_ONDEMAND) +static int erofs_anon_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM; +} + +struct file_system_type erofs_anon_fs_type = { + .name = "pseudo_erofs", + .init_fs_context = erofs_anon_init_fs_context, + .kill_sb = kill_anon_super, +}; +#endif + static int __init erofs_module_init(void) { int err; From e0bf7d1c074dc4252223ae897560345ccc24100d Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Fri, 23 Jan 2026 01:31:25 +0000 Subject: [PATCH 16/31] erofs: support user-defined fingerprint name When creating the EROFS image, users can specify the fingerprint name. This is to prepare for the upcoming inode page cache share. 
Signed-off-by: Hongzhen Luo Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/Kconfig | 9 +++++++++ fs/erofs/erofs_fs.h | 5 +++-- fs/erofs/internal.h | 2 ++ fs/erofs/super.c | 9 +++++++++ fs/erofs/xattr.c | 13 +++++++++++++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index d81f3318417d..b71f2a8074fe 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -194,3 +194,12 @@ config EROFS_FS_PCPU_KTHREAD_HIPRI at higher priority. If unsure, say N. + +config EROFS_FS_PAGE_CACHE_SHARE + bool "EROFS page cache share support (experimental)" + depends on EROFS_FS && EROFS_FS_XATTR && !EROFS_FS_ONDEMAND + help + This enables page cache sharing among inodes with identical + content fingerprints on the same machine. + + If unsure, say N. diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index e24268acdd62..b30a74d307c5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -17,7 +17,7 @@ #define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 #define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008 #define EROFS_FEATURE_COMPAT_PLAIN_XATTR_PFX 0x00000010 - +#define EROFS_FEATURE_COMPAT_ISHARE_XATTRS 0x00000020 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -83,7 +83,8 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved[3]; + __u8 ishare_xattr_prefix_id; + __u8 reserved[2]; __le32 build_time; /* seconds added to epoch for mkfs time */ __le64 rootnid_8b; /* (48BIT on) nid of root directory */ __le64 reserved2; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index c508c96ce142..ae4ade00b578 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -131,6 +131,7 @@ struct erofs_sb_info { u32 xattr_blkaddr; u32 xattr_prefix_start; u8 xattr_prefix_count; + u8 
ishare_xattr_prefix_id; struct erofs_xattr_prefix_item *xattr_prefixes; unsigned int xattr_filter_reserved; #endif @@ -235,6 +236,7 @@ EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX) EROFS_FEATURE_FUNCS(plain_xattr_pfx, compat, COMPAT_PLAIN_XATTR_PFX) +EROFS_FEATURE_FUNCS(ishare_xattrs, compat, COMPAT_ISHARE_XATTRS) static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) { diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 8940e8ff158a..c9ea70d600ad 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -320,6 +320,15 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; + if (erofs_sb_has_ishare_xattrs(sbi)) { + if (dsb->ishare_xattr_prefix_id >= sbi->xattr_prefix_count) { + erofs_err(sb, "invalid ishare xattr prefix id %u", + dsb->ishare_xattr_prefix_id); + ret = -EFSCORRUPTED; + goto out; + } + sbi->ishare_xattr_prefix_id = dsb->ishare_xattr_prefix_id; + } #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 512b998bdfff..732e3b3379d5 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -530,6 +530,19 @@ int erofs_xattr_prefixes_init(struct super_block *sb) } erofs_put_metabuf(&buf); + if (!ret && erofs_sb_has_ishare_xattrs(sbi)) { + struct erofs_xattr_prefix_item *pf = pfs + sbi->ishare_xattr_prefix_id; + struct erofs_xattr_long_prefix *newpfx; + + newpfx = krealloc(pf->prefix, + sizeof(*newpfx) + pf->infix_len + 1, GFP_KERNEL); + if (newpfx) { + newpfx->infix[pf->infix_len] = '\0'; + pf->prefix = newpfx; + } else { + ret = -ENOMEM; + } + } sbi->xattr_prefixes = pfs; if (ret) 
erofs_xattr_prefixes_cleanup(sb); From 78331814a502b6b27ccf57dcd09801d2b48709e1 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 23 Jan 2026 07:52:39 +0000 Subject: [PATCH 17/31] erofs: add erofs_inode_set_aops helper to set the aops Add erofs_inode_set_aops helper to set the inode->i_mapping->a_ops and use IS_ENABLED to make it cleaner. Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 24 +----------------------- fs/erofs/internal.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index bce98c845a18..202cbbb4eada 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -203,7 +203,6 @@ err_out: static int erofs_fill_inode(struct inode *inode) { - struct erofs_inode *vi = EROFS_I(inode); int err; trace_erofs_fill_inode(inode); @@ -235,28 +234,7 @@ static int erofs_fill_inode(struct inode *inode) } mapping_set_large_folios(inode->i_mapping); - if (erofs_inode_is_data_compressed(vi->datalayout)) { -#ifdef CONFIG_EROFS_FS_ZIP - DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, - erofs_info, inode->i_sb, - "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); - inode->i_mapping->a_ops = &z_erofs_aops; -#else - err = -EOPNOTSUPP; -#endif - } else { - inode->i_mapping->a_ops = &erofs_aops; -#ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; -#endif -#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE - if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) - inode->i_mapping->a_ops = &erofs_fileio_aops; -#endif - } - - return err; + return erofs_inode_set_aops(inode, inode, false); } /* diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index ae4ade00b578..aea28c2fe274 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -452,6 +452,28 @@ static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } +static inline int erofs_inode_set_aops(struct inode *inode, + struct inode *realinode, bool no_fscache) +{ + if (erofs_inode_is_data_compressed(EROFS_I(realinode)->datalayout)) { + if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + return -EOPNOTSUPP; + DO_ONCE_LITE_IF(realinode->i_blkbits != PAGE_SHIFT, + erofs_info, realinode->i_sb, + "EXPERIMENTAL EROFS subpage compressed block support in use. 
Use at your own risk!"); + inode->i_mapping->a_ops = &z_erofs_aops; + return 0; + } + inode->i_mapping->a_ops = &erofs_aops; + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !no_fscache && + erofs_is_fscache_mode(realinode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) && + erofs_is_fileio_mode(EROFS_SB(realinode->i_sb))) + inode->i_mapping->a_ops = &erofs_fileio_aops; + return 0; +} + int erofs_register_sysfs(struct super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); From e77762e8966c9466a84b22680ad04880dab11914 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 23 Jan 2026 01:31:27 +0000 Subject: [PATCH 18/31] erofs: using domain_id in the safer way In both the existing fscache use case and the upcoming page cache sharing case, the `domain_id` should be protected as sensitive information, so we use the safer helpers to allocate, free and display domain_id. Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 5 +++-- fs/erofs/fscache.c | 4 ++-- fs/erofs/super.c | 10 ++++------ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 96101c3fe53a..9f98d18c39f6 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -128,8 +128,9 @@ device=%s Specify a path to an extra device to be used together. directio (For file-backed mounts) Use direct I/O to access backing files, and asynchronous I/O will be enabled if supported. fsid=%s Specify a filesystem image ID for Fscache back-end. -domain_id=%s Specify a domain ID in fscache mode so that different images - with the same blobs under a given domain ID can share storage. 
+domain_id=%s Specify a trusted domain ID for fscache mode so that + different images with the same blobs, identified by blob IDs, + can share storage within the same trusted domain. fsoffset=%llu Specify block-aligned filesystem offset for the primary device. =================== ========================================================= diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index f4937b025038..a2cc0f3fa9d0 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -379,7 +379,7 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) } fscache_relinquish_volume(domain->volume, NULL, false); mutex_unlock(&erofs_domain_list_lock); - kfree(domain->domain_id); + kfree_sensitive(domain->domain_id); kfree(domain); return; } @@ -446,7 +446,7 @@ static int erofs_fscache_init_domain(struct super_block *sb) sbi->domain = domain; return 0; out: - kfree(domain->domain_id); + kfree_sensitive(domain->domain_id); kfree(domain); return err; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index c9ea70d600ad..13876fa597ee 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -527,10 +527,8 @@ static int erofs_fc_parse_param(struct fs_context *fc, return -ENOMEM; break; case Opt_domain_id: - kfree(sbi->domain_id); - sbi->domain_id = kstrdup(param->string, GFP_KERNEL); - if (!sbi->domain_id) - return -ENOMEM; + kfree_sensitive(sbi->domain_id); + sbi->domain_id = no_free_ptr(param->string); break; #else case Opt_fsid: @@ -626,7 +624,7 @@ static void erofs_set_sysfs_name(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - if (sbi->domain_id) + if (sbi->domain_id && sbi->fsid) super_set_sysfs_name_generic(sb, "%s,%s", sbi->domain_id, sbi->fsid); else if (sbi->fsid) @@ -861,7 +859,7 @@ static void erofs_sb_free(struct erofs_sb_info *sbi) { erofs_free_dev_context(sbi->devs); kfree(sbi->fsid); - kfree(sbi->domain_id); + kfree_sensitive(sbi->domain_id); if (sbi->dif0.file) fput(sbi->dif0.file); kfree(sbi->volume_name); From 
5ef3208e3be50aa08b4e7a2832f34e16d42c08b3 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Fri, 23 Jan 2026 01:31:28 +0000 Subject: [PATCH 19/31] erofs: introduce the page cache share feature Currently, reading files with different paths (or names) but the same content will consume multiple copies of the page cache, even if the content of these page caches is the same. For example, reading identical files (e.g., *.so files) from two different minor versions of container images will cost multiple copies of the same page cache, since different containers have different mount points. Therefore, sharing the page cache for files with the same content can save memory. This introduces the page cache share feature in erofs. It allocates a shared inode and uses its page cache as the shared copy. Reads for files with identical content will ultimately be routed to the page cache of the shared inode. In this way, a single page cache satisfies multiple read requests for different files with the same contents. We introduce a new mount option `inode_share` to enable the page sharing mode during mounting. This option is used in conjunction with `domain_id` to share the page cache within the same trusted domain. Signed-off-by: Hongzhen Luo Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 5 + fs/erofs/Makefile | 1 + fs/erofs/internal.h | 31 ++++++ fs/erofs/ishare.c | 167 ++++++++++++++++++++++++++++ fs/erofs/super.c | 62 ++++++++++- fs/erofs/xattr.c | 34 ++++++ fs/erofs/xattr.h | 3 + 7 files changed, 301 insertions(+), 2 deletions(-) create mode 100644 fs/erofs/ishare.c diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index 9f98d18c39f6..af1df574e66c 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -131,7 +131,12 @@ fsid=%s Specify a filesystem image ID for Fscache back-end. 
domain_id=%s Specify a trusted domain ID for fscache mode so that different images with the same blobs, identified by blob IDs, can share storage within the same trusted domain. + Also used for different filesystems with inode page sharing + enabled to share page cache within the trusted domain. fsoffset=%llu Specify block-aligned filesystem offset for the primary device. +inode_share Enable inode page sharing for this filesystem. Inodes with + identical content within the same domain ID can share the + page cache. =================== ========================================================= Sysfs Entries diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 549abc424763..a80e1762b607 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -10,3 +10,4 @@ erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o +erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += ishare.o diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index aea28c2fe274..367a9a9f0542 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -176,6 +176,7 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 #define EROFS_MOUNT_DIRECT_IO 0x00000100 +#define EROFS_MOUNT_INODE_SHARE 0x00000200 #define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) @@ -266,6 +267,11 @@ static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid) /* default readahead size of directories */ #define EROFS_DIR_RA_BYTES 16384 +struct erofs_inode_fingerprint { + u8 *opaque; + int size; +}; + struct erofs_inode { erofs_nid_t nid; @@ -301,6 +307,18 @@ struct erofs_inode { }; #endif /* CONFIG_EROFS_FS_ZIP */ }; +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + struct list_head ishare_list; + union { + /* 
for each anon shared inode */ + struct { + struct erofs_inode_fingerprint fingerprint; + spinlock_t ishare_lock; + }; + /* for each real inode */ + struct inode *sharedinode; + }; +#endif /* the corresponding vfs inode */ struct inode vfs_inode; }; @@ -407,6 +425,7 @@ extern const struct inode_operations erofs_dir_iops; extern const struct file_operations erofs_file_fops; extern const struct file_operations erofs_dir_fops; +extern const struct file_operations erofs_ishare_fops; extern const struct iomap_ops z_erofs_iomap_report_ops; @@ -560,6 +579,18 @@ static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { static inline void erofs_fscache_submit_bio(struct bio *bio) {} #endif +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE +int __init erofs_init_ishare(void); +void erofs_exit_ishare(void); +bool erofs_ishare_fill_inode(struct inode *inode); +void erofs_ishare_free_inode(struct inode *inode); +#else +static inline int erofs_init_ishare(void) { return 0; } +static inline void erofs_exit_ishare(void) {} +static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; } +static inline void erofs_ishare_free_inode(struct inode *inode) {} +#endif + long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); long erofs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); diff --git a/fs/erofs/ishare.c b/fs/erofs/ishare.c new file mode 100644 index 000000000000..3d26b2826710 --- /dev/null +++ b/fs/erofs/ishare.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2024, Alibaba Cloud + */ +#include +#include +#include "internal.h" +#include "xattr.h" + +#include "../internal.h" + +static struct vfsmount *erofs_ishare_mnt; + +static int erofs_ishare_iget5_eq(struct inode *inode, void *data) +{ + struct erofs_inode_fingerprint *fp1 = &EROFS_I(inode)->fingerprint; + struct erofs_inode_fingerprint *fp2 = data; + + return fp1->size == fp2->size && + !memcmp(fp1->opaque, 
fp2->opaque, fp2->size); +} + +static int erofs_ishare_iget5_set(struct inode *inode, void *data) +{ + struct erofs_inode *vi = EROFS_I(inode); + + vi->fingerprint = *(struct erofs_inode_fingerprint *)data; + INIT_LIST_HEAD(&vi->ishare_list); + spin_lock_init(&vi->ishare_lock); + return 0; +} + +bool erofs_ishare_fill_inode(struct inode *inode) +{ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_fingerprint fp; + struct inode *sharedinode; + unsigned long hash; + + if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id)) + return false; + hash = xxh32(fp.opaque, fp.size, 0); + sharedinode = iget5_locked(erofs_ishare_mnt->mnt_sb, hash, + erofs_ishare_iget5_eq, erofs_ishare_iget5_set, + &fp); + if (!sharedinode) { + kfree(fp.opaque); + return false; + } + + if (inode_state_read_once(sharedinode) & I_NEW) { + if (erofs_inode_set_aops(sharedinode, inode, true)) { + iget_failed(sharedinode); + kfree(fp.opaque); + return false; + } + sharedinode->i_size = vi->vfs_inode.i_size; + unlock_new_inode(sharedinode); + } else { + kfree(fp.opaque); + if (sharedinode->i_size != vi->vfs_inode.i_size) { + _erofs_printk(inode->i_sb, KERN_WARNING + "size(%lld:%lld) not matches for the same fingerprint\n", + vi->vfs_inode.i_size, sharedinode->i_size); + iput(sharedinode); + return false; + } + } + vi->sharedinode = sharedinode; + INIT_LIST_HEAD(&vi->ishare_list); + spin_lock(&EROFS_I(sharedinode)->ishare_lock); + list_add(&vi->ishare_list, &EROFS_I(sharedinode)->ishare_list); + spin_unlock(&EROFS_I(sharedinode)->ishare_lock); + return true; +} + +void erofs_ishare_free_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct inode *sharedinode = vi->sharedinode; + + if (!sharedinode) + return; + spin_lock(&EROFS_I(sharedinode)->ishare_lock); + list_del(&vi->ishare_list); + spin_unlock(&EROFS_I(sharedinode)->ishare_lock); + iput(sharedinode); + vi->sharedinode = NULL; +} + +static int 
erofs_ishare_file_open(struct inode *inode, struct file *file) +{ + struct inode *sharedinode = EROFS_I(inode)->sharedinode; + struct file *realfile; + + if (file->f_flags & O_DIRECT) + return -EINVAL; + realfile = alloc_empty_backing_file(O_RDONLY|O_NOATIME, current_cred()); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + ihold(sharedinode); + realfile->f_op = &erofs_file_fops; + realfile->f_inode = sharedinode; + realfile->f_mapping = sharedinode->i_mapping; + path_get(&file->f_path); + backing_file_set_user_path(realfile, &file->f_path); + + file_ra_state_init(&realfile->f_ra, file->f_mapping); + realfile->private_data = EROFS_I(inode); + file->private_data = realfile; + return 0; +} + +static int erofs_ishare_file_release(struct inode *inode, struct file *file) +{ + struct file *realfile = file->private_data; + + iput(realfile->f_inode); + fput(realfile); + file->private_data = NULL; + return 0; +} + +static ssize_t erofs_ishare_file_read_iter(struct kiocb *iocb, + struct iov_iter *to) +{ + struct file *realfile = iocb->ki_filp->private_data; + struct kiocb dedup_iocb; + ssize_t nread; + + if (!iov_iter_count(to)) + return 0; + kiocb_clone(&dedup_iocb, iocb, realfile); + nread = filemap_read(&dedup_iocb, to, 0); + iocb->ki_pos = dedup_iocb.ki_pos; + return nread; +} + +static int erofs_ishare_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *realfile = file->private_data; + + vma_set_file(vma, realfile); + return generic_file_readonly_mmap(file, vma); +} + +const struct file_operations erofs_ishare_fops = { + .open = erofs_ishare_file_open, + .llseek = generic_file_llseek, + .read_iter = erofs_ishare_file_read_iter, + .mmap = erofs_ishare_mmap, + .release = erofs_ishare_file_release, + .get_unmapped_area = thp_get_unmapped_area, + .splice_read = filemap_splice_read, +}; + +int __init erofs_init_ishare(void) +{ + erofs_ishare_mnt = kern_mount(&erofs_anon_fs_type); + return PTR_ERR_OR_ZERO(erofs_ishare_mnt); +} + +void 
erofs_exit_ishare(void) +{ + kern_unmount(erofs_ishare_mnt); +} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 13876fa597ee..b9ffb3d42bf4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -398,6 +398,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi) enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset, + Opt_inode_share, }; static const struct constant_table erofs_param_cache_strategy[] = { @@ -425,6 +426,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("domain_id", Opt_domain_id), fsparam_flag_no("directio", Opt_directio), fsparam_u64("fsoffset", Opt_fsoffset), + fsparam_flag("inode_share", Opt_inode_share), {} }; @@ -526,6 +528,8 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!sbi->fsid) return -ENOMEM; break; +#endif +#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) case Opt_domain_id: kfree_sensitive(sbi->domain_id); sbi->domain_id = no_free_ptr(param->string); @@ -549,6 +553,13 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_fsoffset: sbi->dif0.fsoff = result.uint_64; break; + case Opt_inode_share: +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + set_opt(&sbi->opt, INODE_SHARE); +#else + errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); +#endif + break; } return 0; } @@ -647,6 +658,15 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_op = &erofs_sops; + if (!sbi->domain_id && test_opt(&sbi->opt, INODE_SHARE)) { + errorfc(fc, "domain_id is needed when inode_ishare is on"); + return -EINVAL; + } + if (test_opt(&sbi->opt, DAX_ALWAYS) && test_opt(&sbi->opt, INODE_SHARE)) { + errorfc(fc, "FSDAX is not allowed when inode_ishare is on"); + return -EINVAL; + } + sbi->blkszbits = PAGE_SHIFT; if (!sb->s_bdev) { /* @@ -724,6 +744,12 @@ static int 
erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) erofs_info(sb, "unsupported blocksize for DAX"); clear_opt(&sbi->opt, DAX_ALWAYS); } + if (test_opt(&sbi->opt, INODE_SHARE) && !erofs_sb_has_ishare_xattrs(sbi)) { + erofs_info(sb, "on-disk ishare xattrs not found. Turning off inode_share."); + clear_opt(&sbi->opt, INODE_SHARE); + } + if (test_opt(&sbi->opt, INODE_SHARE)) + erofs_info(sb, "EXPERIMENTAL EROFS page cache share support in use. Use at your own risk!"); sb->s_time_gran = 1; sb->s_xattr = erofs_xattr_handlers; @@ -953,10 +979,32 @@ static struct file_system_type erofs_fs_type = { }; MODULE_ALIAS_FS("erofs"); -#if defined(CONFIG_EROFS_FS_ONDEMAND) +#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) +static void erofs_free_anon_inode(struct inode *inode) +{ + struct erofs_inode *vi = EROFS_I(inode); + +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE + kfree(vi->fingerprint.opaque); +#endif + kmem_cache_free(erofs_inode_cachep, vi); +} + +static const struct super_operations erofs_anon_sops = { + .alloc_inode = erofs_alloc_inode, + .drop_inode = inode_just_drop, + .free_inode = erofs_free_anon_inode, +}; + static int erofs_anon_init_fs_context(struct fs_context *fc) { - return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 
0 : -ENOMEM; + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, EROFS_SUPER_MAGIC); + if (!ctx) + return -ENOMEM; + ctx->ops = &erofs_anon_sops; + return 0; } struct file_system_type erofs_anon_fs_type = { @@ -991,6 +1039,10 @@ static int __init erofs_module_init(void) if (err) goto sysfs_err; + err = erofs_init_ishare(); + if (err) + goto ishare_err; + err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; @@ -998,6 +1050,8 @@ static int __init erofs_module_init(void) return 0; fs_err: + erofs_exit_ishare(); +ishare_err: erofs_exit_sysfs(); sysfs_err: z_erofs_exit_subsystem(); @@ -1015,6 +1069,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + erofs_exit_ishare(); erofs_exit_sysfs(); z_erofs_exit_subsystem(); erofs_exit_shrinker(); @@ -1069,6 +1124,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) #endif if (sbi->dif0.fsoff) seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff); + if (test_opt(opt, INODE_SHARE)) + seq_puts(seq, ",inode_share"); return 0; } @@ -1079,6 +1136,7 @@ static void erofs_evict_inode(struct inode *inode) dax_break_layout_final(inode); #endif + erofs_ishare_free_inode(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 732e3b3379d5..2ef9d6436b05 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -588,3 +588,37 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) return acl; } #endif + +#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE +int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp, + struct inode *inode, const char *domain_id) +{ + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); + struct erofs_xattr_prefix_item *prefix; + const char *infix; + int valuelen, base_index; + + if (!test_opt(&sbi->opt, INODE_SHARE)) + return -EOPNOTSUPP; + if (!sbi->xattr_prefixes) + return -EINVAL; + prefix = 
sbi->xattr_prefixes + sbi->ishare_xattr_prefix_id; + infix = prefix->prefix->infix; + base_index = prefix->prefix->base_index; + valuelen = erofs_getxattr(inode, base_index, infix, NULL, 0); + if (valuelen <= 0 || valuelen > (1 << sbi->blkszbits)) + return -EFSCORRUPTED; + fp->size = valuelen + (domain_id ? strlen(domain_id) : 0); + fp->opaque = kmalloc(fp->size, GFP_KERNEL); + if (!fp->opaque) + return -ENOMEM; + if (valuelen != erofs_getxattr(inode, base_index, infix, + fp->opaque, valuelen)) { + kfree(fp->opaque); + fp->opaque = NULL; + return -EFSCORRUPTED; + } + memcpy(fp->opaque + valuelen, domain_id, fp->size - valuelen); + return 0; +} +#endif diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 36f2667afc2d..4d0e58ff7a14 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -30,4 +30,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #define erofs_get_acl (NULL) #endif +int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp, + struct inode *inode, const char *domain_id); + #endif From 69368d2ded39b6a7ca6bd9fd53ee0d1f904e3601 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 23 Jan 2026 01:31:29 +0000 Subject: [PATCH 20/31] erofs: pass inode to trace_erofs_read_folio The erofs_read_folio tracepoint obtains inode information through the folio, but this fails if the real inode is not associated with the folio (such as in the upcoming page cache sharing case). Therefore, pass the real inode to it explicitly so that the inode information can still be printed out in that case. 
Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/data.c | 6 ++---- fs/erofs/fileio.c | 2 +- fs/erofs/zdata.c | 2 +- include/trace/events/erofs.h | 10 +++++----- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 71e23d91123d..ea198defb531 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -385,8 +385,7 @@ static int erofs_read_folio(struct file *file, struct folio *folio) }; struct erofs_iomap_iter_ctx iter_ctx = {}; - trace_erofs_read_folio(folio, true); - + trace_erofs_read_folio(folio_inode(folio), folio, true); iomap_read_folio(&erofs_iomap_ops, &read_ctx, &iter_ctx); return 0; } @@ -400,8 +399,7 @@ static void erofs_readahead(struct readahead_control *rac) struct erofs_iomap_iter_ctx iter_ctx = {}; trace_erofs_readahead(rac->mapping->host, readahead_index(rac), - readahead_count(rac), true); - + readahead_count(rac), true); iomap_readahead(&erofs_iomap_ops, &read_ctx, &iter_ctx); } diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 932e8b353ba1..d07dc248d264 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -161,7 +161,7 @@ static int erofs_fileio_read_folio(struct file *file, struct folio *folio) struct erofs_fileio io = {}; int err; - trace_erofs_read_folio(folio, true); + trace_erofs_read_folio(folio_inode(folio), folio, true); err = erofs_fileio_scan_folio(&io, folio); erofs_fileio_rq_submit(io.rq); return err; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5860ea6dbc60..c5fa07af34f5 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1879,7 +1879,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); int err; - trace_erofs_read_folio(folio, false); + trace_erofs_read_folio(inode, folio, false); z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); diff --git a/include/trace/events/erofs.h 
b/include/trace/events/erofs.h index dad7360f42f9..def20d06507b 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -82,9 +82,9 @@ TRACE_EVENT(erofs_fill_inode, TRACE_EVENT(erofs_read_folio, - TP_PROTO(struct folio *folio, bool raw), + TP_PROTO(struct inode *inode, struct folio *folio, bool raw), - TP_ARGS(folio, raw), + TP_ARGS(inode, folio, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -96,9 +96,9 @@ TRACE_EVENT(erofs_read_folio, ), TP_fast_assign( - __entry->dev = folio->mapping->host->i_sb->s_dev; - __entry->nid = EROFS_I(folio->mapping->host)->nid; - __entry->dir = S_ISDIR(folio->mapping->host->i_mode); + __entry->dev = inode->i_sb->s_dev; + __entry->nid = EROFS_I(inode)->nid; + __entry->dir = S_ISDIR(inode->i_mode); __entry->index = folio->index; __entry->uptodate = folio_test_uptodate(folio); __entry->raw = raw; From 34096ba919fd0b32e86e5d617c45d0998a495c7a Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Fri, 23 Jan 2026 01:31:30 +0000 Subject: [PATCH 21/31] erofs: support unencoded inodes for page cache share This patch adds inode page cache sharing functionality for unencoded files. I conducted experiments in the container environment. 
Below is the memory usage for reading all files in two different minor versions of container images: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 241 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 163 | 33% | +-------------------+------------------+-------------+---------------+ | | No | 872 | - | | postgres +------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 630 | 28% | +-------------------+------------------+-------------+---------------+ | | No | 2771 | - | | tensorflow +------------------+-------------+---------------+ | 2.11.0 & 2.11.1 | Yes | 2340 | 16% | +-------------------+------------------+-------------+---------------+ | | No | 926 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 735 | 21% | +-------------------+------------------+-------------+---------------+ | | No | 390 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 219 | 44% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 924 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 474 | 49% | +-------------------+------------------+-------------+---------------+ Additionally, the table below shows the runtime memory usage of the container: +-------------------+------------------+-------------+---------------+ | Image | Page Cache Share | Memory (MB) | Memory | | | | | Reduction (%) | +-------------------+------------------+-------------+---------------+ | | No | 35 | - | | redis +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 28 | 20% | +-------------------+------------------+-------------+---------------+ | | No | 149 | - | | postgres 
+------------------+-------------+---------------+ | 16.1 & 16.2 | Yes | 95 | 37% | +-------------------+------------------+-------------+---------------+ | | No | 1028 | - | | tensorflow +------------------+-------------+---------------+ | 2.11.0 & 2.11.1 | Yes | 930 | 10% | +-------------------+------------------+-------------+---------------+ | | No | 155 | - | | mysql +------------------+-------------+---------------+ | 8.0.11 & 8.0.12 | Yes | 132 | 15% | +-------------------+------------------+-------------+---------------+ | | No | 25 | - | | nginx +------------------+-------------+---------------+ | 7.2.4 & 7.2.5 | Yes | 20 | 20% | +-------------------+------------------+-------------+---------------+ | tomcat | No | 186 | - | | 10.1.25 & 10.1.26 +------------------+-------------+---------------+ | | Yes | 98 | 48% | +-------------------+------------------+-------------+---------------+ Co-developed-by: Hongzhen Luo Signed-off-by: Hongzhen Luo Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/data.c | 32 +++++++++++++++++++++++--------- fs/erofs/fileio.c | 25 ++++++++++++++++--------- fs/erofs/inode.c | 3 ++- fs/erofs/internal.h | 6 ++++++ fs/erofs/ishare.c | 34 ++++++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 19 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ea198defb531..3a4eb0dececd 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -269,6 +269,7 @@ void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty) struct erofs_iomap_iter_ctx { struct page *page; void *base; + struct inode *realinode; }; static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -276,14 +277,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct erofs_iomap_iter_ctx *ctx = iter->private; - struct super_block *sb = inode->i_sb; + struct inode *realinode = ctx 
? ctx->realinode : inode; + struct super_block *sb = realinode->i_sb; struct erofs_map_blocks map; struct erofs_map_dev mdev; int ret; map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, &map); + ret = erofs_map_blocks(realinode, &map); if (ret < 0) return ret; @@ -296,7 +298,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return 0; } - if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) { + if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(realinode)) { mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, @@ -322,7 +324,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, void *ptr; ptr = erofs_read_metabuf(&buf, sb, map.m_pa, - erofs_inode_in_metabox(inode)); + erofs_inode_in_metabox(realinode)); if (IS_ERR(ptr)) return PTR_ERR(ptr); iomap->inline_data = ptr; @@ -383,10 +385,15 @@ static int erofs_read_folio(struct file *file, struct folio *folio) .ops = &iomap_bio_read_ops, .cur_folio = folio, }; - struct erofs_iomap_iter_ctx iter_ctx = {}; + bool need_iput; + struct erofs_iomap_iter_ctx iter_ctx = { + .realinode = erofs_real_inode(folio_inode(folio), &need_iput), + }; - trace_erofs_read_folio(folio_inode(folio), folio, true); + trace_erofs_read_folio(iter_ctx.realinode, folio, true); iomap_read_folio(&erofs_iomap_ops, &read_ctx, &iter_ctx); + if (need_iput) + iput(iter_ctx.realinode); return 0; } @@ -396,11 +403,16 @@ static void erofs_readahead(struct readahead_control *rac) .ops = &iomap_bio_read_ops, .rac = rac, }; - struct erofs_iomap_iter_ctx iter_ctx = {}; + bool need_iput; + struct erofs_iomap_iter_ctx iter_ctx = { + .realinode = erofs_real_inode(rac->mapping->host, &need_iput), + }; - trace_erofs_readahead(rac->mapping->host, readahead_index(rac), + trace_erofs_readahead(iter_ctx.realinode, readahead_index(rac), readahead_count(rac), true); iomap_readahead(&erofs_iomap_ops, &read_ctx, &iter_ctx); + if 
(need_iput) + iput(iter_ctx.realinode); } static sector_t erofs_bmap(struct address_space *mapping, sector_t block) @@ -421,7 +433,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); #endif if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) { - struct erofs_iomap_iter_ctx iter_ctx = {}; + struct erofs_iomap_iter_ctx iter_ctx = { + .realinode = inode, + }; return iomap_dio_rw(iocb, to, &erofs_iomap_ops, NULL, 0, &iter_ctx, 0); diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index d07dc248d264..c1d0081609dc 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -88,9 +88,9 @@ void erofs_fileio_submit_bio(struct bio *bio) bio)); } -static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) +static int erofs_fileio_scan_folio(struct erofs_fileio *io, + struct inode *inode, struct folio *folio) { - struct inode *inode = folio_inode(folio); struct erofs_map_blocks *map = &io->map; unsigned int cur = 0, end = folio_size(folio), len, attached = 0; loff_t pos = folio_pos(folio), ofs; @@ -158,31 +158,38 @@ io_retry: static int erofs_fileio_read_folio(struct file *file, struct folio *folio) { + bool need_iput; + struct inode *realinode = erofs_real_inode(folio_inode(folio), &need_iput); struct erofs_fileio io = {}; int err; - trace_erofs_read_folio(folio_inode(folio), folio, true); - err = erofs_fileio_scan_folio(&io, folio); + trace_erofs_read_folio(realinode, folio, true); + err = erofs_fileio_scan_folio(&io, realinode, folio); erofs_fileio_rq_submit(io.rq); + if (need_iput) + iput(realinode); return err; } static void erofs_fileio_readahead(struct readahead_control *rac) { - struct inode *inode = rac->mapping->host; + bool need_iput; + struct inode *realinode = erofs_real_inode(rac->mapping->host, &need_iput); struct erofs_fileio io = {}; struct folio *folio; int err; - trace_erofs_readahead(inode, readahead_index(rac), + trace_erofs_readahead(realinode, 
readahead_index(rac), readahead_count(rac), true); while ((folio = readahead_folio(rac))) { - err = erofs_fileio_scan_folio(&io, folio); + err = erofs_fileio_scan_folio(&io, realinode, folio); if (err && err != -EINTR) - erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", - folio->index, EROFS_I(inode)->nid); + erofs_err(realinode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(realinode)->nid); } erofs_fileio_rq_submit(io.rq); + if (need_iput) + iput(realinode); } const struct address_space_operations erofs_fileio_aops = { diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 202cbbb4eada..d33816cff813 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -213,7 +213,8 @@ static int erofs_fill_inode(struct inode *inode) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - inode->i_fop = &erofs_file_fops; + inode->i_fop = erofs_ishare_fill_inode(inode) ? + &erofs_ishare_fops : &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 367a9a9f0542..3001bfec4e04 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -584,11 +584,17 @@ int __init erofs_init_ishare(void); void erofs_exit_ishare(void); bool erofs_ishare_fill_inode(struct inode *inode); void erofs_ishare_free_inode(struct inode *inode); +struct inode *erofs_real_inode(struct inode *inode, bool *need_iput); #else static inline int erofs_init_ishare(void) { return 0; } static inline void erofs_exit_ishare(void) {} static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; } static inline void erofs_ishare_free_inode(struct inode *inode) {} +static inline struct inode *erofs_real_inode(struct inode *inode, bool *need_iput) +{ + *need_iput = false; + return inode; +} #endif long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); diff --git a/fs/erofs/ishare.c b/fs/erofs/ishare.c index 3d26b2826710..ab459fb62473 100644 
--- a/fs/erofs/ishare.c +++ b/fs/erofs/ishare.c @@ -11,6 +11,12 @@ static struct vfsmount *erofs_ishare_mnt; +static inline bool erofs_is_ishare_inode(struct inode *inode) +{ + /* assumed FS_ONDEMAND is excluded with FS_PAGE_CACHE_SHARE feature */ + return inode->i_sb->s_type == &erofs_anon_fs_type; +} + static int erofs_ishare_iget5_eq(struct inode *inode, void *data) { struct erofs_inode_fingerprint *fp1 = &EROFS_I(inode)->fingerprint; @@ -38,6 +44,8 @@ bool erofs_ishare_fill_inode(struct inode *inode) struct inode *sharedinode; unsigned long hash; + if (erofs_inode_is_data_compressed(vi->datalayout)) + return false; if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id)) return false; hash = xxh32(fp.opaque, fp.size, 0); @@ -155,6 +163,32 @@ const struct file_operations erofs_ishare_fops = { .splice_read = filemap_splice_read, }; +struct inode *erofs_real_inode(struct inode *inode, bool *need_iput) +{ + struct erofs_inode *vi, *vi_share; + struct inode *realinode; + + *need_iput = false; + if (!erofs_is_ishare_inode(inode)) + return inode; + + vi_share = EROFS_I(inode); + spin_lock(&vi_share->ishare_lock); + /* fetch any one as real inode */ + DBG_BUGON(list_empty(&vi_share->ishare_list)); + list_for_each_entry(vi, &vi_share->ishare_list, ishare_list) { + realinode = igrab(&vi->vfs_inode); + if (realinode) { + *need_iput = true; + break; + } + } + spin_unlock(&vi_share->ishare_lock); + + DBG_BUGON(!realinode); + return realinode; +} + int __init erofs_init_ishare(void) { erofs_ishare_mnt = kern_mount(&erofs_anon_fs_type); From 9364b55a4dbf1ae9a8cb077cb8b7d0c7401d00fc Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Fri, 23 Jan 2026 01:31:31 +0000 Subject: [PATCH 22/31] erofs: support compressed inodes for page cache share This patch adds page cache sharing functionality for compressed inodes. 
Signed-off-by: Hongzhen Luo Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/ishare.c | 2 -- fs/erofs/zdata.c | 38 ++++++++++++++++++++++++-------------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/fs/erofs/ishare.c b/fs/erofs/ishare.c index ab459fb62473..ad53a57dbcbc 100644 --- a/fs/erofs/ishare.c +++ b/fs/erofs/ishare.c @@ -44,8 +44,6 @@ bool erofs_ishare_fill_inode(struct inode *inode) struct inode *sharedinode; unsigned long hash; - if (erofs_inode_is_data_compressed(vi->datalayout)) - return false; if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id)) return false; hash = xxh32(fp.opaque, fp.size, 0); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c5fa07af34f5..20d7df31a51f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -494,7 +494,7 @@ enum z_erofs_pclustermode { }; struct z_erofs_frontend { - struct inode *const inode; + struct inode *inode, *sharedinode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; @@ -509,8 +509,8 @@ struct z_erofs_frontend { unsigned int icur; }; -#define Z_EROFS_DEFINE_FRONTEND(fe, i, ho) struct z_erofs_frontend fe = { \ - .inode = i, .head = Z_EROFS_PCLUSTER_TAIL, \ +#define Z_EROFS_DEFINE_FRONTEND(fe, i, si, ho) struct z_erofs_frontend fe = { \ + .inode = i, .sharedinode = si, .head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .headoffset = ho } static bool z_erofs_should_alloc_cache(struct z_erofs_frontend *fe) @@ -1858,7 +1858,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, pgoff_t index = cur >> PAGE_SHIFT; struct folio *folio; - folio = erofs_grab_folio_nowait(inode->i_mapping, index); + folio = erofs_grab_folio_nowait(f->sharedinode->i_mapping, index); if (!IS_ERR_OR_NULL(folio)) { if (folio_test_uptodate(folio)) folio_unlock(folio); @@ -1875,11 +1875,13 @@ static void z_erofs_pcluster_readmore(struct z_erofs_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio 
*folio) { - struct inode *const inode = folio->mapping->host; - Z_EROFS_DEFINE_FRONTEND(f, inode, folio_pos(folio)); + struct inode *sharedinode = folio->mapping->host; + bool need_iput; + struct inode *realinode = erofs_real_inode(sharedinode, &need_iput); + Z_EROFS_DEFINE_FRONTEND(f, realinode, sharedinode, folio_pos(folio)); int err; - trace_erofs_read_folio(inode, folio, false); + trace_erofs_read_folio(realinode, folio, false); z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_scan_folio(&f, folio, false); z_erofs_pcluster_readmore(&f, NULL, false); @@ -1888,23 +1890,28 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio) /* if some pclusters are ready, need submit them anyway */ err = z_erofs_runqueue(&f, 0) ?: err; if (err && err != -EINTR) - erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", - err, folio->index, EROFS_I(inode)->nid); + erofs_err(realinode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(realinode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + + if (need_iput) + iput(realinode); return err; } static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = rac->mapping->host; - Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); + struct inode *sharedinode = rac->mapping->host; + bool need_iput; + struct inode *realinode = erofs_real_inode(sharedinode, &need_iput); + Z_EROFS_DEFINE_FRONTEND(f, realinode, sharedinode, readahead_pos(rac)); unsigned int nrpages = readahead_count(rac); struct folio *head = NULL, *folio; int err; - trace_erofs_readahead(inode, readahead_index(rac), nrpages, false); + trace_erofs_readahead(realinode, readahead_index(rac), nrpages, false); z_erofs_pcluster_readmore(&f, rac, true); while ((folio = readahead_folio(rac))) { folio->private = head; @@ -1918,8 +1925,8 @@ static void z_erofs_readahead(struct readahead_control *rac) err = z_erofs_scan_folio(&f, folio, true); if (err && err != -EINTR) - 
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", - folio->index, EROFS_I(inode)->nid); + erofs_err(realinode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(realinode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); @@ -1927,6 +1934,9 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_runqueue(&f, nrpages << PAGE_SHIFT); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); + + if (need_iput) + iput(realinode); } const struct address_space_operations z_erofs_aops = { From d86d7817c042dd651d47b1873f4b6eaefbedd890 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Fri, 23 Jan 2026 01:31:32 +0000 Subject: [PATCH 23/31] erofs: implement .fadvise for page cache share This patch implements the .fadvise interface for page cache share. Similar to overlayfs, it drops those clean, unused pages through vfs_fadvise(). Signed-off-by: Hongzhen Luo Signed-off-by: Hongbo Li Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/ishare.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/erofs/ishare.c b/fs/erofs/ishare.c index ad53a57dbcbc..ce980320a8b9 100644 --- a/fs/erofs/ishare.c +++ b/fs/erofs/ishare.c @@ -151,6 +151,12 @@ static int erofs_ishare_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_readonly_mmap(file, vma); } +static int erofs_ishare_fadvise(struct file *file, loff_t offset, + loff_t len, int advice) +{ + return vfs_fadvise(file->private_data, offset, len, advice); +} + const struct file_operations erofs_ishare_fops = { .open = erofs_ishare_file_open, .llseek = generic_file_llseek, @@ -159,6 +165,7 @@ const struct file_operations erofs_ishare_fops = { .release = erofs_ishare_file_release, .get_unmapped_area = thp_get_unmapped_area, .splice_read = filemap_splice_read, + .fadvise = erofs_ishare_fadvise, }; struct inode *erofs_real_inode(struct inode *inode, bool *need_iput) From 1729f7c67544b31569836f48f142e8f7c0952b26 
Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jan 2026 11:54:08 +0800 Subject: [PATCH 24/31] erofs: mark inodes without acls in erofs_read_inode() Similar to commit 91ef18b567da ("ext4: mark inodes without acls in __ext4_iget()"), the ACL state won't be read when the file owner performs a lookup, and the RCU fast path for lookups won't work because the ACL state remains unknown. If there are no extended attributes, or if the xattr filter indicates that no ACL xattr is present, call cache_no_acl() directly. Reviewed-by: Hongbo Li Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 5 +++++ fs/erofs/xattr.c | 20 ++++++++++++++++++++ fs/erofs/xattr.h | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d33816cff813..2ecc28abd6cd 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -137,6 +137,11 @@ static int erofs_read_inode(struct inode *inode) err = -EFSCORRUPTED; goto err_out; } + + if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL) && + erofs_inode_has_noacl(inode, ptr, ofs)) + cache_no_acl(inode); + switch (inode->i_mode & S_IFMT) { case S_IFDIR: vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 2ef9d6436b05..b7da1ed83160 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -587,6 +587,26 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) kfree(value); return acl; } + +bool erofs_inode_has_noacl(struct inode *inode, void *kaddr, unsigned int ofs) +{ + static const unsigned int bitmask = + BIT(21) | /* system.posix_acl_default */ + BIT(30); /* system.posix_acl_access */ + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + const struct erofs_xattr_ibody_header *ih = kaddr + ofs; + + if (EROFS_I(inode)->xattr_isize < sizeof(*ih)) + return true; + + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved && + !check_add_overflow(ofs, sizeof(*ih), &ofs) && + ofs <= i_blocksize(inode)) { + if 
((le32_to_cpu(ih->h_name_filter) & bitmask) == bitmask) + return true; + } + return false; +} #endif #ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 4d0e58ff7a14..4465f7018a41 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -32,5 +32,5 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp, struct inode *inode, const char *domain_id); - +bool erofs_inode_has_noacl(struct inode *inode, void *kaddr, unsigned int ofs); #endif From 72558e2bed272b5ca8771ba14390160c876207f4 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Feb 2026 10:31:00 +0800 Subject: [PATCH 25/31] erofs: use inode_set_cached_link() Symlink lengths are now cached in in-memory inodes directly so that readlink can be sped up. Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 2ecc28abd6cd..294f66376825 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -8,21 +8,29 @@ #include #include -static int erofs_fill_symlink(struct inode *inode, void *kaddr, - unsigned int m_pofs) +static int erofs_fill_symlink(struct inode *inode, void *bptr, unsigned int ofs) { struct erofs_inode *vi = EROFS_I(inode); - loff_t off; + char *link; + loff_t end; - m_pofs += vi->xattr_isize; - /* check if it cannot be handled with fast symlink scheme */ - if (vi->datalayout != EROFS_INODE_FLAT_INLINE || - check_add_overflow(m_pofs, inode->i_size, &off) || - off > i_blocksize(inode)) - return 0; - - inode->i_link = kmemdup_nul(kaddr + m_pofs, inode->i_size, GFP_KERNEL); - return inode->i_link ? 
0 : -ENOMEM; + ofs += vi->xattr_isize; + /* check whether the symlink data is small enough to be inlined */ + if (vi->datalayout == EROFS_INODE_FLAT_INLINE && + !check_add_overflow(ofs, inode->i_size, &end) && + end <= i_blocksize(inode)) { + link = kmemdup_nul(bptr + ofs, inode->i_size, GFP_KERNEL); + if (!link) + return -ENOMEM; + if (unlikely(!inode->i_size || strlen(link) != inode->i_size)) { + erofs_err(inode->i_sb, "invalid fast symlink size %llu @ nid %llu", + inode->i_size | 0ULL, vi->nid); + kfree(link); + return -EFSCORRUPTED; + } + inode_set_cached_link(inode, link, inode->i_size); + } + return 0; } static int erofs_read_inode(struct inode *inode) From 7cef3c8341940febf75db6c25199cd83fb74d52f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 29 Jan 2026 10:41:25 +0800 Subject: [PATCH 26/31] erofs: separate plain and compressed filesystems formally The EROFS on-disk format uses a tiny, plain metadata design that prioritizes performance and minimizes complex inconsistencies compared with common writable disk filesystems (almost no serious metadata inconsistency can happen in well-designed immutable filesystems like EROFS). EROFS deliberately avoids artificial design flaws to eliminate serious security risks from untrusted remote sources by design, although human-made implementation bugs can still happen sometimes. Currently, there is no strict check to prevent compressed inodes, especially LZ4-compressed inodes, from being read in plain filesystems. Starting with erofs-utils 1.0 and Linux 5.3, the LZ4_0PADDING sb feature is automatically enabled for LZ4-compressed EROFS images to support in-place decompression. Furthermore, since Linux 5.4 LTS is no longer supported, we no longer need to handle ancient LZ4-compressed EROFS images generated by erofs-utils prior to 1.0. 
To formally distinguish different filesystem types for improved security: - Use the presence of LZ4_0PADDING or a non-zero `dsb->u1.lz4_max_distance` as a marker for compressed filesystems containing LZ4-compressed inodes only; - For other algorithms, use the `dsb->u1.available_compr_algs` bitmap. Note: LZ4_0PADDING has been supported since Linux 5.4 (the first formal kernel version), so exposing it via sysfs is no longer necessary and is now deprecated (but it will remain for five more years, until 2031): `dsb->u1` has been strictly non-zero for all EROFS images containing compressed inodes starting with erofs-utils v1.3 and it is actually a much better marker for compressed filesystems. Signed-off-by: Gao Xiang --- Documentation/ABI/testing/sysfs-fs-erofs | 6 ++--- fs/erofs/decompressor.c | 30 ++++++++++-------------- fs/erofs/erofs_fs.h | 2 +- fs/erofs/inode.c | 14 +++++++---- fs/erofs/internal.h | 6 ++--- fs/erofs/super.c | 24 +++++++------------ fs/erofs/sysfs.c | 2 -- 7 files changed, 39 insertions(+), 45 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs index b9243c7f28d7..e4cf6fc6a106 100644 --- a/Documentation/ABI/testing/sysfs-fs-erofs +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -3,9 +3,9 @@ Date: November 2021 Contact: "Huang Jianan" Description: Shows all enabled kernel features. Supported features: - zero_padding, compr_cfgs, big_pcluster, chunked_file, - device_table, compr_head2, sb_chksum, ztailpacking, - dedupe, fragments, 48bit, metabox. + compr_cfgs, big_pcluster, chunked_file, device_table, + compr_head2, sb_chksum, ztailpacking, dedupe, fragments, + 48bit, metabox. 
What: /sys/fs/erofs//sync_decompress Date: November 2021 diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index e9d799a03a91..3c54e95964c9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -34,7 +34,10 @@ static int z_erofs_load_lz4_config(struct super_block *sb, } } else { distance = le16_to_cpu(dsb->u1.lz4_max_distance); + if (!distance && !erofs_sb_has_lz4_0padding(sbi)) + return 0; sbi->lz4.max_pclusterblks = 1; + sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; } sbi->lz4.max_distance_pages = distance ? @@ -198,7 +201,6 @@ const char *z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, static const char *__z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *dst) { - bool zeropadded = erofs_sb_has_zero_padding(EROFS_SB(rq->sb)); bool may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; @@ -206,18 +208,15 @@ static const char *__z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, int ret, maptype; headpage = kmap_local_page(*rq->in); - /* LZ4 decompression inplace is only safe if zero_padding is enabled */ - if (zeropadded) { - reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, - min_t(unsigned int, rq->inputsize, - rq->sb->s_blocksize - rq->pageofs_in)); - if (reason) { - kunmap_local(headpage); - return reason; - } - may_inplace = !((rq->pageofs_in + rq->inputsize) & - (rq->sb->s_blocksize - 1)); + reason = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); + if (reason) { + kunmap_local(headpage); + return reason; } + may_inplace = !((rq->pageofs_in + rq->inputsize) & + (rq->sb->s_blocksize - 1)); inputmargin = rq->pageofs_in; src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, @@ -226,8 +225,7 @@ static const char *__z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, return ERR_CAST(src); out = dst + rq->pageofs_out; - /* legacy format could compress extra data in a 
pcluster. */ - if (rq->partial_decoding || !zeropadded) + if (rq->partial_decoding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, rq->inputsize, rq->outputsize, rq->outputsize); else @@ -454,10 +452,8 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) erofs_off_t offset; int size, ret = 0; - if (!erofs_sb_has_compr_cfgs(sbi)) { - sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; + if (!erofs_sb_has_compr_cfgs(sbi)) return z_erofs_load_lz4_config(sb, dsb, NULL, 0); - } algs = le16_to_cpu(dsb->u1.available_compr_algs); sbi->available_compr_algs = algs; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b30a74d307c5..b80c6bb33a58 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -23,7 +23,7 @@ * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should * be incompatible with this kernel version. */ -#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001 +#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 294f66376825..4f86169c23f1 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -183,11 +183,17 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } - if (erofs_inode_is_data_compressed(vi->datalayout)) - inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << - (sb->s_blocksize_bits - 9); - else + if (!erofs_inode_is_data_compressed(vi->datalayout)) { inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; + } else if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP) || !sbi->available_compr_algs) { + erofs_err(sb, "compressed inode (nid %llu) is invalid in a plain filesystem", + vi->nid); + err = -EFSCORRUPTED; + goto err_out; + } else { + inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << + (sb->s_blocksize_bits - 9); + } if (vi->datalayout == 
EROFS_INODE_CHUNK_BASED) { /* fill chunked inode summary info */ diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 3001bfec4e04..6a4802f3fdd8 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -114,7 +114,6 @@ struct erofs_sb_info { unsigned int sync_decompress; /* strategy for sync decompression */ unsigned int shrinker_run_no; - u16 available_compr_algs; /* pseudo inode to manage cached pages */ struct inode *managed_cache; @@ -154,6 +153,7 @@ struct erofs_sb_info { char *volume_name; u32 feature_compat; u32 feature_incompat; + u16 available_compr_algs; /* sysfs support */ struct kobject s_kobj; /* /sys/fs/erofs/ */ @@ -221,7 +221,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ return sbi->feature_##compat & EROFS_FEATURE_##feature; \ } -EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING) +EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE) @@ -530,7 +530,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -540,6 +539,7 @@ static inline int z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ +int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE struct bio *erofs_fileio_bio_alloc(struct erofs_map_dev *mdev); diff --git a/fs/erofs/super.c b/fs/erofs/super.c 
index b9ffb3d42bf4..e52c2b528f86 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -122,18 +122,6 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, return buffer; } -#ifndef CONFIG_EROFS_FS_ZIP -static int z_erofs_parse_cfgs(struct super_block *sb, - struct erofs_super_block *dsb) -{ - if (!dsb->u1.available_compr_algs) - return 0; - - erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); - return -EOPNOTSUPP; -} -#endif - static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_device_info *dif, erofs_off_t *pos) { @@ -363,10 +351,16 @@ static int erofs_read_superblock(struct super_block *sb) } } - /* parse on-disk compression configurations */ - ret = z_erofs_parse_cfgs(sb, dsb); - if (ret < 0) + if (IS_ENABLED(CONFIG_EROFS_FS_ZIP)) { + ret = z_erofs_parse_cfgs(sb, dsb); + if (ret < 0) + goto out; + } else if (dsb->u1.available_compr_algs || + erofs_sb_has_lz4_0padding(sbi)) { + erofs_err(sb, "compression disabled, unable to mount compressed EROFS"); + ret = -EOPNOTSUPP; goto out; + } ret = erofs_scan_devices(sb, dsb); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 86b22b9f0c19..3a9a5fa000ae 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -86,7 +86,6 @@ static struct attribute *erofs_attrs[] = { ATTRIBUTE_GROUPS(erofs); /* Features this copy of erofs supports */ -EROFS_ATTR_FEATURE(zero_padding); EROFS_ATTR_FEATURE(compr_cfgs); EROFS_ATTR_FEATURE(big_pcluster); EROFS_ATTR_FEATURE(chunked_file); @@ -100,7 +99,6 @@ EROFS_ATTR_FEATURE(48bit); EROFS_ATTR_FEATURE(metabox); static struct attribute *erofs_feat_attrs[] = { - ATTR_LIST(zero_padding), ATTR_LIST(compr_cfgs), ATTR_LIST(big_pcluster), ATTR_LIST(chunked_file), From bc804a8d7e865ef47fb7edcaf5e77d18bf444ebc Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 30 Jan 2026 15:54:22 +0800 Subject: [PATCH 27/31] erofs: handle end of filesystem properly for file-backed mounts I/O requests beyond the end of the filesystem 
should be zeroed out, similar to loopback devices and that is what we expect. Fixes: ce63cb62d794 ("erofs: support unencoded inodes for fileio") Signed-off-by: Gao Xiang --- fs/erofs/fileio.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index c1d0081609dc..43998fe1cce1 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -24,21 +24,17 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) container_of(iocb, struct erofs_fileio_rq, iocb); struct folio_iter fi; - if (ret > 0) { - if (ret != rq->bio.bi_iter.bi_size) { - bio_advance(&rq->bio, ret); - zero_fill_bio(&rq->bio); - } - ret = 0; + if (ret >= 0 && ret != rq->bio.bi_iter.bi_size) { + bio_advance(&rq->bio, ret); + zero_fill_bio(&rq->bio); } - if (rq->bio.bi_end_io) { - if (ret < 0 && !rq->bio.bi_status) - rq->bio.bi_status = errno_to_blk_status(ret); - } else { + if (!rq->bio.bi_end_io) { bio_for_each_folio_all(fi, &rq->bio) { DBG_BUGON(folio_test_uptodate(fi.folio)); - erofs_onlinefolio_end(fi.folio, ret, false); + erofs_onlinefolio_end(fi.folio, ret < 0, false); } + } else if (ret < 0 && !rq->bio.bi_status) { + rq->bio.bi_status = errno_to_blk_status(ret); } bio_endio(&rq->bio); bio_uninit(&rq->bio); @@ -48,7 +44,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { struct iov_iter iter; - int ret; + ssize_t ret; if (!rq) return; From c7c707cbaa5ed277836364da4033e141ff985678 Mon Sep 17 00:00:00 2001 From: Ferry Meng Date: Mon, 2 Feb 2026 11:09:09 +0800 Subject: [PATCH 28/31] erofs: avoid some unnecessary #ifdefs They can either be removed or replaced with IS_ENABLED(). 
Signed-off-by: Ferry Meng Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/data.c | 20 ++++++++------------ fs/erofs/super.c | 13 ++++--------- fs/erofs/sysfs.c | 5 ++--- 3 files changed, 14 insertions(+), 24 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3a4eb0dececd..a2c796db4510 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -365,12 +365,10 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { -#ifdef CONFIG_EROFS_FS_ZIP + if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + return -EOPNOTSUPP; return iomap_fiemap(inode, fieinfo, start, len, &z_erofs_iomap_report_ops); -#else - return -EOPNOTSUPP; -#endif } return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } @@ -428,10 +426,9 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!iov_iter_count(to)) return 0; -#ifdef CONFIG_FS_DAX - if (IS_DAX(inode)) + if (IS_ENABLED(CONFIG_FS_DAX) && IS_DAX(inode)) return dax_iomap_rw(iocb, to, &erofs_iomap_ops); -#endif + if ((iocb->ki_flags & IOCB_DIRECT) && inode->i_sb->s_bdev) { struct erofs_iomap_iter_ctx iter_ctx = { .realinode = inode, @@ -491,12 +488,11 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; const struct iomap_ops *ops = &erofs_iomap_ops; - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) -#ifdef CONFIG_EROFS_FS_ZIP + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { + if (!IS_ENABLED(CONFIG_EROFS_FS_ZIP)) + return generic_file_llseek(file, offset, whence); ops = &z_erofs_iomap_report_ops; -#else - return generic_file_llseek(file, offset, whence); -#endif + } if (whence == SEEK_HOLE) offset = iomap_seek_hole(inode, offset, ops); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index e52c2b528f86..7827e61424b7 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -381,12 
+381,10 @@ static void erofs_default_options(struct erofs_sb_info *sbi) sbi->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; sbi->sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO; #endif -#ifdef CONFIG_EROFS_FS_XATTR - set_opt(&sbi->opt, XATTR_USER); -#endif -#ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(&sbi->opt, POSIX_ACL); -#endif + if (IS_ENABLED(CONFIG_EROFS_FS_XATTR)) + set_opt(&sbi->opt, XATTR_USER); + if (IS_ENABLED(CONFIG_EROFS_FS_POSIX_ACL)) + set_opt(&sbi->opt, POSIX_ACL); } enum { @@ -1125,11 +1123,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) static void erofs_evict_inode(struct inode *inode) { -#ifdef CONFIG_FS_DAX if (IS_DAX(inode)) dax_break_layout_final(inode); -#endif - erofs_ishare_free_inode(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 3a9a5fa000ae..6734483a440f 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -168,11 +168,10 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, return ret; if (t != (unsigned int)t) return -ERANGE; -#ifdef CONFIG_EROFS_FS_ZIP - if (!strcmp(a->attr.name, "sync_decompress") && + if (IS_ENABLED(CONFIG_EROFS_FS_ZIP) && + !strcmp(a->attr.name, "sync_decompress") && (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF)) return -EINVAL; -#endif *(unsigned int *)ptr = t; return len; case attr_pointer_bool: From c134a40f86efb8d6b5a949ef70e06d5752209be5 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 3 Feb 2026 16:25:36 +0800 Subject: [PATCH 29/31] erofs: fix inline data read failure for ztailpacking pclusters Compressed folios for ztailpacking pclusters must be valid before adding these pclusters to I/O chains. Otherwise, z_erofs_decompress_pcluster() may assume they are already valid and then trigger a NULL pointer dereference. It is somewhat hard to reproduce because the inline data is in the same block as the tail of the compressed indexes, which are usually read just before. 
However, it may still happen if a fatal signal arrives while read_mapping_folio() is running, as shown below: erofs: (device dm-1): z_erofs_pcluster_begin: failed to get inline data -4 Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 ... pc : z_erofs_decompress_queue+0x4c8/0xa14 lr : z_erofs_decompress_queue+0x160/0xa14 sp : ffffffc08b3eb3a0 x29: ffffffc08b3eb570 x28: ffffffc08b3eb418 x27: 0000000000001000 x26: ffffff8086ebdbb8 x25: ffffff8086ebdbb8 x24: 0000000000000001 x23: 0000000000000008 x22: 00000000fffffffb x21: dead000000000700 x20: 00000000000015e7 x19: ffffff808babb400 x18: ffffffc089edc098 x17: 00000000c006287d x16: 00000000c006287d x15: 0000000000000004 x14: ffffff80ba8f8000 x13: 0000000000000004 x12: 00000006589a77c9 x11: 0000000000000015 x10: 0000000000000000 x9 : 0000000000000000 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : ffffffffffffffe0 x3 : 0000000000000020 x2 : 0000000000000008 x1 : 0000000000000000 x0 : 0000000000000000 Call trace: z_erofs_decompress_queue+0x4c8/0xa14 z_erofs_runqueue+0x908/0x97c z_erofs_read_folio+0x128/0x228 filemap_read_folio+0x68/0x128 filemap_get_pages+0x44c/0x8b4 filemap_read+0x12c/0x5b8 generic_file_read_iter+0x4c/0x15c do_iter_readv_writev+0x188/0x1e0 vfs_iter_read+0xac/0x1a4 backing_file_read_iter+0x170/0x34c ovl_read_iter+0xf0/0x140 vfs_read+0x28c/0x344 ksys_read+0x80/0xf0 __arm64_sys_read+0x24/0x34 invoke_syscall+0x60/0x114 el0_svc_common+0x88/0xe4 do_el0_svc+0x24/0x30 el0_svc+0x40/0xa8 el0t_64_sync_handler+0x70/0xbc el0t_64_sync+0x1bc/0x1c0 Fix this by reading the inline data before allocating and adding the pclusters to the I/O chains. 
Fixes: cecf864d3d76 ("erofs: support inline data decompression") Reported-by: Zhiguo Niu Reviewed-and-tested-by: Zhiguo Niu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 20d7df31a51f..ea9d32e9cb12 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -806,14 +806,26 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct z_erofs_pcluster *pcl = NULL; - void *ptr; + void *ptr = NULL; int ret; DBG_BUGON(fe->pcl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(!fe->head); - if (!(map->m_flags & EROFS_MAP_META)) { + if (map->m_flags & EROFS_MAP_META) { + ret = erofs_init_metabuf(&map->buf, sb, + erofs_inode_in_metabox(fe->inode)); + if (ret) + return ret; + ptr = erofs_bread(&map->buf, map->m_pa, false); + if (IS_ERR(ptr)) { + erofs_err(sb, "failed to read inline data %pe @ pa %llu of nid %llu", + ptr, map->m_pa, EROFS_I(fe->inode)->nid); + return PTR_ERR(ptr); + } + ptr = map->buf.page; + } else { while (1) { rcu_read_lock(); pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); @@ -853,18 +865,8 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { - ret = erofs_init_metabuf(&map->buf, sb, - erofs_inode_in_metabox(fe->inode)); - if (ret) - return ret; - ptr = erofs_bread(&map->buf, map->m_pa, false); - if (IS_ERR(ptr)) { - ret = PTR_ERR(ptr); - erofs_err(sb, "failed to get inline folio %d", ret); - return ret; - } - folio_get(page_folio(map->buf.page)); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + folio_get(page_folio((struct page *)ptr)); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, ptr); fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK; fe->mode = 
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } From 8f2fb72fd17eecd5a47c73ce7e228d157e613b80 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 4 Feb 2026 17:37:31 +0800 Subject: [PATCH 30/31] erofs: update compression algorithm status The following changes are proposed for the upcoming Linux 7.0: - Enable LZMA support by default, as it's already in use by Fedora 42/43 and some Android vendors for minimal filesystem sizes; - Promote DEFLATE and Zstandard out of EXPERIMENTAL status, given that they were merged more than a year ago, have been well tested since, and are ready for general use. Signed-off-by: Gao Xiang --- Documentation/filesystems/erofs.rst | 6 +++--- fs/erofs/Kconfig | 11 +++-------- fs/erofs/decompressor_deflate.c | 1 - 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index af1df574e66c..d6b3693eba60 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -63,9 +63,9 @@ Here are the main features of EROFS: - Support POSIX.1e ACLs by using extended attributes; - Support transparent data compression as an option: - LZ4, MicroLZMA and DEFLATE algorithms can be used on a per-file basis; In - addition, inplace decompression is also supported to avoid bounce compressed - buffers and unnecessary page cache thrashing. + LZ4, MicroLZMA, DEFLATE and Zstandard algorithms can be used on a per-file + basis; In addition, inplace decompression is also supported to avoid bounce + compressed buffers and unnecessary page cache thrashing.
- Support chunk-based data deduplication and rolling-hash compressed data deduplication; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index b71f2a8074fe..a9f645f57bb2 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -112,13 +112,14 @@ config EROFS_FS_ZIP config EROFS_FS_ZIP_LZMA bool "EROFS LZMA compressed data support" depends on EROFS_FS_ZIP + default y help Saying Y here includes support for reading EROFS file systems containing LZMA compressed data, specifically called microLZMA. It gives better compression ratios than the default LZ4 format, at the expense of more CPU overhead. - If unsure, say N. + Say N if you want to disable LZMA compression support. config EROFS_FS_ZIP_DEFLATE bool "EROFS DEFLATE compressed data support" @@ -129,9 +130,6 @@ config EROFS_FS_ZIP_DEFLATE ratios than the default LZ4 format, while it costs more CPU overhead. - DEFLATE support is an experimental feature for now and so most - file systems will be readable without selecting this option. - If unsure, say N. config EROFS_FS_ZIP_ZSTD @@ -141,10 +139,7 @@ config EROFS_FS_ZIP_ZSTD Saying Y here includes support for reading EROFS file systems containing Zstandard compressed data. It gives better compression ratios than the default LZ4 format, while it costs more CPU - overhead. - - Zstandard support is an experimental feature for now and so most - file systems will be readable without selecting this option. + overhead and memory footprint. If unsure, say N. diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c index 3fb73000ed27..4f26ab767645 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -89,7 +89,6 @@ static int z_erofs_load_deflate_config(struct super_block *sb, inited = true; } mutex_unlock(&deflate_resize_mutex); - erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. 
Use at your own risk!"); return 0; failed: mutex_unlock(&deflate_resize_mutex); From 1caf50ce4af096d0280d59a31abdd85703cd995c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Feb 2026 06:30:05 +0800 Subject: [PATCH 31/31] erofs: fix UAF issue for file-backed mounts w/ directio option [ 9.269940][ T3222] Call trace: [ 9.269948][ T3222] ext4_file_read_iter+0xac/0x108 [ 9.269979][ T3222] vfs_iocb_iter_read+0xac/0x198 [ 9.269993][ T3222] erofs_fileio_rq_submit+0x12c/0x180 [ 9.270008][ T3222] erofs_fileio_submit_bio+0x14/0x24 [ 9.270030][ T3222] z_erofs_runqueue+0x834/0x8ac [ 9.270054][ T3222] z_erofs_read_folio+0x120/0x220 [ 9.270083][ T3222] filemap_read_folio+0x60/0x120 [ 9.270102][ T3222] filemap_fault+0xcac/0x1060 [ 9.270119][ T3222] do_pte_missing+0x2d8/0x1554 [ 9.270131][ T3222] handle_mm_fault+0x5ec/0x70c [ 9.270142][ T3222] do_page_fault+0x178/0x88c [ 9.270167][ T3222] do_translation_fault+0x38/0x54 [ 9.270183][ T3222] do_mem_abort+0x54/0xac [ 9.270208][ T3222] el0_da+0x44/0x7c [ 9.270227][ T3222] el0t_64_sync_handler+0x5c/0xf4 [ 9.270253][ T3222] el0t_64_sync+0x1bc/0x1c0 EROFS may encounter the above panic when a file-backed mount is used with the directio mount option. The root cause is that it may suffer a UAF in the race condition below: - z_erofs_read_folio wq s_dio_done_wq - z_erofs_runqueue - erofs_fileio_submit_bio - erofs_fileio_rq_submit - vfs_iocb_iter_read - ext4_file_read_iter - ext4_dio_read_iter - iomap_dio_rw : bio was submitted and returned -EIOCBQUEUED - dio_aio_complete_work - dio_complete - dio->iocb->ki_complete (erofs_fileio_ki_complete()) - kfree(rq) : it frees iocb, iocb.ki_filp can be UAF in file_accessed(). - file_accessed : access NULL file pointer Introduce a reference count in struct erofs_fileio_rq and initialize it to two. Both erofs_fileio_ki_complete() and erofs_fileio_rq_submit() drop one reference, and whichever drops the last reference frees rq.
Cc: stable@kernel.org Fixes: fb176750266a ("erofs: add file-backed mount support") Fixes: 6422cde1b0d5 ("erofs: use buffered I/O for file-backed mounts by default") Signed-off-by: Chao Yu Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/fileio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 43998fe1cce1..4d5054dcac95 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -10,6 +10,7 @@ struct erofs_fileio_rq { struct bio bio; struct kiocb iocb; struct super_block *sb; + refcount_t ref; }; struct erofs_fileio { @@ -38,7 +39,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) } bio_endio(&rq->bio); bio_uninit(&rq->bio); - kfree(rq); + if (refcount_dec_and_test(&rq->ref)) + kfree(rq); } static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) @@ -60,6 +62,8 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); + if (refcount_dec_and_test(&rq->ref)) + kfree(rq); } static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) @@ -70,6 +74,7 @@ static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev *mdev) bio_init(&rq->bio, NULL, rq->bvecs, ARRAY_SIZE(rq->bvecs), REQ_OP_READ); rq->iocb.ki_filp = mdev->m_dif->file; rq->sb = mdev->m_sb; + refcount_set(&rq->ref, 2); return rq; }