From 0319227099dfa06157b7cd669072b6e899d1bba8 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 25 Nov 2025 13:19:56 -0500 Subject: [PATCH 001/107] oid_registry: allow arbitrary size OIDs The current OID registry parser uses 64 bit arithmetic which limits us to supporting 64 bit or smaller OIDs. This isn't usually a problem except that it prevents us from representing the 2.25. prefix OIDs which are the OID representation of UUIDs and have a 128 bit number following the prefix. Rather than import not often used perl arithmetic modules, replace the current perl 64 bit arithmetic with a callout to bc, which is arbitrary precision, for decimal to base 2 conversion, then do pure string operations on the base 2 number. [James.Bottomley@HansenPartnership.com: tidy up perl with better my placement also set bc to arbitrary size] Link: https://lkml.kernel.org/r/dbc90c344c691ed988640a28367ff895b5ef2604.camel@HansenPartnership.com Link: https://lkml.kernel.org/r/833c858cd74533203b43180208734b84f1137af0.camel@HansenPartnership.com Signed-off-by: James Bottomley Cc: David Howells Cc: Blaise Boscaccy Signed-off-by: Andrew Morton --- lib/build_OID_registry | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/lib/build_OID_registry b/lib/build_OID_registry index 8267e8d71338..30493ac190c0 100755 --- a/lib/build_OID_registry +++ b/lib/build_OID_registry @@ -60,10 +60,12 @@ for (my $i = 0; $i <= $#names; $i++) { # Determine the encoded length of this OID my $size = $#components; for (my $loop = 2; $loop <= $#components; $loop++) { - my $c = $components[$loop]; + $ENV{'BC_LINE_LENGTH'} = "0"; + my $c = `echo "ibase=10; obase=2; $components[$loop]" | bc`; + chomp($c); # We will base128 encode the number - my $tmp = ($c == 0) ? 
0 : int(log($c)/log(2)); + my $tmp = length($c) - 1; $tmp = int($tmp / 7); $size += $tmp; } @@ -100,16 +102,24 @@ for (my $i = 0; $i <= $#names; $i++) { push @octets, $components[0] * 40 + $components[1]; for (my $loop = 2; $loop <= $#components; $loop++) { - my $c = $components[$loop]; + # get the base 2 representation of the component + $ENV{'BC_LINE_LENGTH'} = "0"; + my $c = `echo "ibase=10; obase=2; $components[$loop]" | bc`; + chomp($c); - # Base128 encode the number - my $tmp = ($c == 0) ? 0 : int(log($c)/log(2)); + my $tmp = length($c) - 1; $tmp = int($tmp / 7); - for (; $tmp > 0; $tmp--) { - push @octets, (($c >> $tmp * 7) & 0x7f) | 0x80; + # zero pad upto length multiple of 7 + $c = substr("0000000", 0, ($tmp + 1) * 7 - length($c)).$c; + + # Base128 encode the number + for (my $j = 0; $j < $tmp; $j++) { + my $b = oct("0b".substr($c, $j * 7, 7)); + + push @octets, $b | 0x80; } - push @octets, $c & 0x7f; + push @octets, oct("0b".substr($c, $tmp * 7, 7)); } push @encoded_oids, \@octets; From b11052be3ea7c1dfc81804b203bc4369edafd040 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Dec 2025 18:57:30 +0100 Subject: [PATCH 002/107] crash_dump: constify struct configfs_item_operations and configfs_group_operations 'struct configfs_item_operations' and 'configfs_group_operations' are not modified in this driver. Constifying these structures moves some data to a read-only section, so increases overall security, especially when the structure holds some function pointers. 
On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 16339 11001 384 27724 6c4c kernel/crash_dump_dm_crypt.o After: ===== text data bss dec hex filename 16499 10841 384 27724 6c4c kernel/crash_dump_dm_crypt.o Link: https://lkml.kernel.org/r/d046ee5666d2f6b1a48ca1a222dfbd2f7c44462f.1765735035.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: Coiby Xu Tested-by: Coiby Xu Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_dump_dm_crypt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 401423ba477d..0d23dc1de67c 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -223,7 +223,7 @@ static void config_key_release(struct config_item *item) key_count--; } -static struct configfs_item_operations config_key_item_ops = { +static const struct configfs_item_operations config_key_item_ops = { .release = config_key_release, }; @@ -298,7 +298,7 @@ static struct configfs_attribute *config_keys_attrs[] = { * Note that, since no extra work is required on ->drop_item(), * no ->drop_item() is provided. */ -static struct configfs_group_operations config_keys_group_ops = { +static const struct configfs_group_operations config_keys_group_ops = { .make_item = config_keys_make_item, }; From 4a54331616b309934d46e255a9f1fdd890e77ebe Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Fri, 12 Dec 2025 15:45:03 +0800 Subject: [PATCH 003/107] ocfs2: give ocfs2 the ability to reclaim suballocator free bg Patch series "ocfs2: give ocfs2 the ability to reclaim suballocator free bg", v6. This patch (of 2): The current ocfs2 code can't reclaim suballocator block group space. In some cases, this causes ocfs2 to hold onto a lot of space. For example, when creating lots of small files, the space is held/managed by the '//inode_alloc'. 
After the user deletes all the small files, the space never returns to the '//global_bitmap'. This issue prevents ocfs2 from providing the needed space even when there is enough free space in a small ocfs2 volume. This patch gives ocfs2 the ability to reclaim suballocator free space when the block group is freed. For performance reasons, this patch keeps the first suballocator block group active. Link: https://lkml.kernel.org/r/20251212074505.25962-2-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Su Yue Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/suballoc.c | 308 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 299 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8e6e5235b30c..aac2f96bee43 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -295,6 +295,74 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb, return ocfs2_validate_gd_self(sb, bh, 0); } +/* + * The hint group descriptor (gd) may already have been released + * in _ocfs2_free_suballoc_bits(). We first check the gd signature, + * then perform the standard ocfs2_read_group_descriptor() jobs. + * + * If the gd signature is invalid, we return 'rc=0' and set + * '*released=1'. The caller is expected to handle this specific case. + * Otherwise, we return the actual error code. + * + * We treat gd signature corruption case as a release case. The + * caller ocfs2_claim_suballoc_bits() will use ocfs2_search_chain() + * to search each gd block. The code will eventually find this + * corrupted gd block - Late, but not missed. + * + * Note: + * The caller is responsible for initializing the '*released' status. 
+ */ +static int ocfs2_read_hint_group_descriptor(struct inode *inode, + struct ocfs2_dinode *di, u64 gd_blkno, + struct buffer_head **bh, int *released) +{ + int rc; + struct buffer_head *tmp = *bh; + struct ocfs2_group_desc *gd; + + rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp, NULL); + if (rc) + goto out; + + gd = (struct ocfs2_group_desc *) tmp->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + /* + * Invalid gd cache was set in ocfs2_read_block(), + * which will affect block_group allocation. + * Path: + * ocfs2_reserve_suballoc_bits + * ocfs2_block_group_alloc + * ocfs2_block_group_alloc_contig + * ocfs2_set_new_buffer_uptodate + */ + ocfs2_remove_from_cache(INODE_CACHE(inode), tmp); + *released = 1; /* we return 'rc=0' for this case */ + goto free_bh; + } + + /* below jobs same with ocfs2_read_group_descriptor() */ + if (!buffer_jbd(tmp)) { + rc = ocfs2_validate_group_descriptor(inode->i_sb, tmp); + if (rc) + goto free_bh; + } + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) + goto free_bh; + + /* If ocfs2_read_block() got us a new bh, pass it up. 
*/ + if (!*bh) + *bh = tmp; + + return rc; + +free_bh: + brelse(tmp); +out: + return rc; +} + int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, u64 gd_blkno, struct buffer_head **bh) { @@ -1725,7 +1793,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, u32 bits_wanted, u32 min_bits, struct ocfs2_suballoc_result *res, - u16 *bits_left) + u16 *bits_left, int *released) { int ret; struct buffer_head *group_bh = NULL; @@ -1733,9 +1801,11 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_group_descriptor(alloc_inode, di, - res->sr_bg_blkno, &group_bh); - if (ret < 0) { + ret = ocfs2_read_hint_group_descriptor(alloc_inode, di, + res->sr_bg_blkno, &group_bh, released); + if (*released) { + return 0; + } else if (ret < 0) { mlog_errno(ret); return ret; } @@ -1950,6 +2020,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, struct ocfs2_suballoc_result *res) { int status; + int released = 0; u16 victim, i; u16 bits_left = 0; u64 hint = ac->ac_last_group; @@ -1976,6 +2047,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } + /* the hint bg may already be released, we quiet search this group. */ res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism @@ -1983,7 +2055,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, * allocation group. This helps us maintain some * contiguousness across allocations. 
*/ status = ocfs2_search_one_group(ac, handle, bits_wanted, - min_bits, res, &bits_left); + min_bits, res, &bits_left, + &released); + if (released) { + res->sr_bg_blkno = 0; + goto chain_search; + } if (!status) goto set_hint; if (status < 0 && status != -ENOSPC) { @@ -1991,7 +2068,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } } - +chain_search: cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; if (!le16_to_cpu(cl->cl_next_free_rec) || le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { @@ -2113,6 +2190,12 @@ bail: return status; } +/* + * after ocfs2 has the ability to release block group unused space, + * the ->ip_last_used_group may be invalid. so this function returns + * ac->ac_last_group need to verify. + * refer the 'hint' in ocfs2_claim_suballoc_bits() for more details. + */ static void ocfs2_init_inode_ac_group(struct inode *dir, struct buffer_head *parent_di_bh, struct ocfs2_alloc_context *ac) @@ -2551,6 +2634,198 @@ bail: return status; } +/* + * Reclaim the suballocator managed space to main bitmap. + * This function first works on the suballocator to perform the + * cleanup rec/alloc_inode job, then switches to the main bitmap + * to reclaim released space. + * + * handle: The transaction handle + * alloc_inode: The suballoc inode + * alloc_bh: The buffer_head of suballoc inode + * group_bh: The group descriptor buffer_head of suballocator managed. + * Caller should release the input group_bh. 
+ */ +static int _ocfs2_reclaim_suballoc_to_main(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + struct buffer_head *group_bh) +{ + int idx, status = 0; + int i, next_free_rec, len = 0; + __le16 old_bg_contig_free_bits = 0; + u16 start_bit; + u32 tmp_used; + u64 bg_blkno, start_blk; + unsigned int count; + struct ocfs2_chain_rec *rec; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct ocfs2_group_desc *group = (struct ocfs2_group_desc *) group_bh->b_data; + + idx = le16_to_cpu(group->bg_chain); + rec = &(cl->cl_recs[idx]); + + status = ocfs2_extend_trans(handle, + ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg))); + if (status) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode), + alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * Only clear the suballocator rec item in-place. + * + * If idx is not the last, we don't compress (remove the empty item) + * the cl_recs[]. If not, we need to do lots jobs. + * + * Compress cl_recs[] code example: + * if (idx != cl->cl_next_free_rec - 1) + * memmove(&cl->cl_recs[idx], &cl->cl_recs[idx + 1], + * sizeof(struct ocfs2_chain_rec) * + * (cl->cl_next_free_rec - idx - 1)); + * for(i = idx; i < cl->cl_next_free_rec-1; i++) { + * group->bg_chain = "later group->bg_chain"; + * group->bg_blkno = xxx; + * ... ... 
+ * } + */ + + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_total); + fe->id1.bitmap1.i_total = cpu_to_le32(tmp_used - le32_to_cpu(rec->c_total)); + + /* Substraction 1 for the block group itself */ + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - 1); + + tmp_used = le32_to_cpu(fe->i_clusters); + fe->i_clusters = cpu_to_le32(tmp_used - le16_to_cpu(cl->cl_cpg)); + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters -= le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); + + ocfs2_journal_dirty(handle, alloc_bh); + ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0); + + start_blk = le64_to_cpu(rec->c_blkno); + count = le32_to_cpu(rec->c_total) / le16_to_cpu(cl->cl_bpc); + + /* + * If the rec is the last one, let's compress the chain list by + * removing the empty cl_recs[] at the end. 
+ */ + next_free_rec = le16_to_cpu(cl->cl_next_free_rec); + if (idx == (next_free_rec - 1)) { + len++; /* the last item should be counted first */ + for (i = (next_free_rec - 2); i > 0; i--) { + if (cl->cl_recs[i].c_free == cl->cl_recs[i].c_total) + len++; + else + break; + } + } + le16_add_cpu(&cl->cl_next_free_rec, -len); + + rec->c_free = 0; + rec->c_total = 0; + rec->c_blkno = 0; + ocfs2_remove_from_cache(INODE_CACHE(alloc_inode), group_bh); + memset(group, 0, sizeof(struct ocfs2_group_desc)); + + /* prepare job for reclaim clusters */ + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) + goto bail; /* ignore the error in reclaim path */ + + inode_lock(main_bm_inode); + + status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); + if (status < 0) + goto free_bm_inode; /* ignore the error in reclaim path */ + + ocfs2_block_to_cluster_group(main_bm_inode, start_blk, &bg_blkno, + &start_bit); + fe = (struct ocfs2_dinode *) main_bm_bh->b_data; + cl = &fe->id2.i_chain; + /* reuse group_bh, caller will release the input group_bh */ + group_bh = NULL; + + /* reclaim clusters to global_bitmap */ + status = ocfs2_read_group_descriptor(main_bm_inode, fe, bg_blkno, + &group_bh); + if (status < 0) { + mlog_errno(status); + goto free_bm_bh; + } + group = (struct ocfs2_group_desc *) group_bh->b_data; + + if ((count + start_bit) > le16_to_cpu(group->bg_bits)) { + ocfs2_error(alloc_inode->i_sb, + "reclaim length (%d) beyands block group length (%d)", + count + start_bit, le16_to_cpu(group->bg_bits)); + goto free_group_bh; + } + + old_bg_contig_free_bits = group->bg_contig_free_bits; + status = ocfs2_block_group_clear_bits(handle, main_bm_inode, + group, group_bh, + start_bit, count, 0, + _ocfs2_clear_bit); + if (status < 0) { + mlog_errno(status); + goto free_group_bh; + } + + status = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode), + main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) 
{ + mlog_errno(status); + ocfs2_block_group_set_bits(handle, main_bm_inode, group, group_bh, + start_bit, count, + le16_to_cpu(old_bg_contig_free_bits), 1); + goto free_group_bh; + } + + idx = le16_to_cpu(group->bg_chain); + rec = &(cl->cl_recs[idx]); + + le32_add_cpu(&rec->c_free, count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + ocfs2_journal_dirty(handle, main_bm_bh); + +free_group_bh: + brelse(group_bh); + +free_bm_bh: + ocfs2_inode_unlock(main_bm_inode, 1); + brelse(main_bm_bh); + +free_bm_inode: + inode_unlock(main_bm_inode); + iput(main_bm_inode); + +bail: + return status; +} + /* * expects the suballoc inode to already be locked. */ @@ -2563,12 +2838,13 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, void (*undo_fn)(unsigned int bit, unsigned long *bitmap)) { - int status = 0; + int idx, status = 0; u32 tmp_used; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; + struct ocfs2_chain_rec *rec; __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or @@ -2614,12 +2890,26 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, goto bail; } - le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, - count); + idx = le16_to_cpu(group->bg_chain); + rec = &(cl->cl_recs[idx]); + + le32_add_cpu(&rec->c_free, count); tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); ocfs2_journal_dirty(handle, alloc_bh); + /* + * Reclaim suballocator free space. 
+ * Bypass: global_bitmap, non empty rec, first rec in cl_recs[] + */ + if (ocfs2_is_cluster_bitmap(alloc_inode) || + (le32_to_cpu(rec->c_free) != (le32_to_cpu(rec->c_total) - 1)) || + (le16_to_cpu(cl->cl_next_free_rec) == 1)) { + goto bail; + } + + _ocfs2_reclaim_suballoc_to_main(handle, alloc_inode, alloc_bh, group_bh); + bail: brelse(group_bh); return status; From fd4d53bde9128bc85d85cc3427bd592e4c3293e4 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Fri, 12 Dec 2025 15:45:04 +0800 Subject: [PATCH 004/107] ocfs2: detect released suballocator BG for fh_to_[dentry|parent] After ocfs2 gained the ability to reclaim suballocator free block group (BGs), a suballocator block group may be released. This change causes the xfstest case generic/426 to fail. generic/426 expects return value -ENOENT or -ESTALE, but the current code triggers -EROFS. Call stack before ocfs2 gained the ability to reclaim bg: ocfs2_fh_to_dentry //or ocfs2_fh_to_parent ocfs2_get_dentry + ocfs2_test_inode_bit | ocfs2_test_suballoc_bit | + ocfs2_read_group_descriptor //Since ocfs2 never releases the bg, | | //the bg block was always found. | + *res = ocfs2_test_bit //unlink was called, and the bit is zero | + if (!set) //because the above *res is 0 status = -ESTALE //the generic/426 expected return value Current call stack that triggers -EROFS: ocfs2_get_dentry ocfs2_test_inode_bit ocfs2_test_suballoc_bit ocfs2_read_group_descriptor + if reading a released bg, validation fails and triggers -EROFS How to fix: Since the read BG is already released, we must avoid triggering -EROFS. With this commit, we use ocfs2_read_hint_group_descriptor() to detect the released BG block. This approach quietly handles this type of error and returns -EINVAL, which triggers the caller's existing conversion path to -ESTALE. 
[dan.carpenter@linaro.org: fix uninitialized variable] Link: https://lkml.kernel.org/r/dc37519fd2470909f8c65e26c5131b8b6dde2a5c.1766043917.git.dan.carpenter@linaro.org Link: https://lkml.kernel.org/r/20251212074505.25962-3-heming.zhao@suse.com Signed-off-by: Heming Zhao Signed-off-by: Dan Carpenter Reviewed-by: Su Yue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/export.c | 6 ++++-- fs/ocfs2/suballoc.c | 26 +++++++++++++++++--------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index b95724b767e1..9c2665dd24e2 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -74,8 +74,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, * nice */ status = -ESTALE; - } else + } else if (status != -ESTALE) { mlog(ML_ERROR, "test inode bit failed %d\n", status); + } goto unlock_nfs_sync; } @@ -162,8 +163,9 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) if (status < 0) { if (status == -EINVAL) { status = -ESTALE; - } else + } else if (status != -ESTALE) { mlog(ML_ERROR, "test inode bit failed %d\n", status); + } parent = ERR_PTR(status); goto bail_unlock; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index aac2f96bee43..79d1325b2111 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -3163,7 +3163,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct ocfs2_group_desc *group; struct buffer_head *group_bh = NULL; u64 bg_blkno; - int status; + int status, quiet = 0, released = 0; trace_ocfs2_test_suballoc_bit((unsigned long long)blkno, (unsigned int)bit); @@ -3179,9 +3179,13 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, bg_blkno = group_blkno ? 
group_blkno : ocfs2_which_suballoc_group(blkno, bit); - status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, - &group_bh); - if (status < 0) { + status = ocfs2_read_hint_group_descriptor(suballoc, alloc_di, bg_blkno, + &group_bh, &released); + if (released) { + quiet = 1; + status = -ESTALE; + goto bail; + } else if (status < 0) { mlog(ML_ERROR, "read group %llu failed %d\n", (unsigned long long)bg_blkno, status); goto bail; @@ -3193,7 +3197,7 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, bail: brelse(group_bh); - if (status) + if (status && !quiet) mlog_errno(status); return status; } @@ -3213,7 +3217,7 @@ bail: */ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { - int status; + int status, quiet = 0; u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; @@ -3255,8 +3259,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, group_blkno, blkno, suballoc_bit, res); - if (status < 0) - mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + if (status < 0) { + if (status == -ESTALE) + quiet = 1; + else + mlog(ML_ERROR, "test suballoc bit failed %d\n", status); + } ocfs2_inode_unlock(inode_alloc_inode, 0); inode_unlock(inode_alloc_inode); @@ -3264,7 +3272,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) iput(inode_alloc_inode); brelse(alloc_bh); bail: - if (status) + if (status && !quiet) mlog_errno(status); return status; } From 9677a51abd860aa75bd4fe28176672744bf5180e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Dec 2025 11:41:35 +0100 Subject: [PATCH 005/107] ocfs2: constify struct configfs_item_operations and configfs_group_operations 'struct configfs_item_operations' and 'configfs_group_operations' are not modified in this driver. 
Constifying these structures moves some data to a read-only section, so increases overall security, especially when the structure holds some function pointers. On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 74011 19312 5280 98603 1812b fs/ocfs2/cluster/heartbeat.o After: ===== text data bss dec hex filename 74171 19152 5280 98603 1812b fs/ocfs2/cluster/heartbeat.o Link: https://lkml.kernel.org/r/7c7c00ba328e5e514d8debee698154039e9640dd.1765708880.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/cluster/heartbeat.c | 4 ++-- fs/ocfs2/cluster/nodemanager.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 724350925aff..8e9cbc334cf4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1942,7 +1942,7 @@ static struct configfs_attribute *o2hb_region_attrs[] = { NULL, }; -static struct configfs_item_operations o2hb_region_item_ops = { +static const struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, }; @@ -2193,7 +2193,7 @@ static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { NULL, }; -static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { +static const struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .make_item = o2hb_heartbeat_group_make_item, .drop_item = o2hb_heartbeat_group_drop_item, }; diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 6bc4e064ace4..c5e83c774d73 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -396,7 +396,7 @@ static struct configfs_attribute *o2nm_node_attrs[] = { NULL, }; -static struct configfs_item_operations o2nm_node_item_ops = { +static const struct 
configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, }; @@ -638,7 +638,7 @@ static void o2nm_node_group_drop_item(struct config_group *group, config_item_put(item); } -static struct configfs_group_operations o2nm_node_group_group_ops = { +static const struct configfs_group_operations o2nm_node_group_group_ops = { .make_item = o2nm_node_group_make_item, .drop_item = o2nm_node_group_drop_item, }; @@ -657,7 +657,7 @@ static void o2nm_cluster_release(struct config_item *item) kfree(cluster); } -static struct configfs_item_operations o2nm_cluster_item_ops = { +static const struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, }; @@ -741,7 +741,7 @@ static void o2nm_cluster_group_drop_item(struct config_group *group, struct conf config_item_put(item); } -static struct configfs_group_operations o2nm_cluster_group_group_ops = { +static const struct configfs_group_operations o2nm_cluster_group_group_ops = { .make_group = o2nm_cluster_group_make_group, .drop_item = o2nm_cluster_group_drop_item, }; From 688dab01c3bb14cb559878aaf7019bfba4a79275 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Fri, 12 Dec 2025 11:28:26 +0530 Subject: [PATCH 006/107] ocfs2: validate i_refcount_loc when refcount flag is set Add validation in ocfs2_validate_inode_block() to check that if an inode has OCFS2_HAS_REFCOUNT_FL set, it must also have a valid i_refcount_loc. A corrupted filesystem image can have this inconsistent state, which later triggers a BUG_ON in ocfs2_remove_refcount_tree() when the inode is being wiped during unlink. Catch this corruption early during inode validation to fail gracefully instead of crashing the kernel. 
Link: https://lkml.kernel.org/r/20251212055826.20929-1-kartikey406@gmail.com Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+6d832e79d3efe1c46743@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6d832e79d3efe1c46743 Tested-by: syzbot+6d832e79d3efe1c46743@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/20251208084407.3021466-1-kartikey406@gmail.com/T/ [v1] Link: https://lore.kernel.org/all/20251212045646.9988-1-kartikey406@gmail.com/T/ [v2] Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b5fcc2725a29..c95c998811ae 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1529,6 +1529,13 @@ int ocfs2_validate_inode_block(struct super_block *sb, } } + if ((le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_REFCOUNT_FL) && + !di->i_refcount_loc) { + rc = ocfs2_error(sb, "Inode #%llu has refcount flag but no i_refcount_loc\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + rc = 0; bail: From 1524af3685b35feac76662cc551cbc37bd14775f Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Fri, 12 Dec 2025 10:51:32 +0530 Subject: [PATCH 007/107] ocfs2: validate inline data i_size during inode read When reading an inode from disk, ocfs2_validate_inode_block() performs various sanity checks but does not validate the size of inline data. If the filesystem is corrupted, an inode's i_size can exceed the actual inline data capacity (id_count). This causes ocfs2_dir_foreach_blk_id() to iterate beyond the inline data buffer, triggering a use-after-free when accessing directory entries from freed memory. 
In the syzbot report: - i_size was 1099511627576 bytes (~1TB) - Actual inline data capacity (id_count) is typically <256 bytes - A garbage rec_len (54648) caused ctx->pos to jump out of bounds - This triggered a UAF in ocfs2_check_dir_entry() Fix by adding a validation check in ocfs2_validate_inode_block() to ensure inodes with inline data have i_size <= id_count. This catches the corruption early during inode read and prevents all downstream code from operating on invalid data. Link: https://lkml.kernel.org/r/20251212052132.16750-1-kartikey406@gmail.com Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+c897823f699449cc3eb4@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c897823f699449cc3eb4 Tested-by: syzbot+c897823f699449cc3eb4@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/20251211115231.3560028-1-kartikey406@gmail.com/T/ [v1] Link: https://lore.kernel.org/all/20251212040400.6377-1-kartikey406@gmail.com/T/ [v2] Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/inode.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c95c998811ae..03a51662ea8e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1494,12 +1494,25 @@ int ocfs2_validate_inode_block(struct super_block *sb, goto bail; } - if ((le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) && - le32_to_cpu(di->i_clusters)) { - rc = ocfs2_error(sb, "Invalid dinode %llu: %u clusters\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_clusters)); - goto bail; + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *data = &di->id2.i_data; + + if (le32_to_cpu(di->i_clusters)) { + rc = ocfs2_error(sb, + "Invalid dinode %llu: %u clusters\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_clusters)); + goto bail; + } + + if 
(le64_to_cpu(di->i_size) > le16_to_cpu(data->id_count)) { + rc = ocfs2_error(sb, + "Invalid dinode #%llu: inline data i_size %llu exceeds id_count %u\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_size), + le16_to_cpu(data->id_count)); + goto bail; + } } if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { From d3cd8de2e17e496e115f36faeccad7d219edd381 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 11 Dec 2025 18:59:49 +0300 Subject: [PATCH 008/107] ocfs2: adjust ocfs2_xa_remove_entry() to match UBSAN boundary checks After introducing 2f26f58df041 ("ocfs2: annotate flexible array members with __counted_by_le()"), syzbot has reported the following issue: UBSAN: array-index-out-of-bounds in fs/ocfs2/xattr.c:1955:3 index 2 is out of range for type 'struct ocfs2_xattr_entry[] __counted_by(xh_count)' (aka 'struct ocfs2_xattr_entry[]') ... Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 ubsan_epilogue+0xa/0x40 lib/ubsan.c:233 __ubsan_handle_out_of_bounds+0xe9/0xf0 lib/ubsan.c:455 ocfs2_xa_remove_entry+0x36d/0x3e0 fs/ocfs2/xattr.c:1955 ... To address this issue, 'xh_entries[]' member removal should be performed before actually changing 'xh_count', thus making sure that all array accesses matches the boundary checks performed by UBSAN. 
Link: https://lkml.kernel.org/r/20251211155949.774485-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reported-by: syzbot+cf96bc82a588a27346a8@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=cf96bc82a588a27346a8 Reviewed-by: Heming Zhao Acked-by: Joseph Qi Cc: Deepanshu Kartikey Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 1b21fbc16d73..5fd85f517868 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1971,8 +1971,7 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) ocfs2_xa_wipe_namevalue(loc); loc->xl_entry = NULL; - le16_add_cpu(&xh->xh_count, -1); - count = le16_to_cpu(xh->xh_count); + count = le16_to_cpu(xh->xh_count) - 1; /* * Only zero out the entry if there are more remaining. This is @@ -1987,6 +1986,8 @@ static void ocfs2_xa_remove_entry(struct ocfs2_xa_loc *loc) memset(&xh->xh_entries[count], 0, sizeof(struct ocfs2_xattr_entry)); } + + xh->xh_count = cpu_to_le16(count); } /* From 4e9f69c062150566de5870536f08a50239724537 Mon Sep 17 00:00:00 2001 From: Prithvi Tambewagh Date: Tue, 16 Dec 2025 00:15:57 +0530 Subject: [PATCH 009/107] ocfs2: add validate function for slot map blocks When the filesystem is being mounted, the kernel panics while the data regarding slot map allocation to the local node, is being written to the disk. This occurs because the value of slot map buffer head block number, which should have been greater than or equal to `OCFS2_SUPER_BLOCK_BLKNO` (evaluating to 2) is less than it, indicative of disk metadata corruption. This triggers BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) in ocfs2_write_block(), causing the kernel to panic. This is fixed by introducing function ocfs2_validate_slot_map_block() to validate slot map blocks. 
It first checks if the buffer head passed to it is up to date and valid, else it panics the kernel at that point itself. Further, it contains an if condition block, which checks if `bh->b_blocknr` is lesser than `OCFS2_SUPER_BLOCK_BLKNO`; if yes, then ocfs2_error is called, which prints the error log, for debugging purposes, and the return value of ocfs2_error() is returned. If the if condition is false, value 0 is returned by ocfs2_validate_slot_map_block(). This function is used as validate function in calls to ocfs2_read_blocks() in ocfs2_refresh_slot_info() and ocfs2_map_slot_buffers(). Link: https://lkml.kernel.org/r/20251215184600.13147-1-activprithvi@gmail.com Signed-off-by: Prithvi Tambewagh Reported-by: syzbot+c818e5c4559444f88aa0@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c818e5c4559444f88aa0 Tested-by: Reviewed-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/slot_map.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e544c704b583..ea4a68abc25b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -44,6 +44,9 @@ struct ocfs2_slot_info { static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, unsigned int node_num); +static int ocfs2_validate_slot_map_block(struct super_block *sb, + struct buffer_head *bh); + static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, int slot_num) { @@ -132,7 +135,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * this is not true, the read of -1 (UINT64_MAX) will fail. 
*/ ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks, - si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL); + si->si_bh, OCFS2_BH_IGNORE_CACHE, + ocfs2_validate_slot_map_block); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -332,6 +336,24 @@ int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); } +static int ocfs2_validate_slot_map_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + + BUG_ON(!buffer_uptodate(bh)); + + if (bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO) { + rc = ocfs2_error(sb, + "Invalid Slot Map Buffer Head " + "Block Number : %llu, Should be >= %d", + (unsigned long long)bh->b_blocknr, + OCFS2_SUPER_BLOCK_BLKNO); + return rc; + } + return 0; +} + static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, struct ocfs2_slot_info *si) { @@ -383,7 +405,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, bh = NULL; /* Acquire a fresh bh */ status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, - 1, &bh, OCFS2_BH_IGNORE_CACHE, NULL); + 1, &bh, OCFS2_BH_IGNORE_CACHE, + ocfs2_validate_slot_map_block); if (status < 0) { mlog_errno(status); goto bail; From e0b0f2834c9bec51b1068de5cf5e10c7519143eb Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Fri, 19 Dec 2025 16:31:52 +0800 Subject: [PATCH 010/107] ocfs2: fix oob in __ocfs2_find_path syzbot constructed a corrupted image, which resulted in el->l_count from the b-tree extent block being 0. Since the length of the l_recs array depends on l_count, reading its member e_blkno triggered the out-of-bounds access reported by syzbot in [1]. The loop terminates when l_count is 0, similar to when next_free is 0. 
[1] UBSAN: array-index-out-of-bounds in fs/ocfs2/alloc.c:1838:11 index 0 is out of range for type 'struct ocfs2_extent_rec[] __counted_by(l_count)' (aka 'struct ocfs2_extent_rec[]') Call Trace: __ocfs2_find_path+0x606/0xa40 fs/ocfs2/alloc.c:1838 ocfs2_find_leaf+0xab/0x1c0 fs/ocfs2/alloc.c:1946 ocfs2_get_clusters_nocache+0x172/0xc60 fs/ocfs2/extent_map.c:418 ocfs2_get_clusters+0x505/0xa70 fs/ocfs2/extent_map.c:631 ocfs2_extent_map_get_blocks+0x202/0x6a0 fs/ocfs2/extent_map.c:678 ocfs2_read_virt_blocks+0x286/0x930 fs/ocfs2/extent_map.c:1001 ocfs2_read_dir_block fs/ocfs2/dir.c:521 [inline] ocfs2_find_entry_el fs/ocfs2/dir.c:728 [inline] ocfs2_find_entry+0x3e4/0x2090 fs/ocfs2/dir.c:1120 ocfs2_find_files_on_disk+0xdf/0x310 fs/ocfs2/dir.c:2023 ocfs2_lookup_ino_from_name+0x52/0x100 fs/ocfs2/dir.c:2045 _ocfs2_get_system_file_inode fs/ocfs2/sysfile.c:136 [inline] ocfs2_get_system_file_inode+0x326/0x770 fs/ocfs2/sysfile.c:112 ocfs2_init_global_system_inodes+0x319/0x660 fs/ocfs2/super.c:461 ocfs2_initialize_super fs/ocfs2/super.c:2196 [inline] ocfs2_fill_super+0x4432/0x65b0 fs/ocfs2/super.c:993 get_tree_bdev_flags+0x40e/0x4d0 fs/super.c:1691 vfs_get_tree+0x92/0x2a0 fs/super.c:1751 fc_mount fs/namespace.c:1199 [inline] Link: https://lkml.kernel.org/r/tencent_4D99464FA28D9225BE0DBA923F5DF6DD8C07@qq.com Signed-off-by: Edward Adam Davis Reported-by: syzbot+151afab124dfbc5f15e6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=151afab124dfbc5f15e6 Reviewed-by: Heming Zhao Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 58bf58b68955..b7db177d17d6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1812,14 +1812,15 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, ret = -EROFS; goto out; } - if (le16_to_cpu(el->l_next_free_rec) == 
0) { + if (!el->l_next_free_rec || !el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at depth %u\n", + "Owner %llu has empty extent list at depth %u\n" + "(next free=%u count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), - le16_to_cpu(el->l_tree_depth)); + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); ret = -EROFS; goto out; - } for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { From 29300f929eb1f9b3e555b834d05f2e9d73da303f Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 21 Oct 2025 13:55:18 +0300 Subject: [PATCH 011/107] ocfs2: annotate more flexible array members with __counted_by_le() Annotate flexible array members of 'struct ocfs2_local_alloc' and 'struct ocfs2_inline_data' with '__counted_by_le()' attribute to improve array bounds checking when CONFIG_UBSAN_BOUNDS is enabled, and prefer the convenient 'memset()' over an explicit loop to simplify 'ocfs2_clear_local_alloc()'. 
Link: https://lkml.kernel.org/r/20251021105518.119953-1-dmantipov@yandex.ru Signed-off-by: Dmitry Antipov Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 4 +--- fs/ocfs2/ocfs2_fs.h | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index d1aa04a5af1b..56be21c695d6 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -905,13 +905,11 @@ bail: static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) { struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - int i; alloc->id1.bitmap1.i_total = 0; alloc->id1.bitmap1.i_used = 0; la->la_bm_off = 0; - for(i = 0; i < le16_to_cpu(la->la_size); i++) - la->la_bitmap[i] = 0; + memset(la->la_bitmap, 0, le16_to_cpu(la->la_size)); } #if 0 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index f7763da5c4a2..c501eb3cdcda 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -641,7 +641,7 @@ struct ocfs2_local_alloc __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; -/*10*/ __u8 la_bitmap[]; +/*10*/ __u8 la_bitmap[] __counted_by_le(la_size); }; /* @@ -654,7 +654,7 @@ struct ocfs2_inline_data * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; - __u8 id_data[]; /* Start of user data */ + __u8 id_data[] __counted_by_le(id_count); /* Start of user data */ }; /* From 6dcd539f062d89127cb3a84a7da373a9bd28ba7b Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Mon, 15 Dec 2025 13:43:22 +0000 Subject: [PATCH 012/107] lib/tests: convert test_uuid module to KUnit Move lib/test_uuid.c to lib/tests/uuid_kunit.c and convert it to use KUnit. This change switches the ad-hoc test code to standard KUnit test cases. The test data remains the same, but the verification logic is updated to use KUNIT_EXPECT_* macros. 
Also remove CONFIG_TEST_UUID from arch/*/configs/* because it is no longer used. The new CONFIG_UUID_KUNIT_TEST will be automatically enabled by CONFIG_KUNIT_ALL_TESTS. [lukas.bulwahn@redhat.com: MAINTAINERS: adjust file entry in UUID HELPERS] Link: https://lkml.kernel.org/r/20251217053907.2778515-1-lukas.bulwahn@redhat.com Link: https://lkml.kernel.org/r/20251215134322.12949-1-sakamo.ryota@gmail.com Signed-off-by: Ryota Sakamoto Signed-off-by: Lukas Bulwahn Acked-by: Geert Uytterhoeven Reviewed-by: David Gow Cc: Andriy Shevchenko Cc: Brendan Higgins Cc: Lukas Bulwahn Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - lib/Kconfig.debug | 14 ++- lib/Makefile | 1 - lib/test_uuid.c | 134 --------------------------- lib/tests/Makefile | 1 + lib/tests/uuid_kunit.c | 106 +++++++++++++++++++++ 19 files changed, 119 insertions(+), 152 deletions(-) delete mode 100644 lib/test_uuid.c create mode 100644 lib/tests/uuid_kunit.c diff --git a/MAINTAINERS b/MAINTAINERS index ebc2f1bc0ade..99407c4c0095 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27260,7 +27260,7 @@ R: Andy Shevchenko L: linux-kernel@vger.kernel.org S: Maintained F: include/linux/uuid.h -F: lib/test_uuid.c +F: lib/tests/uuid_kunit.c F: lib/uuid.c UV SYSFS DRIVER diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index bfc1ee7c8158..1439abb69f73 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -618,7 +618,6 @@ 
CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index d9d1f3c4c70d..6a4e71866f60 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -575,7 +575,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 523205adccc8..46ad7d57b4fc 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -595,7 +595,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 7b0a4ef0b010..867bfa13a44c 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -567,7 +567,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 089c5c394c62..5dfe602cafd4 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -577,7 +577,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 5f2484c36733..f5d30310a349 100644 --- a/arch/m68k/configs/mac_defconfig +++ 
b/arch/m68k/configs/mac_defconfig @@ -594,7 +594,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 74f0a1f6d871..fe54e9222cc0 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -681,7 +681,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 4bee18c820e4..4ff2ff0993ad 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -567,7 +567,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 322c17e55c9a..6bb4738a65aa 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -568,7 +568,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 82f9baab8fea..14166c8fe234 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -584,7 +584,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index f94ad226cb5b..5db924e3caf7 100644 --- 
a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -564,7 +564,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a5ecfc505ab2..318c9fe42f46 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -565,7 +565,6 @@ CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 2d92c11eea7e..684b3ea80f39 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -443,7 +443,6 @@ CONFIG_TEST_KSTRTOX=m CONFIG_TEST_PRINTF=m CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m -CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m CONFIG_TEST_MAPLE_TREE=m CONFIG_TEST_RHASHTABLE=m diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ba36939fda79..4bfca37f313e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2499,9 +2499,6 @@ config TEST_BITMAP If unsure, say N. -config TEST_UUID - tristate "Test functions located in the uuid module at runtime" - config TEST_XARRAY tristate "Test the XArray code at runtime" @@ -3285,6 +3282,17 @@ config RATELIMIT_KUNIT_TEST If unsure, say N. +config UUID_KUNIT_TEST + tristate "KUnit test for UUID" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the uuid library, + which provides functions for generating and parsing UUID and GUID. + The test suite checks parsing of UUID and GUID strings. + + If unsure, say N. 
+ config INT_POW_KUNIT_TEST tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index aaf677cf4527..586a9f9b27a9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -89,7 +89,6 @@ ifeq ($(CONFIG_CC_IS_CLANG)$(CONFIG_KASAN),yy) GCOV_PROFILE_test_bitmap.o := n endif -obj-$(CONFIG_TEST_UUID) += test_uuid.o obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_MAPLE_TREE) += test_maple_tree.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o diff --git a/lib/test_uuid.c b/lib/test_uuid.c deleted file mode 100644 index 0124fad5d72c..000000000000 --- a/lib/test_uuid.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Test cases for lib/uuid.c module. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include - -struct test_uuid_data { - const char *uuid; - guid_t le; - uuid_t be; -}; - -static const struct test_uuid_data test_uuid_test_data[] = { - { - .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576", - .le = GUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), - .be = UUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), - }, - { - .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b", - .le = GUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), - .be = UUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), - }, - { - .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84", - .le = GUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), - .be = UUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), - }, -}; - -static const char * const test_uuid_wrong_data[] = { - "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */ - "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */ - "0cb4ddff-a545-4401-9d06-688af53e", /* not enough data */ -}; - -static unsigned total_tests 
__initdata; -static unsigned failed_tests __initdata; - -static void __init test_uuid_failed(const char *prefix, bool wrong, bool be, - const char *data, const char *actual) -{ - pr_err("%s test #%u %s %s data: '%s'\n", - prefix, - total_tests, - wrong ? "passed on wrong" : "failed on", - be ? "BE" : "LE", - data); - if (actual && *actual) - pr_err("%s test #%u actual data: '%s'\n", - prefix, - total_tests, - actual); - failed_tests++; -} - -static void __init test_uuid_test(const struct test_uuid_data *data) -{ - guid_t le; - uuid_t be; - char buf[48]; - - /* LE */ - total_tests++; - if (guid_parse(data->uuid, &le)) - test_uuid_failed("conversion", false, false, data->uuid, NULL); - - total_tests++; - if (!guid_equal(&data->le, &le)) { - sprintf(buf, "%pUl", &le); - test_uuid_failed("cmp", false, false, data->uuid, buf); - } - - /* BE */ - total_tests++; - if (uuid_parse(data->uuid, &be)) - test_uuid_failed("conversion", false, true, data->uuid, NULL); - - total_tests++; - if (!uuid_equal(&data->be, &be)) { - sprintf(buf, "%pUb", &be); - test_uuid_failed("cmp", false, true, data->uuid, buf); - } -} - -static void __init test_uuid_wrong(const char *data) -{ - guid_t le; - uuid_t be; - - /* LE */ - total_tests++; - if (!guid_parse(data, &le)) - test_uuid_failed("negative", true, false, data, NULL); - - /* BE */ - total_tests++; - if (!uuid_parse(data, &be)) - test_uuid_failed("negative", true, true, data, NULL); -} - -static int __init test_uuid_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++) - test_uuid_test(&test_uuid_test_data[i]); - - for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++) - test_uuid_wrong(test_uuid_wrong_data[i]); - - if (failed_tests == 0) - pr_info("all %u tests passed\n", total_tests); - else - pr_err("failed %u out of %u tests\n", failed_tests, total_tests); - - return failed_tests ? 
-EINVAL : 0; -} -module_init(test_uuid_init); - -static void __exit test_uuid_exit(void) -{ - /* do nothing */ -} -module_exit(test_uuid_exit); - -MODULE_AUTHOR("Andy Shevchenko "); -MODULE_DESCRIPTION("Test cases for lib/uuid.c module"); -MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 601dba4b7d96..9a20608f65f5 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -49,5 +49,6 @@ obj-$(CONFIG_STRING_HELPERS_KUNIT_TEST) += string_helpers_kunit.o obj-$(CONFIG_USERCOPY_KUNIT_TEST) += usercopy_kunit.o obj-$(CONFIG_UTIL_MACROS_KUNIT) += util_macros_kunit.o obj-$(CONFIG_RATELIMIT_KUNIT_TEST) += test_ratelimit.o +obj-$(CONFIG_UUID_KUNIT_TEST) += uuid_kunit.o obj-$(CONFIG_TEST_RUNTIME_MODULE) += module/ diff --git a/lib/tests/uuid_kunit.c b/lib/tests/uuid_kunit.c new file mode 100644 index 000000000000..de71b2649dac --- /dev/null +++ b/lib/tests/uuid_kunit.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0 +/* + * Test cases for lib/uuid.c module. 
+ */ + +#include +#include + +struct test_uuid_data { + const char *uuid; + guid_t le; + uuid_t be; +}; + +static const struct test_uuid_data test_uuid_test_data[] = { + { + .uuid = "c33f4995-3701-450e-9fbf-206a2e98e576", + .le = GUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + .be = UUID_INIT(0xc33f4995, 0x3701, 0x450e, 0x9f, 0xbf, 0x20, 0x6a, 0x2e, 0x98, 0xe5, 0x76), + }, + { + .uuid = "64b4371c-77c1-48f9-8221-29f054fc023b", + .le = GUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + .be = UUID_INIT(0x64b4371c, 0x77c1, 0x48f9, 0x82, 0x21, 0x29, 0xf0, 0x54, 0xfc, 0x02, 0x3b), + }, + { + .uuid = "0cb4ddff-a545-4401-9d06-688af53e7f84", + .le = GUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + .be = UUID_INIT(0x0cb4ddff, 0xa545, 0x4401, 0x9d, 0x06, 0x68, 0x8a, 0xf5, 0x3e, 0x7f, 0x84), + }, +}; + +static const char * const test_uuid_wrong_data[] = { + "c33f4995-3701-450e-9fbf206a2e98e576 ", /* no hyphen(s) */ + "64b4371c-77c1-48f9-8221-29f054XX023b", /* invalid character(s) */ + "0cb4ddff-a545-4401-9d06-688af53e", /* not enough data */ +}; + +static void uuid_test_guid_valid(struct kunit *test) +{ + unsigned int i; + const struct test_uuid_data *data; + guid_t le; + + for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++) { + data = &test_uuid_test_data[i]; + KUNIT_EXPECT_EQ(test, guid_parse(data->uuid, &le), 0); + KUNIT_EXPECT_TRUE(test, guid_equal(&data->le, &le)); + } +} + +static void uuid_test_uuid_valid(struct kunit *test) +{ + unsigned int i; + const struct test_uuid_data *data; + uuid_t be; + + for (i = 0; i < ARRAY_SIZE(test_uuid_test_data); i++) { + data = &test_uuid_test_data[i]; + KUNIT_EXPECT_EQ(test, uuid_parse(data->uuid, &be), 0); + KUNIT_EXPECT_TRUE(test, uuid_equal(&data->be, &be)); + } +} + +static void uuid_test_guid_invalid(struct kunit *test) +{ + unsigned int i; + const char *uuid; + guid_t le; + + for (i = 0; i < 
ARRAY_SIZE(test_uuid_wrong_data); i++) { + uuid = test_uuid_wrong_data[i]; + KUNIT_EXPECT_EQ(test, guid_parse(uuid, &le), -EINVAL); + } +} + +static void uuid_test_uuid_invalid(struct kunit *test) +{ + unsigned int i; + const char *uuid; + uuid_t be; + + for (i = 0; i < ARRAY_SIZE(test_uuid_wrong_data); i++) { + uuid = test_uuid_wrong_data[i]; + KUNIT_EXPECT_EQ(test, uuid_parse(uuid, &be), -EINVAL); + } +} + +static struct kunit_case uuid_test_cases[] = { + KUNIT_CASE(uuid_test_guid_valid), + KUNIT_CASE(uuid_test_uuid_valid), + KUNIT_CASE(uuid_test_guid_invalid), + KUNIT_CASE(uuid_test_uuid_invalid), + {}, +}; + +static struct kunit_suite uuid_test_suite = { + .name = "uuid", + .test_cases = uuid_test_cases, +}; + +kunit_test_suite(uuid_test_suite); + +MODULE_AUTHOR("Andy Shevchenko "); +MODULE_DESCRIPTION("Test cases for lib/uuid.c module"); +MODULE_LICENSE("Dual BSD/GPL"); From 24c776355f4097316a763005434ffff716aa21a8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Dec 2025 16:51:56 -0800 Subject: [PATCH 013/107] kernel.h: drop hex.h and update all hex.h users Remove from and update all users/callers of hex.h interfaces to directly #include as part of the process of putting kernel.h on a diet. Removing hex.h from kernel.h means that 36K C source files don't have to pay the price of parsing hex.h for the roughly 120 C source files that need it. This change has been build-tested with allmodconfig on most ARCHes. Also, all users/callers of in the entire source tree have been updated if needed (if not already #included). 
Link: https://lkml.kernel.org/r/20251215005206.2362276-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Cc: Ingo Molnar Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- arch/mips/kernel/setup.c | 1 + arch/mips/rb532/devices.c | 1 + arch/powerpc/kernel/btext.c | 1 + arch/s390/kernel/alternative.c | 1 + arch/s390/kernel/stackprotector.c | 1 + arch/um/drivers/vector_kern.c | 1 + arch/xtensa/platforms/iss/network.c | 1 + certs/blacklist.c | 1 + crypto/asymmetric_keys/asymmetric_type.c | 1 + crypto/asymmetric_keys/x509_public_key.c | 1 + crypto/krb5/selftest.c | 1 + drivers/atm/nicstar.c | 1 + drivers/auxdisplay/hd44780_common.c | 1 + drivers/auxdisplay/lcd2s.c | 1 + drivers/bus/moxtet.c | 1 + drivers/char/tpm/tpm.h | 1 + drivers/comedi/drivers/jr3_pci.c | 1 + drivers/firmware/broadcom/bcm47xx_sprom.c | 1 + drivers/gpio/gpio-macsmc.c | 1 + drivers/hid/hid-picolcd_debugfs.c | 1 + drivers/hwmon/pmbus/q54sj108a2.c | 1 + drivers/hwmon/pmbus/ucd9000.c | 1 + drivers/infiniband/ulp/srp/ib_srp.c | 1 + drivers/infiniband/ulp/srpt/ib_srpt.c | 1 + drivers/input/touchscreen/iqs5xx.c | 1 + drivers/md/dm-crypt.c | 1 + drivers/md/dm-integrity.c | 1 + drivers/md/dm-verity-target.c | 1 + .../media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c | 1 + drivers/media/cec/usb/rainshadow/rainshadow-cec.c | 1 + drivers/media/i2c/ccs/ccs-reg-access.c | 1 + drivers/media/usb/pvrusb2/pvrusb2-debugifc.c | 1 + drivers/misc/kgdbts.c | 1 + drivers/misc/pch_phub.c | 1 + drivers/net/bonding/bond_options.c | 1 + drivers/net/can/can327.c | 1 + drivers/net/can/slcan/slcan-core.c | 1 + drivers/net/ethernet/chelsio/cxgb3/common.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_dbg.c | 1 + drivers/net/ethernet/micrel/ksz884x.c | 1 + drivers/net/ethernet/pasemi/pasemi_mac.c | 1 + drivers/net/netconsole.c | 1 + drivers/net/netdevsim/dev.c | 1 + drivers/net/usb/r8152.c | 1 + drivers/net/usb/usbnet.c | 1 + drivers/net/wireless/ath/ath6kl/debug.c | 1 + 
drivers/net/wireless/intel/iwlwifi/fw/debugfs.c | 1 + drivers/net/wireless/intel/iwlwifi/mld/debugfs.c | 1 + drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c | 1 + drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 1 + drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h | 1 + drivers/net/wireless/realtek/rtw89/debug.c | 1 + drivers/net/wireless/silabs/wfx/fwio.c | 1 + drivers/nvme/target/configfs.c | 1 + drivers/nvme/target/core.c | 1 + drivers/nvmem/brcm_nvram.c | 1 + drivers/nvmem/layouts/u-boot-env.c | 1 + drivers/platform/x86/intel/wmi/thunderbolt.c | 1 + drivers/pnp/support.c | 1 + drivers/ptp/ptp_pch.c | 1 + drivers/s390/cio/blacklist.c | 1 + drivers/s390/crypto/ap_bus.c | 1 + drivers/s390/crypto/zcrypt_cex4.c | 1 + drivers/s390/virtio/virtio_ccw.c | 1 + drivers/scsi/aacraid/rx.c | 1 + drivers/scsi/ips.c | 1 + drivers/scsi/libsas/sas_scsi_host.c | 1 + drivers/scsi/qla2xxx/tcm_qla2xxx.c | 1 + drivers/scsi/scsi_transport_fc.c | 1 + drivers/staging/rtl8723bs/core/rtw_ieee80211.c | 1 + drivers/target/iscsi/iscsi_target_auth.c | 1 + drivers/target/target_core_fabric_lib.c | 1 + drivers/target/target_core_spc.c | 1 + drivers/target/tcm_fc/tfc_conf.c | 1 + drivers/thunderbolt/switch.c | 1 + drivers/tty/vt/vt.c | 1 + drivers/ufs/core/ufshcd.c | 1 + drivers/usb/atm/speedtch.c | 1 + drivers/usb/atm/ueagle-atm.c | 1 + drivers/usb/gadget/function/u_ether.c | 1 + drivers/usb/gadget/function/uvc_configfs.c | 1 + drivers/usb/typec/ucsi/debugfs.c | 1 + drivers/usb/typec/ucsi/ucsi_ccg.c | 1 + drivers/watchdog/hpwdt.c | 1 + fs/adfs/dir.c | 1 + fs/binfmt_misc.c | 1 + fs/ecryptfs/ecryptfs_kernel.h | 1 + fs/efivarfs/vars.c | 1 + fs/fat/dir.c | 1 + fs/fat/namei_vfat.c | 1 + fs/gfs2/lock_dlm.c | 1 + fs/nfsd/nfs4recover.c | 1 + fs/ntfs3/ntfs_fs.h | 1 + fs/overlayfs/namei.c | 1 + fs/proc/array.c | 1 + fs/seq_file.c | 1 + fs/udf/unicode.c | 1 + include/linux/kernel.h | 1 - kernel/audit.c | 1 + kernel/bpf/core.c | 1 + kernel/bpf/syscall.c | 1 + kernel/debug/gdbstub.c | 1 + lib/hexdump.c | 1 
+ lib/string_helpers.c | 1 + lib/uuid.c | 1 + lib/vsprintf.c | 1 + net/bridge/br_sysfs_br.c | 1 + net/core/pktgen.c | 1 + net/core/utils.c | 1 + net/ipv4/arp.c | 1 + net/mac80211/debugfs_netdev.c | 1 + net/sunrpc/cache.c | 1 + net/tipc/core.h | 1 + security/integrity/evm/evm_crypto.c | 1 + security/integrity/ima/ima_api.c | 1 + security/ipe/digest.c | 1 + security/keys/encrypted-keys/encrypted.c | 1 + security/keys/trusted-keys/trusted_core.c | 1 + security/keys/trusted-keys/trusted_tpm1.c | 1 + security/loadpin/loadpin.c | 1 + security/selinux/selinuxfs.c | 1 + sound/pci/riptide/riptide.c | 1 + sound/usb/6fire/firmware.c | 1 + 123 files changed, 122 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 11b9b6b63e19..c540431ed332 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/rb532/devices.c b/arch/mips/rb532/devices.c index b7f6f782d9a1..8ecb56be81ac 100644 --- a/arch/mips/rb532/devices.c +++ b/arch/mips/rb532/devices.c @@ -7,6 +7,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index ca00c4824e31..b23dddfce26d 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include diff --git a/arch/s390/kernel/alternative.c b/arch/s390/kernel/alternative.c index 90c0e6408992..02d04ae621ba 100644 --- a/arch/s390/kernel/alternative.c +++ b/arch/s390/kernel/alternative.c @@ -4,6 +4,7 @@ #define pr_fmt(fmt) "alt: " fmt #endif +#include #include #include #include diff --git a/arch/s390/kernel/stackprotector.c b/arch/s390/kernel/stackprotector.c index d4e40483f008..8bd3ecf9200a 100644 --- a/arch/s390/kernel/stackprotector.c +++ b/arch/s390/kernel/stackprotector.c @@ -5,6 +5,7 @@ #endif #include +#include #include #include #include diff 
--git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 25d9258fa592..28cfe1c700f0 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index f0a63b2f85cc..832579143891 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) "%s: " fmt, __func__ +#include #include #include #include diff --git a/certs/blacklist.c b/certs/blacklist.c index 675dd7a8f07a..11fc858b2921 100644 --- a/certs/blacklist.c +++ b/certs/blacklist.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/asymmetric_keys/asymmetric_type.c b/crypto/asymmetric_keys/asymmetric_type.c index 348966ea2175..b7a08de58064 100644 --- a/crypto/asymmetric_keys/asymmetric_type.c +++ b/crypto/asymmetric_keys/asymmetric_type.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c index 12e3341e806b..0499b13ba3ef 100644 --- a/crypto/asymmetric_keys/x509_public_key.c +++ b/crypto/asymmetric_keys/x509_public_key.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/crypto/krb5/selftest.c b/crypto/krb5/selftest.c index 4519c572d37e..67c4accd8cbd 100644 --- a/crypto/krb5/selftest.c +++ b/crypto/krb5/selftest.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c index 45952cfea06b..bc8dbba77b87 100644 --- a/drivers/atm/nicstar.c +++ b/drivers/atm/nicstar.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/auxdisplay/hd44780_common.c b/drivers/auxdisplay/hd44780_common.c 
index 1792fe2a4460..b71db39f9249 100644 --- a/drivers/auxdisplay/hd44780_common.c +++ b/drivers/auxdisplay/hd44780_common.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include #include #include #include diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 045dbef49dee..defb0573e43c 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -11,6 +11,7 @@ * Author: Lars Pöschel * All rights reserved. */ +#include #include #include #include diff --git a/drivers/bus/moxtet.c b/drivers/bus/moxtet.c index 7ce61d629a87..5a53bfab470a 100644 --- a/drivers/bus/moxtet.c +++ b/drivers/bus/moxtet.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 02c07fef41ba..87d68ddf270a 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/drivers/comedi/drivers/jr3_pci.c b/drivers/comedi/drivers/jr3_pci.c index 61792d940a3d..51287cbc3e48 100644 --- a/drivers/comedi/drivers/jr3_pci.c +++ b/drivers/comedi/drivers/jr3_pci.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/firmware/broadcom/bcm47xx_sprom.c b/drivers/firmware/broadcom/bcm47xx_sprom.c index fdcd3a07abcd..bca03fd85808 100644 --- a/drivers/firmware/broadcom/bcm47xx_sprom.c +++ b/drivers/firmware/broadcom/bcm47xx_sprom.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpio/gpio-macsmc.c b/drivers/gpio/gpio-macsmc.c index 30ef258e7655..b0952d066a9d 100644 --- a/drivers/gpio/gpio-macsmc.c +++ b/drivers/gpio/gpio-macsmc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/drivers/hid/hid-picolcd_debugfs.c b/drivers/hid/hid-picolcd_debugfs.c index d01176da8896..085847a92e07 100644 --- a/drivers/hid/hid-picolcd_debugfs.c +++ 
b/drivers/hid/hid-picolcd_debugfs.c @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/drivers/hwmon/pmbus/q54sj108a2.c b/drivers/hwmon/pmbus/q54sj108a2.c index 4d7086d83aa3..fc030ca34480 100644 --- a/drivers/hwmon/pmbus/q54sj108a2.c +++ b/drivers/hwmon/pmbus/q54sj108a2.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/drivers/hwmon/pmbus/ucd9000.c b/drivers/hwmon/pmbus/ucd9000.c index 55e7af3a5f98..9b5d34a110ba 100644 --- a/drivers/hwmon/pmbus/ucd9000.c +++ b/drivers/hwmon/pmbus/ucd9000.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 23ed2fc688f0..2012ba22a7af 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -33,6 +33,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 71269446353d..e314e6a84d96 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -33,6 +33,7 @@ */ #include +#include #include #include #include diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index 4ebd7565ae6e..c63819abaf9b 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 79704fbc523b..cbeb5f918d09 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 170bf67a2edd..b41424a4c139 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include 
#include diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c17472d7896..ca094f14a287 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -17,6 +17,7 @@ #include "dm-verity-fec.h" #include "dm-verity-verify-sig.h" #include "dm-audit.h" +#include #include #include #include diff --git a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c index e2eff17952ab..bf92576bb2fc 100644 --- a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c +++ b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/cec/usb/rainshadow/rainshadow-cec.c b/drivers/media/cec/usb/rainshadow/rainshadow-cec.c index 08f58456d682..6c0cee4b066f 100644 --- a/drivers/media/cec/usb/rainshadow/rainshadow-cec.c +++ b/drivers/media/cec/usb/rainshadow/rainshadow-cec.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/i2c/ccs/ccs-reg-access.c b/drivers/media/i2c/ccs/ccs-reg-access.c index fd36889ccc1d..a0181a5d2f34 100644 --- a/drivers/media/i2c/ccs/ccs-reg-access.c +++ b/drivers/media/i2c/ccs/ccs-reg-access.c @@ -12,6 +12,7 @@ #include #include +#include #include #include "ccs.h" diff --git a/drivers/media/usb/pvrusb2/pvrusb2-debugifc.c b/drivers/media/usb/pvrusb2/pvrusb2-debugifc.c index 81d711269ab5..9f936085acbb 100644 --- a/drivers/media/usb/pvrusb2/pvrusb2-debugifc.c +++ b/drivers/media/usb/pvrusb2/pvrusb2-debugifc.c @@ -4,6 +4,7 @@ * Copyright (C) 2005 Mike Isely */ +#include #include #include "pvrusb2-debugifc.h" #include "pvrusb2-hdw.h" diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index 0cf31164b470..3b7a041ea351 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include 
diff --git a/drivers/misc/pch_phub.c b/drivers/misc/pch_phub.c index 7bee179841bc..0d63e834dbe7 100644 --- a/drivers/misc/pch_phub.c +++ b/drivers/misc/pch_phub.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 384499c869b8..fa65a0e92b8e 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/can/can327.c b/drivers/net/can/can327.c index b66fc16aedd2..90f5e35f3c8f 100644 --- a/drivers/net/can/can327.c +++ b/drivers/net/can/can327.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c index cd789e178d34..7439849d5c84 100644 --- a/drivers/net/can/slcan/slcan-core.c +++ b/drivers/net/can/slcan/slcan-core.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/chelsio/cxgb3/common.h b/drivers/net/ethernet/chelsio/cxgb3/common.h index ecd025dda8d6..14000977730c 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/common.h +++ b/drivers/net/ethernet/chelsio/cxgb3/common.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_dbg.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_dbg.c index 030a5776c937..8803fa071c50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_dbg.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_dbg.c @@ -2,6 +2,7 @@ // Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
#include +#include #include #include #include diff --git a/drivers/net/ethernet/micrel/ksz884x.c b/drivers/net/ethernet/micrel/ksz884x.c index cdde19b8edc4..4980a6e44607 100644 --- a/drivers/net/ethernet/micrel/ksz884x.c +++ b/drivers/net/ethernet/micrel/ksz884x.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c index fe58024b5901..00909372ea61 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c index 9cb4dfc242f5..bbf9c02e09d4 100644 --- a/drivers/net/netconsole.c +++ b/drivers/net/netconsole.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 2683a989873e..351ff4ed3eac 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index fa5192583860..29179e582067 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 36742e64cff7..960f200cd52c 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/wireless/ath/ath6kl/debug.c b/drivers/net/wireless/ath/ath6kl/debug.c index b837d31416df..84403aab21c0 100644 --- a/drivers/net/wireless/ath/ath6kl/debug.c +++ b/drivers/net/wireless/ath/ath6kl/debug.c @@ -19,6 +19,7 @@ #include #include +#include #include #include diff --git 
a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c index 3b0e8c43ba4a..3c4bee85b825 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/debugfs.c @@ -7,6 +7,7 @@ #include "api/commands.h" #include "debugfs.h" #include "dbg.h" +#include #include #define FWRT_DEBUGFS_OPEN_WRAPPER(name, buflen, argtype) \ diff --git a/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c index b9c9cd3f44e4..ce2fc98782c3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c @@ -24,6 +24,7 @@ #include "fw/api/rfi.h" #include "fw/dhc-utils.h" #include +#include #define MLD_DEBUGFS_READ_FILE_OPS(name, bufsz) \ _MLD_DEBUGFS_READ_FILE_OPS(name, bufsz, struct iwl_mld) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 683c0ba5fb39..e6b9896dc4ac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index 301d590fe0bd..d7e9c2b7980e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifdef CONFIG_THERMAL diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h index e16865dd8e52..c93fd245c90f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h +++ b/drivers/net/wireless/mediatek/mt76/mt7615/mt7615.h @@ -5,6 +5,7 @@ #define __MT7615_H #include +#include #include #include #include diff --git a/drivers/net/wireless/realtek/rtw89/debug.c b/drivers/net/wireless/realtek/rtw89/debug.c index 
1264c2f82600..8666e26c8c80 100644 --- a/drivers/net/wireless/realtek/rtw89/debug.c +++ b/drivers/net/wireless/realtek/rtw89/debug.c @@ -2,6 +2,7 @@ /* Copyright(c) 2019-2020 Realtek Corporation */ +#include #include #include "coex.h" diff --git a/drivers/net/wireless/silabs/wfx/fwio.c b/drivers/net/wireless/silabs/wfx/fwio.c index 52c7f560b062..edd5ac30ed19 100644 --- a/drivers/net/wireless/silabs/wfx/fwio.c +++ b/drivers/net/wireless/silabs/wfx/fwio.c @@ -6,6 +6,7 @@ * Copyright (c) 2010, ST-Ericsson */ #include +#include #include #include #include diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e44ef69dffc2..127dae51fec1 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -4,6 +4,7 @@ * Copyright (c) 2015-2016 HGST, a Western Digital Company. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index cc88e5a28c8a..eab3e4fc0f74 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -4,6 +4,7 @@ * Copyright (c) 2015-2016 HGST, a Western Digital Company. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/drivers/nvmem/brcm_nvram.c b/drivers/nvmem/brcm_nvram.c index b4cf245fb246..2dce6a7b8039 100644 --- a/drivers/nvmem/brcm_nvram.c +++ b/drivers/nvmem/brcm_nvram.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include diff --git a/drivers/nvmem/layouts/u-boot-env.c b/drivers/nvmem/layouts/u-boot-env.c index ab32bf1291af..f27f387bb52a 100644 --- a/drivers/nvmem/layouts/u-boot-env.c +++ b/drivers/nvmem/layouts/u-boot-env.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/platform/x86/intel/wmi/thunderbolt.c b/drivers/platform/x86/intel/wmi/thunderbolt.c index 08df560a2c7a..15e5763a20dd 100644 --- a/drivers/platform/x86/intel/wmi/thunderbolt.c +++ b/drivers/platform/x86/intel/wmi/thunderbolt.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/pnp/support.c b/drivers/pnp/support.c index a6073db10ec6..f6c866851769 100644 --- a/drivers/pnp/support.c +++ b/drivers/pnp/support.c @@ -9,6 +9,7 @@ #include #include +#include #include #include "base.h" diff --git a/drivers/ptp/ptp_pch.c b/drivers/ptp/ptp_pch.c index b8a9a54a176c..f854da2fd812 100644 --- a/drivers/ptp/ptp_pch.c +++ b/drivers/ptp/ptp_pch.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/drivers/s390/cio/blacklist.c b/drivers/s390/cio/blacklist.c index 738d5e2d5304..020d210bde9f 100644 --- a/drivers/s390/cio/blacklist.c +++ b/drivers/s390/cio/blacklist.c @@ -10,6 +10,7 @@ #define pr_fmt(fmt) "cio: " fmt +#include #include #include #include diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index a445494fd2be..6b1b7b014816 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/s390/crypto/zcrypt_cex4.c 
b/drivers/s390/crypto/zcrypt_cex4.c index 6ba7fbddd3f7..e9a984903bff 100644 --- a/drivers/s390/crypto/zcrypt_cex4.c +++ b/drivers/s390/crypto/zcrypt_cex4.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 4904b831c0a7..1653cc668dcf 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/drivers/scsi/aacraid/rx.c b/drivers/scsi/aacraid/rx.c index e06ff83b69ce..ba9f3256c258 100644 --- a/drivers/scsi/aacraid/rx.c +++ b/drivers/scsi/aacraid/rx.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 3393a288fd23..40af961382dc 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -167,6 +167,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c index ffa5b49aaf08..da02457f0b09 100644 --- a/drivers/scsi/libsas/sas_scsi_host.c +++ b/drivers/scsi/libsas/sas_scsi_host.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "sas_internal.h" diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index 2fff68935338..9f16164faa1e 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 987befb02408..6bd68f493f20 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/staging/rtl8723bs/core/rtw_ieee80211.c b/drivers/staging/rtl8723bs/core/rtw_ieee80211.c index 8fdeeda88a6d..e89b24fa5e05 100644 --- 
a/drivers/staging/rtl8723bs/core/rtw_ieee80211.c +++ b/drivers/staging/rtl8723bs/core/rtw_ieee80211.c @@ -6,6 +6,7 @@ ******************************************************************************/ #include +#include #include #include diff --git a/drivers/target/iscsi/iscsi_target_auth.c b/drivers/target/iscsi/iscsi_target_auth.c index c8a248bd11be..2c4d583fe3e6 100644 --- a/drivers/target/iscsi/iscsi_target_auth.c +++ b/drivers/target/iscsi/iscsi_target_auth.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c index ec7bc6e30228..87c5d26a5089 100644 --- a/drivers/target/target_core_fabric_lib.c +++ b/drivers/target/target_core_fabric_lib.c @@ -16,6 +16,7 @@ * on the formats implemented in this file. */ +#include #include #include #include diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index fe2b888bcb43..6360b66c7445 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -7,6 +7,7 @@ * Nicholas A. 
Bellinger */ +#include #include #include #include diff --git a/drivers/target/tcm_fc/tfc_conf.c b/drivers/target/tcm_fc/tfc_conf.c index f686d95d3273..a29b20b5f78e 100644 --- a/drivers/target/tcm_fc/tfc_conf.c +++ b/drivers/target/tcm_fc/tfc_conf.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c index b3948aad0b95..e2732c575bad 100644 --- a/drivers/thunderbolt/switch.c +++ b/drivers/thunderbolt/switch.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 59b4b5e126ba..edda91bfdf62 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 604043a7533d..31950fc51a4c 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/atm/speedtch.c b/drivers/usb/atm/speedtch.c index 773ac2725532..e6b610a87482 100644 --- a/drivers/usb/atm/speedtch.c +++ b/drivers/usb/atm/speedtch.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c index cd0f7b4bd82a..78a2585f33ec 100644 --- a/drivers/usb/atm/ueagle-atm.c +++ b/drivers/usb/atm/ueagle-atm.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index f58590bf5e02..c47965d850d4 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index 
a4a2d3dcb0d6..5a87516ddb31 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -12,6 +12,7 @@ #include "uvc_configfs.h" +#include #include #include #include diff --git a/drivers/usb/typec/ucsi/debugfs.c b/drivers/usb/typec/ucsi/debugfs.c index f3684ab787fe..d1f5832165c3 100644 --- a/drivers/usb/typec/ucsi/debugfs.c +++ b/drivers/usb/typec/ucsi/debugfs.c @@ -8,6 +8,7 @@ * Gopal Saranya */ #include +#include #include #include #include diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index d83a0051c737..199799b319c2 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index ae30e394d176..2a848c35c14d 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 77fbd196008f..4f9dc276da6f 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -6,6 +6,7 @@ * * Common directory handling for ADFS */ +#include #include #include "adfs.h" diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 8cb1a94339b8..2b772613a74c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 62a2ea7f59ed..0acc1e638454 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c index 6edc10958ecf..798a1bc36022 100644 --- a/fs/efivarfs/vars.c +++ b/fs/efivarfs/vars.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/fat/dir.c 
b/fs/fat/dir.c index 92b091783966..af7dedf8adcb 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -16,6 +16,7 @@ #include #include +#include #include #include #include "fat.h" diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 5dbc4cbb8fce..4f3cc2b3089e 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "fat.h" diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index b8d249925395..065ade6a1192 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 441dfbfe2d2b..1e6b2dd47ba7 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index a4559c9f64e6..f18349689458 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e9a69c95be91..cda26bdef3b9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/array.c b/fs/proc/array.c index 42932f88141a..39e9246f6e4a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -55,6 +55,7 @@ #include #include +#include #include #include #include diff --git a/fs/seq_file.c b/fs/seq_file.c index 8bbb1ad46335..8894cbde8d3a 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 32c7f3d27f74..87580ff827ee 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -16,6 +16,7 @@ #include "udfdecl.h" +#include #include #include /* for memset */ #include diff --git a/include/linux/kernel.h 
b/include/linux/kernel.h index 5b46924fdff5..35b8f2a5aca5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/audit.c b/kernel/audit.c index 26a332ffb1b8..2f2db2907055 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1b9b18e5b03c..f1c5fc66ef01 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ff82144f885..4216de60e371 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 22fe969c5d2e..f586afd76c80 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -27,6 +27,7 @@ #include #include +#include #include #include #include diff --git a/lib/hexdump.c b/lib/hexdump.c index c3db7c3a7643..2e5cd8c24769 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ffb8ead6d4cd..8cb6f66c9c2b 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/uuid.c b/lib/uuid.c index e309b4c5be3d..e8543c668dc7 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/lib/vsprintf.c b/lib/vsprintf.c index a3790c43a0ab..800b8ac49f53 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/net/bridge/br_sysfs_br.c 
b/net/bridge/br_sysfs_br.c index cb4855ed9500..dcd727345cac 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d41b03fd1f63..8e185b318288 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -126,6 +126,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/utils.c b/net/core/utils.c index 5e63b0ea21f3..dd86913988f4 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c8c3e1713c0e..51d70180e1cc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 30a5a978a678..f3c6a41e4911 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 131090f31e6a..d808c0b63f30 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/net/tipc/core.h b/net/tipc/core.h index 7f3fe3401c45..9ce5f9ff6cc0 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index a5e730ffda57..465a32f59c11 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -13,6 +13,7 @@ #define pr_fmt(fmt) "EVM: "fmt #include +#include #include #include #include diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c index c35ea613c9f8..c6d1c7be8a3e 100644 --- 
a/security/integrity/ima/ima_api.c +++ b/security/integrity/ima/ima_api.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/security/ipe/digest.c b/security/ipe/digest.c index 493716370570..5006366837ba 100644 --- a/security/ipe/digest.c +++ b/security/ipe/digest.c @@ -3,6 +3,7 @@ * Copyright (C) 2020-2024 Microsoft Corporation. All rights reserved. */ +#include #include "digest.h" /** diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 596e7a30bd3c..56b531587a1e 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/security/keys/trusted-keys/trusted_core.c b/security/keys/trusted-keys/trusted_core.c index b1680ee53f86..16168ba5c83e 100644 --- a/security/keys/trusted-keys/trusted_core.c +++ b/security/keys/trusted-keys/trusted_core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c index 636acb66a4f6..c865c97aa1b4 100644 --- a/security/keys/trusted-keys/trusted_tpm1.c +++ b/security/keys/trusted-keys/trusted_tpm1.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c index 273ffbd6defe..019840006096 100644 --- a/security/loadpin/loadpin.c +++ b/security/loadpin/loadpin.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 896acad1f5f7..4d58c7ad1a23 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/sound/pci/riptide/riptide.c b/sound/pci/riptide/riptide.c index e983cd657e28..f91fe64bf4f9 100644 --- 
a/sound/pci/riptide/riptide.c +++ b/sound/pci/riptide/riptide.c @@ -75,6 +75,7 @@ */ #include +#include #include #include #include diff --git a/sound/usb/6fire/firmware.c b/sound/usb/6fire/firmware.c index c51abc54d2f8..cc8caec946cc 100644 --- a/sound/usb/6fire/firmware.c +++ b/sound/usb/6fire/firmware.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "firmware.h" From 436debc9cad892576d4f3287446b64474922764c Mon Sep 17 00:00:00 2001 From: Alejandro Colomar Date: Thu, 11 Dec 2025 11:43:49 +0100 Subject: [PATCH 014/107] array_size.h: add ARRAY_END() Patch series "Add ARRAY_END(), and use it to fix off-by-one bugs", v6. Add ARRAY_END(), and use it to fix off-by-one bugs ARRAY_END() is a macro to calculate a pointer to one past the last element of an array argument. This is a very common pointer, which is used to iterate over all elements of an array: for (T *p = a; p < ARRAY_END(a); p++) ... Of course, this pointer should never be dereferenced. A pointer one past the last element of an array should not be dereferenced; it's perfectly fine to hold such a pointer --and a good thing to do--, but the only thing it should be used for is comparing it with other pointers derived from the same array. Due to how special these pointers are, it would be good to use consistent naming. It's common to name such a pointer 'end' --in fact, we have many such cases in the kernel--. C++ even standardized this name with std::end(). Let's try naming such pointers 'end', and try also avoid using 'end' for pointers that are not the result of ARRAY_END(). It has been incorrectly suggested that these pointers are dangerous, and that they should never be used, suggesting to use something like #define ARRAY_LAST(a) ((a) + ARRAY_SIZE(a) - 1) for (T *p = a; p <= ARRAY_LAST(a); p++) ... This is bogus, as it doesn't scale down to arrays of 0 elements. 
In the case of an array of 0 elements, ARRAY_LAST() would underflow the pointer, which not only can't be dereferenced, it can't even be held (it produces Undefined Behavior). That would be a footgun. Such arrays don't exist per the ISO C standard; however, GCC supports them as an extension (with partial support, though; GCC has a few bugs which need to be fixed). This patch set fixes a few places where it was intended to use the array end (that is, one past the last element), but accidentally a pointer to the last element was used instead, thus wasting one byte. It also replaces other places where the array end was correctly calculated with ARRAY_SIZE(), by using the simpler ARRAY_END(). Also, there was one drivers/ file that already defined this macro. We remove that definition, to not conflict with this one. This patch (of 4): ARRAY_END() returns a pointer one past the last element in the array argument. This pointer is useful for iterating over the elements of an array: for (T *p = a; p < ARRAY_END(a); p++) ... Link: https://lkml.kernel.org/r/cover.1765449750.git.alx@kernel.org Link: https://lkml.kernel.org/r/5973cfb674192bc8e533485dbfb54e3062896be1.1765449750.git.alx@kernel.org Signed-off-by: Alejandro Colomar Cc: Kees Cook Cc: Christopher Bazley Cc: Rasmus Villemoes Cc: Marco Elver Cc: Michal Hocko Cc: Linus Torvalds Cc: Al Viro Cc: Alexander Potapenko Cc: Dmitriy Vyukov Cc: Jann Horn Cc: Maciej W. 
Rozycki Signed-off-by: Andrew Morton --- drivers/block/floppy.c | 2 -- include/linux/array_size.h | 6 ++++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c28786e0fe1c..92e446a64371 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4802,8 +4802,6 @@ static void floppy_release_allocated_regions(int fdc, const struct io_region *p) } } -#define ARRAY_END(X) (&((X)[ARRAY_SIZE(X)])) - static int floppy_request_regions(int fdc) { const struct io_region *p; diff --git a/include/linux/array_size.h b/include/linux/array_size.h index 06d7d83196ca..0c4fec98822e 100644 --- a/include/linux/array_size.h +++ b/include/linux/array_size.h @@ -10,4 +10,10 @@ */ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +/** + * ARRAY_END - get a pointer to one past the last element in array @arr + * @arr: array + */ +#define ARRAY_END(arr) (&(arr)[ARRAY_SIZE(arr)]) + #endif /* _LINUX_ARRAY_SIZE_H */ From 8118f197b7b738285eb4b66a80d01a5c8f35e231 Mon Sep 17 00:00:00 2001 From: Alejandro Colomar Date: Thu, 11 Dec 2025 11:43:54 +0100 Subject: [PATCH 015/107] mm: fix benign off-by-one bugs We were wasting a byte due to an off-by-one bug. s[c]nprintf() doesn't write more than $2 bytes including the null byte, so trying to pass 'size-1' there is wasting one byte. Link: https://lkml.kernel.org/r/9c38dd009c17b0219889c7089d9bdde5aaf28a8e.1765449750.git.alx@kernel.org Signed-off-by: Alejandro Colomar Acked-by: Marco Elver Cc: Kees Cook Cc: Christopher Bazley Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Cc: Linus Torvalds Cc: Rasmus Villemoes Cc: Michal Hocko Cc: Al Viro Cc: Maciej W. 
Rozycki Signed-off-by: Andrew Morton --- mm/kfence/kfence_test.c | 4 ++-- mm/kmsan/kmsan_test.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 00034e37bc9f..5725a367246d 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -110,7 +110,7 @@ static bool report_matches(const struct expect_report *r) /* Title */ cur = expect[0]; - end = &expect[0][sizeof(expect[0]) - 1]; + end = ARRAY_END(expect[0]); switch (r->type) { case KFENCE_ERROR_OOB: cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s", @@ -140,7 +140,7 @@ static bool report_matches(const struct expect_report *r) /* Access information */ cur = expect[1]; - end = &expect[1][sizeof(expect[1]) - 1]; + end = ARRAY_END(expect[1]); switch (r->type) { case KFENCE_ERROR_OOB: diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 902ec48b1e3e..b5ad5dfb2c00 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -105,7 +105,7 @@ static bool report_matches(const struct expect_report *r) /* Title */ cur = expected_header; - end = &expected_header[sizeof(expected_header) - 1]; + end = ARRAY_END(expected_header); cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); From a9e5620c9a9e237b3344702dec0839b89159a060 Mon Sep 17 00:00:00 2001 From: Alejandro Colomar Date: Thu, 11 Dec 2025 11:44:00 +0100 Subject: [PATCH 016/107] kernel: fix off-by-one benign bugs We were wasting a byte due to an off-by-one bug. s[c]nprintf() doesn't write more than $2 bytes including the null byte, so trying to pass 'size-1' there is wasting one byte. This is essentially the same as the previous commit, in a different file. 
Link: https://lkml.kernel.org/r/b4a945a4d40b7104364244f616eb9fb9f1fa691f.1765449750.git.alx@kernel.org Signed-off-by: Alejandro Colomar Cc: Marco Elver Cc: Kees Cook Cc: Christopher Bazley Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Cc: Linus Torvalds Cc: Rasmus Villemoes Cc: Marco Elver Cc: Michal Hocko Cc: Al Viro Cc: Maciej W. Rozycki Signed-off-by: Andrew Morton --- kernel/kcsan/kcsan_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 219d22857c98..8ef8167be745 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -176,7 +176,7 @@ static bool __report_matches(const struct expect_report *r) /* Title */ cur = expect[0]; - end = &expect[0][sizeof(expect[0]) - 1]; + end = ARRAY_END(expect[0]); cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ", is_assert ? "assert: race" : "data-race"); if (r->access[1].fn) { @@ -200,7 +200,7 @@ static bool __report_matches(const struct expect_report *r) /* Access 1 */ cur = expect[1]; - end = &expect[1][sizeof(expect[1]) - 1]; + end = ARRAY_END(expect[1]); if (!r->access[1].fn) cur += scnprintf(cur, end - cur, "race at unknown origin, with "); From 61e9210e23921cf1176af23b426b9bad8b08ffff Mon Sep 17 00:00:00 2001 From: Alejandro Colomar Date: Thu, 11 Dec 2025 11:44:04 +0100 Subject: [PATCH 017/107] mm: use ARRAY_END() instead of open-coding it There aren't any bugs in this code; it's purely cosmetic. By using ARRAY_END(), we prevent future issues, in case the code is modified; it has less moving parts. Also, it should be more readable (and perhaps more importantly, greppable), as there are several ways of writing an expression that gets the end of an array, which are unified by this API name. 
Link: https://lkml.kernel.org/r/2335917d123891fec074ab1b3acfb517cf14b5a7.1765449750.git.alx@kernel.org Signed-off-by: Alejandro Colomar Cc: Kees Cook Cc: Linus Torvalds Cc: Alexander Potapenko Cc: Al Viro Cc: Christopher Bazley Cc: Dmitriy Vyukov Cc: Jann Horn Cc: Maciej W. Rozycki Cc: Marco Elver Cc: Michal Hocko Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- mm/memcontrol-v1.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 1ac56ceb29b6..fe33f2edfe07 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -510,7 +510,7 @@ static void mem_pool_free(struct kmemleak_object *object) { unsigned long flags; - if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { + if (object < mem_pool || object >= ARRAY_END(mem_pool)) { kmem_cache_free(object_cache, object); return; } diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 6eed14bff742..b2f37bd939fa 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1794,7 +1794,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) mem_cgroup_flush_stats(memcg); - for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + for (stat = stats; stat < ARRAY_END(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, false)); @@ -1805,7 +1805,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) seq_putc(m, '\n'); } - for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + for (stat = stats; stat < ARRAY_END(stats); stat++) { seq_printf(m, "hierarchical_%s=%lu", stat->name, mem_cgroup_nr_lru_pages(memcg, stat->lru_mask, From 76103d1b268e6f45735aa92d70bea5b5e8174a70 Mon Sep 17 00:00:00 2001 From: Pnina Feder Date: Tue, 16 Dec 2025 15:28:00 +0200 Subject: [PATCH 018/107] kernel: vmcoreinfo: allocate vmcoreinfo_data based on VMCOREINFO_BYTES Patch series "vmcoreinfo: support VMCOREINFO_BYTES larger than PAGE_SIZE". 
VMCOREINFO_BYTES is defined as a configurable size, but multiple code paths implicitly assume it always fits into a single page. This series removes that assumption by allocating and mapping vmcoreinfo based on its actual size. Patch 1 updates vmcoreinfo allocation to use get_order(VMCOREINFO_BYTES). Patch 2 updates crash kernel handling to correctly allocate and map multiple pages when copying vmcoreinfo. This makes vmcoreinfo size consistent across the kernel and avoids future breakage if VMCOREINFO_BYTES grows. (No functional change when VMCOREINFO_BYTES == PAGE_SIZE.) This patch (of 2): VMCOREINFO_BYTES defines the size of vmcoreinfo data, but the current implementation assumes a single page allocation. Allocate vmcoreinfo_data using get_order(VMCOREINFO_BYTES) so that vmcoreinfo can safely grow beyond PAGE_SIZE. This avoids hidden assumptions and keeps vmcoreinfo size consistent across the kernel. Link: https://lkml.kernel.org/r/20251216132801.807260-1-pnina.feder@mobileye.com Link: https://lkml.kernel.org/r/20251216132801.807260-2-pnina.feder@mobileye.com Signed-off-by: Pnina Feder Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/vmcore_info.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c index fe9bf8db1922..22b3205dd4dc 100644 --- a/kernel/vmcore_info.c +++ b/kernel/vmcore_info.c @@ -137,7 +137,9 @@ EXPORT_SYMBOL_GPL(hwerr_log_error_type); static int __init crash_save_vmcoreinfo_init(void) { - vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL); + int order; + order = get_order(VMCOREINFO_BYTES); + vmcoreinfo_data = (unsigned char *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); if (!vmcoreinfo_data) { pr_warn("Memory allocation for vmcoreinfo_data failed\n"); return -ENOMEM; @@ -146,7 +148,7 @@ static int __init crash_save_vmcoreinfo_init(void) vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE, GFP_KERNEL 
| __GFP_ZERO); if (!vmcoreinfo_note) { - free_page((unsigned long)vmcoreinfo_data); + free_pages((unsigned long)vmcoreinfo_data, order); vmcoreinfo_data = NULL; pr_warn("Memory allocation for vmcoreinfo_note failed\n"); return -ENOMEM; From b5bfcc1ffe512c7879cb90befdeabaa43d9f07ca Mon Sep 17 00:00:00 2001 From: Pnina Feder Date: Tue, 16 Dec 2025 15:28:01 +0200 Subject: [PATCH 019/107] kernel/crash: handle multi-page vmcoreinfo in crash kernel copy kimage_crash_copy_vmcoreinfo() currently assumes vmcoreinfo fits in a single page. This breaks if VMCOREINFO_BYTES exceeds PAGE_SIZE. Allocate the required order of control pages and vmap all pages needed to safely copy vmcoreinfo into the crash kernel image. Link: https://lkml.kernel.org/r/20251216132801.807260-3-pnina.feder@mobileye.com Signed-off-by: Pnina Feder Reviewed-by: Andrew Morton Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 99dac1aa972a..3952b3e102e0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -44,9 +44,15 @@ note_buf_t __percpu *crash_notes; int kimage_crash_copy_vmcoreinfo(struct kimage *image) { - struct page *vmcoreinfo_page; + struct page *vmcoreinfo_base; + struct page *vmcoreinfo_pages[DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE)]; + unsigned int order, nr_pages; + int i; void *safecopy; + nr_pages = DIV_ROUND_UP(VMCOREINFO_BYTES, PAGE_SIZE); + order = get_order(VMCOREINFO_BYTES); + if (!IS_ENABLED(CONFIG_CRASH_DUMP)) return 0; if (image->type != KEXEC_TYPE_CRASH) @@ -61,12 +67,15 @@ int kimage_crash_copy_vmcoreinfo(struct kimage *image) * happens to generate vmcoreinfo note, hereby we rely on * vmap for this purpose. 
*/ - vmcoreinfo_page = kimage_alloc_control_pages(image, 0); - if (!vmcoreinfo_page) { + vmcoreinfo_base = kimage_alloc_control_pages(image, order); + if (!vmcoreinfo_base) { pr_warn("Could not allocate vmcoreinfo buffer\n"); return -ENOMEM; } - safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); + for (i = 0; i < nr_pages; i++) + vmcoreinfo_pages[i] = vmcoreinfo_base + i; + + safecopy = vmap(vmcoreinfo_pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!safecopy) { pr_warn("Could not vmap vmcoreinfo buffer\n"); return -ENOMEM; From e700f5d1560798aacf0e56fdcc70ee2c20bf56ec Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 16 Dec 2025 02:45:21 -0500 Subject: [PATCH 020/107] watchdog: softlockup: panic when lockup duration exceeds N thresholds The softlockup_panic sysctl is currently a binary option: panic immediately or never panic on soft lockups. Panicking on any soft lockup, regardless of duration, can be overly aggressive for brief stalls that may be caused by legitimate operations. Conversely, never panicking may allow severe system hangs to persist undetected. Extend softlockup_panic to accept an integer threshold, allowing the kernel to panic only when the normalized lockup duration exceeds N watchdog threshold periods. This provides finer-grained control to distinguish between transient delays and persistent system failures. The accepted values are: - 0: Don't panic (unchanged) - 1: Panic when duration >= 1 * threshold (20s default, original behavior) - N > 1: Panic when duration >= N * threshold (e.g., 2 = 40s, 3 = 60s.) The original behavior is preserved for values 0 and 1, maintaining full backward compatibility while allowing systems to tolerate brief lockups while still catching severe, persistent hangs. 
[lirongqing@baidu.com: v2] Link: https://lkml.kernel.org/r/20251218074300.4080-1-lirongqing@baidu.com Link: https://lkml.kernel.org/r/20251216074521.2796-1-lirongqing@baidu.com Signed-off-by: Li RongQing Cc: Eduard Zingerman Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Lance Yang Cc: Martin KaFai Lau Cc: Nicholas Piggin Cc: Song Liu Cc: Stanislav Fomichev Cc: Yonghong Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 10 +++++----- arch/arm/configs/aspeed_g5_defconfig | 2 +- arch/arm/configs/pxa3xx_defconfig | 2 +- arch/openrisc/configs/or1klitex_defconfig | 2 +- arch/powerpc/configs/skiroot_defconfig | 2 +- drivers/gpu/drm/ci/arm.config | 2 +- drivers/gpu/drm/ci/arm64.config | 2 +- drivers/gpu/drm/ci/x86_64.config | 2 +- kernel/configs/debug.config | 2 +- kernel/watchdog.c | 10 ++++++---- lib/Kconfig.debug | 13 +++++++------ tools/testing/selftests/bpf/config | 2 +- .../testing/selftests/wireguard/qemu/kernel.config | 2 +- 13 files changed, 28 insertions(+), 25 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1058f2a6d6a8..73d846211144 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6969,12 +6969,12 @@ Kernel parameters softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: 0 | 1 + Format: - A value of 1 instructs the soft-lockup detector - to panic the machine when a soft-lockup occurs. It is - also controlled by the kernel.softlockup_panic sysctl - and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the + A value of non-zero instructs the soft-lockup detector + to panic the machine when a soft-lockup duration exceeds + N thresholds. It is also controlled by the kernel.softlockup_panic + sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the respective build-time switch to that functionality. 
softlockup_all_cpu_backtrace= diff --git a/arch/arm/configs/aspeed_g5_defconfig b/arch/arm/configs/aspeed_g5_defconfig index 2e6ea13c1e9b..ec558e57d081 100644 --- a/arch/arm/configs/aspeed_g5_defconfig +++ b/arch/arm/configs/aspeed_g5_defconfig @@ -306,7 +306,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_PANIC_ON_OOPS=y CONFIG_PANIC_TIMEOUT=-1 CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_WQ_WATCHDOG=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/arm/configs/pxa3xx_defconfig b/arch/arm/configs/pxa3xx_defconfig index 07d422f0ff34..fb272e3a2337 100644 --- a/arch/arm/configs/pxa3xx_defconfig +++ b/arch/arm/configs/pxa3xx_defconfig @@ -100,7 +100,7 @@ CONFIG_PRINTK_TIME=y CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SHIRQ=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 # CONFIG_SCHED_DEBUG is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index fb1eb9a68bd6..984b0e3b2768 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -52,5 +52,5 @@ CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf" CONFIG_PRINTK_TIME=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BUG_ON_DATA_CORRUPTION=y diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 2b71a6dc399e..a4114fca5a39 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -289,7 +289,7 @@ CONFIG_SCHED_STACK_END_CHECK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y 
CONFIG_WQ_WATCHDOG=y diff --git a/drivers/gpu/drm/ci/arm.config b/drivers/gpu/drm/ci/arm.config index 411e814819a8..d7c51670da2f 100644 --- a/drivers/gpu/drm/ci/arm.config +++ b/drivers/gpu/drm/ci/arm.config @@ -52,7 +52,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=n -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=n +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_FW_LOADER_COMPRESS=y diff --git a/drivers/gpu/drm/ci/arm64.config b/drivers/gpu/drm/ci/arm64.config index fddfbd4d2493..ea0e30737c4d 100644 --- a/drivers/gpu/drm/ci/arm64.config +++ b/drivers/gpu/drm/ci/arm64.config @@ -161,7 +161,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/drivers/gpu/drm/ci/x86_64.config b/drivers/gpu/drm/ci/x86_64.config index 8eaba388b141..7ac98a78691e 100644 --- a/drivers/gpu/drm/ci/x86_64.config +++ b/drivers/gpu/drm/ci/x86_64.config @@ -47,7 +47,7 @@ CONFIG_TMPFS=y CONFIG_PROVE_LOCKING=n CONFIG_DEBUG_LOCKDEP=n CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_DETECT_HUNG_TASK=y diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 9f6ab7dabf67..774702591d26 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -84,7 +84,7 @@ CONFIG_SLUB_DEBUG_ON=y # Debug Oops, Lockups and Hangs # CONFIG_BOOTPARAM_HUNG_TASK_PANIC=0 -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=0 CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DETECT_HUNG_TASK=y CONFIG_PANIC_ON_OOPS=y diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 366122f4a0f8..b4d5fbdb933a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,7 +363,7 @@ static struct cpumask watchdog_allowed_mask __read_mostly; /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = - 
IS_ENABLED(CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC); + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC; static bool softlockup_initialized __read_mostly; static u64 __read_mostly sample_period; @@ -774,8 +774,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); - int duration; int softlockup_all_cpu_backtrace; + int duration, thresh_count; unsigned long flags; if (!watchdog_enabled) @@ -879,7 +879,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); sys_info(softlockup_si_mask & ~SYS_INFO_ALL_BT); - if (softlockup_panic) + thresh_count = duration / get_softlockup_thresh(); + + if (softlockup_panic && thresh_count >= softlockup_panic) panic("softlockup: hung tasks"); } @@ -1228,7 +1230,7 @@ static const struct ctl_table watchdog_sysctls[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "softlockup_sys_info", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4bfca37f313e..947e62e92da8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1110,13 +1110,14 @@ config SOFTLOCKUP_DETECTOR_INTR_STORM the CPU stats and the interrupt counts during the "soft lockups". config BOOTPARAM_SOFTLOCKUP_PANIC - bool "Panic (Reboot) On Soft Lockups" + int "Panic (Reboot) On Soft Lockups" depends on SOFTLOCKUP_DETECTOR + default 0 help - Say Y here to enable the kernel to panic on "soft lockups", - which are bugs that cause the kernel to loop in kernel - mode for more than 20 seconds (configurable using the watchdog_thresh - sysctl), without giving other tasks a chance to run. 
+ Set to a non-zero value N to enable the kernel to panic on "soft + lockups", which are bugs that cause the kernel to loop in kernel + mode for more than (N * 20 seconds) (configurable using the + watchdog_thresh sysctl), without giving other tasks a chance to run. The panic can be used in combination with panic_timeout, to cause the system to reboot automatically after a @@ -1124,7 +1125,7 @@ config BOOTPARAM_SOFTLOCKUP_PANIC high-availability systems that have uptime guarantees and where a lockup must be resolved ASAP. - Say N if unsure. + Say 0 if unsure. config HAVE_HARDLOCKUP_DETECTOR_BUDDY bool diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 558839e3c185..24855381290d 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -1,6 +1,6 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BPF=y CONFIG_BPF_EVENTS=y CONFIG_BPF_JIT=y diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0504c11c2de6..bb89d2dfaa2a 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -80,7 +80,7 @@ CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_WQ_WATCHDOG=y CONFIG_DETECT_HUNG_TASK=y CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=1 CONFIG_BOOTPARAM_HUNG_TASK_PANIC=1 CONFIG_PANIC_TIMEOUT=-1 CONFIG_STACKTRACE=y From b8f690f6d1d9008ffab1f58fffc769ba813da373 Mon Sep 17 00:00:00 2001 From: Lalit Shankar Chowdhury Date: Tue, 2 Dec 2025 03:14:04 +0530 Subject: [PATCH 021/107] fat: remove unused parameter Remove unused inode parameter from fat_cache_alloc(). 
Link: https://lkml.kernel.org/r/20251201214403.90604-2-lalitshankarch@gmail.com Signed-off-by: Lalit Shankar Chowdhury Acked-by: OGAWA Hirofumi Cc: Christian Brauner Signed-off-by: Andrew Morton --- fs/fat/cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 630f3056658e..1b87354e24ba 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -54,7 +54,7 @@ void fat_cache_destroy(void) kmem_cache_destroy(fat_cache_cachep); } -static inline struct fat_cache *fat_cache_alloc(struct inode *inode) +static inline struct fat_cache *fat_cache_alloc(void) { return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS); } @@ -144,7 +144,7 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) MSDOS_I(inode)->nr_caches++; spin_unlock(&MSDOS_I(inode)->cache_lru_lock); - tmp = fat_cache_alloc(inode); + tmp = fat_cache_alloc(); if (!tmp) { spin_lock(&MSDOS_I(inode)->cache_lru_lock); MSDOS_I(inode)->nr_caches--; From 7c5b0f6a9ff5041ea6f4213c9827170c60a376f0 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Wed, 17 Dec 2025 12:23:27 +0800 Subject: [PATCH 022/107] .editorconfig: respect .editorconfig settings from parent directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Setting 'root' to 'true' prevents the editor from searching for other .editorconfig files in parent directories. However, a common workflow involves generating a patch with 'git format-patch' and opening it in an editor within the kernel source directory. In such cases, we want any specific settings for patch files defined in an .editorconfig located above the kernel source directory to remain effective. Therefore, remove the 'root' setting from the kernel .editorconfig. 
Link: https://lkml.kernel.org/r/20251217-editconfig-v1-1-883e6dd6dbfa@gmail.com Signed-off-by: Kevin Hao Cc: Íñigo Huguet Cc: Danny Lin Cc: Mickaël Salaün Cc: Masahiro Yamada Signed-off-by: Andrew Morton --- .editorconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/.editorconfig b/.editorconfig index 29a30ccfc07b..b5ea32b6954b 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -root = true - [{*.{awk,c,dts,dtsi,dtso,h,mk,s,S},Kconfig,Makefile,Makefile.*}] charset = utf-8 end_of_line = lf From 426295ef18c5d5f0b7f75ac89d09022fcfafd25c Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:14 +0100 Subject: [PATCH 023/107] kallsyms: clean up @namebuf initialization in kallsyms_lookup_buildid() Patch series "kallsyms: Prevent invalid access when showing module buildid", v3. We have seen nested crashes in __sprint_symbol(), see below. They seem to be caused by an invalid pointer to "buildid". This patchset cleans up kallsyms code related to module buildid and fixes this invalid access when printing backtraces. I made an audit of __sprint_symbol() and found several situations when the buildid might be wrong: + bpf_address_lookup() does not set @modbuildid + ftrace_mod_address_lookup() does not set @modbuildid + __sprint_symbol() does not take rcu_read_lock and the related struct module might get removed before mod->build_id is printed. This patchset solves these problems: + 1st, 2nd patches are preparatory + 3rd, 4th, 6th patches fix the above problems + 5th patch cleans up a suspicious initialization code. This is the backtrace, we have seen. But it is not really important. 
The problems fixed by the patchset are obvious: crash64> bt [62/2029] PID: 136151 TASK: ffff9f6c981d4000 CPU: 367 COMMAND: "btrfs" #0 [ffffbdb687635c28] machine_kexec at ffffffffb4c845b3 #1 [ffffbdb687635c80] __crash_kexec at ffffffffb4d86a6a #2 [ffffbdb687635d08] hex_string at ffffffffb51b3b61 #3 [ffffbdb687635d40] crash_kexec at ffffffffb4d87964 #4 [ffffbdb687635d50] oops_end at ffffffffb4c41fc8 #5 [ffffbdb687635d70] do_trap at ffffffffb4c3e49a #6 [ffffbdb687635db8] do_error_trap at ffffffffb4c3e6a4 #7 [ffffbdb687635df8] exc_stack_segment at ffffffffb5666b33 #8 [ffffbdb687635e20] asm_exc_stack_segment at ffffffffb5800cf9 ... This patch (of 7) The function kallsyms_lookup_buildid() initializes the given @namebuf by clearing the first and the last byte. It is not clear why. The 1st byte makes sense because some callers ignore the return code and expect that the buffer contains a valid string, for example: - function_stat_show() - kallsyms_lookup() - kallsyms_lookup_buildid() The initialization of the last byte does not make much sense because it can later be overwritten. Fortunately, it seems that all called functions behave correctly: - kallsyms_expand_symbol() explicitly adds the trailing '\0' at the end of the function. - All *__address_lookup() functions either use the safe strscpy() or they do not touch the buffer at all. Document the reason for clearing the first byte. And remove the useless initialization of the last byte. 
Link: https://lkml.kernel.org/r/20251128135920.217303-2-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Aaron Tomlin Cc: Alexei Starovoitov Cc: Daniel Borkman Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Steven Rostedt Cc: Daniel Gomez Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 049e296f586c..9559bf947c6b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -355,7 +355,12 @@ static int kallsyms_lookup_buildid(unsigned long addr, { int ret; - namebuf[KSYM_NAME_LEN - 1] = 0; + /* + * kallsyms_lookup() returns pointer to namebuf on success and + * NULL on error. But some callers ignore the return value. + * Instead they expect @namebuf filled either with valid + * or empty string. + */ namebuf[0] = 0; if (is_ksym_addr(addr)) { From fda024fb64769e9d6b3916d013c78d6b189129f8 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:15 +0100 Subject: [PATCH 024/107] kallsyms: clean up modname and modbuildid initialization in kallsyms_lookup_buildid() The @modname and @modbuildid optional return parameters are set only when the symbol is in a module. Always initialize them so that they do not need to be cleared when the symbol is not in a module. It simplifies the logic and makes the code even slightly more safe. Note that bpf_address_lookup() function will get updated in a separate patch. 
Link: https://lkml.kernel.org/r/20251128135920.217303-3-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Aaron Tomlin Cc: Alexei Starovoitov Cc: Daniel Borkman Cc: Daniel Gomez Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 9559bf947c6b..66ad899124c5 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -362,6 +362,14 @@ static int kallsyms_lookup_buildid(unsigned long addr, * or empty string. */ namebuf[0] = 0; + /* + * Initialize the module-related return values. They are not set + * when the symbol is in vmlinux or it is a bpf address. + */ + if (modname) + *modname = NULL; + if (modbuildid) + *modbuildid = NULL; if (is_ksym_addr(addr)) { unsigned long pos; @@ -370,10 +378,6 @@ static int kallsyms_lookup_buildid(unsigned long addr, /* Grab name */ kallsyms_expand_symbol(get_symbol_offset(pos), namebuf, KSYM_NAME_LEN); - if (modname) - *modname = NULL; - if (modbuildid) - *modbuildid = NULL; return strlen(namebuf); } From acfdbb4ab2910ff6f03becb569c23ac7b2223913 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:16 +0100 Subject: [PATCH 025/107] module: add helper function for reading module_buildid() Add a helper function for reading the optional "build_id" member of struct module. It is going to be used also in ftrace_mod_address_lookup(). Use "#ifdef" instead of "#if IS_ENABLED()" to match the declaration of the optional field in struct module. 
Link: https://lkml.kernel.org/r/20251128135920.217303-4-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Cc: Aaron Tomlin Cc: Alexei Starovoitov Cc: Daniel Borkman Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Sami Tolvanen Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/module.h | 9 +++++++++ kernel/module/kallsyms.c | 9 ++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/module.h b/include/linux/module.h index d80c3ea57472..ac254525014c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -748,6 +748,15 @@ static inline void __module_get(struct module *module) __mod ? __mod->name : "kernel"; \ }) +static inline const unsigned char *module_buildid(struct module *mod) +{ +#ifdef CONFIG_STACKTRACE_BUILD_ID + return mod->build_id; +#else + return NULL; +#endif +} + /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 00a60796327c..0fc11e45df9b 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -334,13 +334,8 @@ int module_address_lookup(unsigned long addr, if (mod) { if (modname) *modname = mod->name; - if (modbuildid) { -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - *modbuildid = mod->build_id; -#else - *modbuildid = NULL; -#endif - } + if (modbuildid) + *modbuildid = module_buildid(mod); sym = find_kallsyms_symbol(mod, addr, size, offset); From 8e81dac4cd5477731169b92cff7c24f8f6635950 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:17 +0100 Subject: [PATCH 026/107] kallsyms: cleanup code for appending the module buildid Put the code for appending the optional "buildid" into a helper function. It makes __sprint_symbol() more readable. 
Also print a warning when the "modname" is set and the "buildid" isn't. It might catch a situation when some lookup function in kallsyms_lookup_buildid() does not handle the "buildid". Use pr_*_once() to avoid an infinite recursion when the function is called from printk(). The recursion is rather theoretical but better be on the safe side. Link: https://lkml.kernel.org/r/20251128135920.217303-5-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Aaron Tomlin Cc: Alexei Starovoitov Cc: Daniel Borkman Cc: Daniel Gomez Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 66ad899124c5..c0898327836c 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -435,6 +435,37 @@ int lookup_symbol_name(unsigned long addr, char *symname) return lookup_module_symbol_name(addr, symname); } +#ifdef CONFIG_STACKTRACE_BUILD_ID + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + if (!modname) + return 0; + + if (!buildid) { + pr_warn_once("Undefined buildid for the module %s\n", modname); + return 0; + } + + /* build ID should match length of sprintf */ +#ifdef CONFIG_MODULES + static_assert(sizeof(typeof_member(struct module, build_id)) == 20); +#endif + + return sprintf(buffer, " %20phN", buildid); +} + +#else /* CONFIG_STACKTRACE_BUILD_ID */ + +static int append_buildid(char *buffer, const char *modname, + const unsigned char *buildid) +{ + return 0; +} + +#endif /* CONFIG_STACKTRACE_BUILD_ID */ + /* Look up a kernel symbol and return it in a text buffer. 
*/ static int __sprint_symbol(char *buffer, unsigned long address, int symbol_offset, int add_offset, int add_buildid) @@ -457,15 +488,8 @@ static int __sprint_symbol(char *buffer, unsigned long address, if (modname) { len += sprintf(buffer + len, " [%s", modname); -#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) - if (add_buildid && buildid) { - /* build ID should match length of sprintf */ -#if IS_ENABLED(CONFIG_MODULES) - static_assert(sizeof(typeof_member(struct module, build_id)) == 20); -#endif - len += sprintf(buffer + len, " %20phN", buildid); - } -#endif + if (add_buildid) + len += append_buildid(buffer + len, modname, buildid); len += sprintf(buffer + len, "]"); } From cd6735896d0343942cf3dafb48ce32eb79341990 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:18 +0100 Subject: [PATCH 027/107] kallsyms/bpf: rename __bpf_address_lookup() to bpf_address_lookup() bpf_address_lookup() has been used only in kallsyms_lookup_buildid(). It was supposed to set @modname and @modbuildid when the symbol was in a module. But it always just cleared @modname because BPF symbols were never in a module. And it did not clear @modbuildid because the pointer was not passed. The wrapper is no longer needed. Both @modname and @modbuildid are now always initialized to NULL in kallsyms_lookup_buildid(). Remove the wrapper and rename __bpf_address_lookup() to bpf_address_lookup() because this variant is used everywhere. 
[akpm@linux-foundation.org: fix loongarch] Link: https://lkml.kernel.org/r/20251128135920.217303-6-pmladek@suse.com Fixes: 9294523e3768 ("module: add printk formats to add module build ID to stacktraces") Signed-off-by: Petr Mladek Acked-by: Alexei Starovoitov Cc: Aaron Tomlin Cc: Daniel Borkman Cc: Daniel Gomez Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/loongarch/net/bpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 26 ++++---------------------- kernel/bpf/core.c | 4 ++-- kernel/kallsyms.c | 5 ++--- 6 files changed, 11 insertions(+), 30 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index b6eb7a465ad2..1d657bd3ce65 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2951,7 +2951,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, u64 plt_target = 0ULL; bool poking_bpf_entry; - if (!__bpf_address_lookup((unsigned long)ip, &size, &offset, namebuf)) + if (!bpf_address_lookup((unsigned long)ip, &size, &offset, namebuf)) /* Only poking bpf text is supported. Since kernel function * entry is set up by ftrace, we reply on ftrace to poke kernel * functions. diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c index d1d5a65308b9..3b63bc5b99d9 100644 --- a/arch/loongarch/net/bpf_jit.c +++ b/arch/loongarch/net/bpf_jit.c @@ -1319,7 +1319,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, /* Only poking bpf text is supported. Since kernel function entry * is set up by ftrace, we rely on ftrace to poke kernel functions. 
*/ - if (!__bpf_address_lookup((unsigned long)ip, &size, &offset, namebuf)) + if (!bpf_address_lookup((unsigned long)ip, &size, &offset, namebuf)) return -ENOTSUPP; image = ip - offset; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 5e976730b2f5..e199976e410a 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -1122,7 +1122,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, bpf_func = (unsigned long)ip; /* We currently only support poking bpf programs */ - if (!__bpf_address_lookup(bpf_func, &size, &offset, name)) { + if (!bpf_address_lookup(bpf_func, &size, &offset, name)) { pr_err("%s (0x%lx): kernel/modules are not supported\n", __func__, bpf_func); return -EOPNOTSUPP; } diff --git a/include/linux/filter.h b/include/linux/filter.h index fd54fed8f95f..7452817d707d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1375,24 +1375,13 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -int __bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym); +int bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); -static inline int -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - int ret = __bpf_address_lookup(addr, size, off, sym); - - if (ret && modname) - *modname = NULL; - return ret; -} - void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); @@ -1431,8 +1420,8 @@ static inline bool bpf_jit_kallsyms_enabled(void) } static inline int -__bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) +bpf_address_lookup(unsigned long addr, unsigned 
long *size, + unsigned long *off, char *sym) { return 0; } @@ -1453,13 +1442,6 @@ static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) return NULL; } -static inline int -bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) -{ - return 0; -} - static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f1c5fc66ef01..8f6d8f1c4946 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -714,8 +714,8 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -int __bpf_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char *sym) +int bpf_address_lookup(unsigned long addr, unsigned long *size, + unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index c0898327836c..a37cafdf52ca 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -345,7 +345,7 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, return 1; } return !!module_address_lookup(addr, symbolsize, offset, NULL, NULL, namebuf) || - !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); + !!bpf_address_lookup(addr, symbolsize, offset, namebuf); } static int kallsyms_lookup_buildid(unsigned long addr, @@ -386,8 +386,7 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = module_address_lookup(addr, symbolsize, offset, modname, modbuildid, namebuf); if (!ret) - ret = bpf_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = bpf_address_lookup(addr, symbolsize, offset, namebuf); if (!ret) ret = ftrace_mod_address_lookup(addr, symbolsize, From e8a1e7eaa19d0b757b06a2f913e3eeb4b1c002c6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:19 +0100 Subject: [PATCH 028/107] kallsyms/ftrace: set module buildid in ftrace_mod_address_lookup() 
__sprint_symbol() might access an invalid pointer when kallsyms_lookup_buildid() returns a symbol found by ftrace_mod_address_lookup(). The ftrace lookup function must set both @modname and @modbuildid the same way as module_address_lookup(). Link: https://lkml.kernel.org/r/20251128135920.217303-7-pmladek@suse.com Fixes: 9294523e3768 ("module: add printk formats to add module build ID to stacktraces") Signed-off-by: Petr Mladek Reviewed-by: Aaron Tomlin Acked-by: Steven Rostedt (Google) Cc: Alexei Starovoitov Cc: Daniel Borkman Cc: Daniel Gomez Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberalin Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/ftrace.h | 6 ++++-- kernel/kallsyms.c | 4 ++-- kernel/trace/ftrace.c | 5 ++++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3a8989e3268..dc844d7e693d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -87,11 +87,13 @@ struct ftrace_hash; defined(CONFIG_DYNAMIC_FTRACE) int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym); + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym); #else static inline int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { return 0; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index a37cafdf52ca..0f639c907336 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -389,8 +389,8 @@ static int kallsyms_lookup_buildid(unsigned long addr, ret = bpf_address_lookup(addr, symbolsize, offset, namebuf); if (!ret) - ret = ftrace_mod_address_lookup(addr, symbolsize, - offset, modname, namebuf); + ret = ftrace_mod_address_lookup(addr, symbolsize, offset, + modname, modbuildid, namebuf); 
return ret; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aa758efc3731..304505c11686 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7753,7 +7753,8 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, - unsigned long *off, char **modname, char *sym) + unsigned long *off, char **modname, + const unsigned char **modbuildid, char *sym) { struct ftrace_mod_map *mod_map; int ret = 0; @@ -7765,6 +7766,8 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, if (ret) { if (modname) *modname = mod_map->mod->name; + if (modbuildid) + *modbuildid = module_buildid(mod_map->mod); break; } } From 3b07086444f80c844351255fd94c2cb0a7224df2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 28 Nov 2025 14:59:20 +0100 Subject: [PATCH 029/107] kallsyms: prevent module removal when printing module name and buildid kallsyms_lookup_buildid() copies the symbol name into the given buffer so that it can be safely read anytime later. But it just copies pointers to mod->name and mod->build_id which might get reused after the related struct module gets removed. The lifetime of struct module is synchronized using RCU. Take the rcu read lock for the entire __sprint_symbol(). 
Link: https://lkml.kernel.org/r/20251128135920.217303-8-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Aaron Tomlin Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Daniel Gomez Cc: John Fastabend Cc: Kees Cook Cc: Luis Chamberlain Cc: Mark Rutland Cc: "Masami Hiramatsu (Google)" Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- kernel/kallsyms.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 0f639c907336..e0813ca9469a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -474,6 +474,9 @@ static int __sprint_symbol(char *buffer, unsigned long address, unsigned long offset, size; int len; + /* Prevent module removal until modname and modbuildid are printed */ + guard(rcu)(); + address += symbol_offset; len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); From f34e19c34e4e92338d2ceaab2b95dd7790d262de Mon Sep 17 00:00:00 2001 From: Minu Jin Date: Tue, 25 Nov 2025 09:04:07 +0900 Subject: [PATCH 030/107] fork-comment-fix: remove ambiguous question mark in CLONE_CHILD_CLEARTID comment The current comment "Clear TID on mm_release()?" ends with a question mark, implying uncertainty about whether the TID is actually cleared in mm_release(). However, the code flow is deterministic. When a task exits, mm_release() explicitly checks 'tsk->clear_child_tid' and clears it. Since this behavior is unambiguous, remove the confusing question mark and rephrase the comment to clearly state that TID is cleared in mm_release(). 
Link: https://lkml.kernel.org/r/20251125000407.24470-1-s9430939@naver.com Signed-off-by: Minu Jin Cc: Ben Segall Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Cc: Oleg Nesterov Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index b1f3915d5f8e..b21eccc9e11c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2071,7 +2071,7 @@ __latent_entropy struct task_struct *copy_process( p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* - * Clear TID on mm_release()? + * TID is cleared in mm_release() when the task exits */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; From c243413740b39b0cf0a88732de5efc2b45716d81 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 19 Dec 2025 21:45:41 -0800 Subject: [PATCH 031/107] kfifo: fix kmalloc_array_node() argument order To be consistent, pass the kmalloc_array_node() parameters in the order (number_of_elements, element_size). Since only the product of the two values is used, this is not a bug fix. 
Link: https://lkml.kernel.org/r/20251220054541.2295599-1-rdunlap@infradead.org Closes: https://bugzilla.kernel.org/show_bug.cgi?id=216015 Signed-off-by: Randy Dunlap Cc: Stefani Seibold Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/kfifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/kfifo.c b/lib/kfifo.c index 525e66f8294c..2633f9cc336c 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -41,7 +41,7 @@ int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, return -EINVAL; } - fifo->data = kmalloc_array_node(esize, size, gfp_mask, node); + fifo->data = kmalloc_array_node(size, esize, gfp_mask, node); if (!fifo->data) { fifo->mask = 0; From 0e7fd23f9293cee3c7f341498a0011d09c491510 Mon Sep 17 00:00:00 2001 From: Kari Argillander Date: Fri, 19 Dec 2025 18:25:11 +0200 Subject: [PATCH 032/107] editorconfig: add rst extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have a lot of .rst documentation; use editorconfig rules for those. This sets the default tab width to 8, which makes indentation consistent and avoids requiring developers to adjust editor settings manually. 
Link: https://lkml.kernel.org/r/20251219-editorconfig-rst-v1-1-58d4fa397664@gmail.com Signed-off-by: Kari Argillander Cc: Danny Lin Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Mickael Salaun Cc: Íñigo Huguet Signed-off-by: Andrew Morton --- .editorconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.editorconfig b/.editorconfig index b5ea32b6954b..69718ac91747 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -[{*.{awk,c,dts,dtsi,dtso,h,mk,s,S},Kconfig,Makefile,Makefile.*}] +[{*.{awk,c,dts,dtsi,dtso,h,mk,rst,s,S},Kconfig,Makefile,Makefile.*}] charset = utf-8 end_of_line = lf insert_final_newline = true From d30aca3eeffc18452e5cc5c4e59f1a4da2bd2f12 Mon Sep 17 00:00:00 2001 From: Ryota Sakamoto Date: Sun, 21 Dec 2025 13:35:16 +0000 Subject: [PATCH 033/107] lib/tests: convert test_min_heap module to KUnit Move lib/test_min_heap.c to lib/tests/min_heap_kunit.c and convert it to use KUnit. This change switches the ad-hoc test code to standard KUnit test cases. The test data remains the same, but the verification logic is updated to use KUNIT_EXPECT_* macros. Also remove CONFIG_TEST_MIN_HEAP from arch/*/configs/* because it is no longer used. The new CONFIG_MIN_HEAP_KUNIT_TEST will be automatically enabled by CONFIG_KUNIT_ALL_TESTS. The reasons for converting to KUnit are: 1. Standardization: Switching from ad-hoc printk-based reporting to the standard KTAP format makes it easier for CI systems to parse and report test results 2. Better Diagnostics: Using KUNIT_EXPECT_* macros automatically provides detailed diagnostics on failure. 3. Tooling Integration: It allows the test to be managed and executed using standard KUnit tools. 
Link: https://lkml.kernel.org/r/20251221133516.321846-1-sakamo.ryota@gmail.com Signed-off-by: Ryota Sakamoto Acked-by: Kuan-Wei Chiu Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: David Gow Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/powerpc/configs/ppc64_defconfig | 1 - arch/s390/configs/debug_defconfig | 2 +- lib/Kconfig.debug | 21 +-- lib/Makefile | 1 - lib/tests/Makefile | 1 + .../min_heap_kunit.c} | 145 ++++++++---------- 19 files changed, 80 insertions(+), 105 deletions(-) rename lib/{test_min_heap.c => tests/min_heap_kunit.c} (58%) diff --git a/MAINTAINERS b/MAINTAINERS index 99407c4c0095..4dcbcb5c14f0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17456,7 +17456,7 @@ S: Maintained F: Documentation/core-api/min_heap.rst F: include/linux/min_heap.h F: lib/min_heap.c -F: lib/test_min_heap.c +F: lib/tests/min_heap_kunit.c MIPI CCS, SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 1439abb69f73..46598efbea54 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -609,7 +609,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/apollo_defconfig 
b/arch/m68k/configs/apollo_defconfig index 6a4e71866f60..63bef7a6d858 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -566,7 +566,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 46ad7d57b4fc..1342adfbd855 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -586,7 +586,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 867bfa13a44c..484f21a2da37 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -558,7 +558,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 5dfe602cafd4..ce97c816aa21 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -568,7 +568,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index f5d30310a349..f5b57ea2d681 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -585,7 +585,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/multi_defconfig 
b/arch/m68k/configs/multi_defconfig index fe54e9222cc0..85efdb31c898 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -672,7 +672,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 4ff2ff0993ad..7102579b83d3 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -558,7 +558,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 6bb4738a65aa..18c0493ed0ff 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -559,7 +559,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 14166c8fe234..1b3a34ab1c74 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -575,7 +575,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 5db924e3caf7..1a41a1c6bde1 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -555,7 +555,6 @@ CONFIG_WW_MUTEX_SELFTEST=m CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/m68k/configs/sun3x_defconfig 
b/arch/m68k/configs/sun3x_defconfig index 318c9fe42f46..8f182684e54b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -556,7 +556,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_TEST_DHRY=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_TEST_MULDIV64=m CONFIG_REED_SOLOMON_TEST=m diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 684b3ea80f39..f1e937222a83 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -426,7 +426,6 @@ CONFIG_BOOTX_TEXT=y CONFIG_KUNIT=m CONFIG_KUNIT_ALL_TESTS=m CONFIG_LKDTM=m -CONFIG_TEST_MIN_HEAP=m CONFIG_TEST_DIV64=m CONFIG_BACKTRACE_SELF_TEST=m CONFIG_TEST_REF_TRACKER=m diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 0713914b25b4..4be3a7540909 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -921,7 +921,7 @@ CONFIG_FAULT_INJECTION_DEBUG_FS=y CONFIG_FAULT_INJECTION_CONFIGFS=y CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y CONFIG_LKDTM=m -CONFIG_TEST_MIN_HEAP=y +CONFIG_MIN_HEAP_KUNIT_TEST=m CONFIG_KPROBES_SANITY_TEST=m CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 947e62e92da8..3a31bbf53425 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2278,16 +2278,6 @@ config TEST_LIST_SORT If unsure, say N. -config TEST_MIN_HEAP - tristate "Min heap test" - depends on DEBUG_KERNEL || m - help - Enable this to turn on min heap function tests. This test is - executed only once during system boot (so affects only boot time), - or at module load time. - - If unsure, say N. - config TEST_SORT tristate "Array-based sort test" if !KUNIT_ALL_TESTS depends on KUNIT @@ -2878,6 +2868,17 @@ config MEMCPY_KUNIT_TEST If unsure, say N. 
+config MIN_HEAP_KUNIT_TEST + tristate "Min heap test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the min heap library + which provides functions for creating and managing min heaps. + The test suite checks the functionality of the min heap library. + + If unsure, say N + config IS_SIGNED_TYPE_KUNIT_TEST tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index 586a9f9b27a9..1f87a174a317 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -75,7 +75,6 @@ obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, unused-but-set-variable) UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o -obj-$(CONFIG_TEST_MIN_HEAP) += test_min_heap.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 9a20608f65f5..088b80d16383 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -33,6 +33,7 @@ CFLAGS_longest_symbol_kunit.o += $(call cc-disable-warning, missing-prototypes) obj-$(CONFIG_LONGEST_SYM_KUNIT_TEST) += longest_symbol_kunit.o obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o +obj-$(CONFIG_MIN_HEAP_KUNIT_TEST) += min_heap_kunit.o CFLAGS_overflow_kunit.o = $(call cc-disable-warning, tautological-constant-out-of-range-compare) obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o obj-$(CONFIG_PRINTF_KUNIT_TEST) += printf_kunit.o diff --git a/lib/test_min_heap.c b/lib/tests/min_heap_kunit.c similarity index 58% rename from lib/test_min_heap.c rename to lib/tests/min_heap_kunit.c index a9c4a74d3898..9c1122661698 100644 --- a/lib/test_min_heap.c +++ b/lib/tests/min_heap_kunit.c @@ -1,60 +1,66 @@ // SPDX-License-Identifier: GPL-2.0-only -#define pr_fmt(fmt) "min_heap_test: " fmt - /* * Test cases for the min max heap. 
*/ -#include +#include #include #include -#include #include +struct min_heap_test_case { + const char *str; + bool min_heap; +}; + +static struct min_heap_test_case min_heap_cases[] = { + { + .str = "min", + .min_heap = true, + }, + { + .str = "max", + .min_heap = false, + }, +}; + +KUNIT_ARRAY_PARAM_DESC(min_heap, min_heap_cases, str); + DEFINE_MIN_HEAP(int, min_heap_test); -static __init bool less_than(const void *lhs, const void *rhs, void __always_unused *args) +static bool less_than(const void *lhs, const void *rhs, void __always_unused *args) { return *(int *)lhs < *(int *)rhs; } -static __init bool greater_than(const void *lhs, const void *rhs, void __always_unused *args) +static bool greater_than(const void *lhs, const void *rhs, void __always_unused *args) { return *(int *)lhs > *(int *)rhs; } -static __init int pop_verify_heap(bool min_heap, - struct min_heap_test *heap, - const struct min_heap_callbacks *funcs) +static void pop_verify_heap(struct kunit *test, + bool min_heap, + struct min_heap_test *heap, + const struct min_heap_callbacks *funcs) { int *values = heap->data; - int err = 0; int last; last = values[0]; min_heap_pop_inline(heap, funcs, NULL); while (heap->nr > 0) { - if (min_heap) { - if (last > values[0]) { - pr_err("error: expected %d <= %d\n", last, - values[0]); - err++; - } - } else { - if (last < values[0]) { - pr_err("error: expected %d >= %d\n", last, - values[0]); - err++; - } - } + if (min_heap) + KUNIT_EXPECT_LE(test, last, values[0]); + else + KUNIT_EXPECT_GE(test, last, values[0]); last = values[0]; min_heap_pop_inline(heap, funcs, NULL); } - return err; } -static __init int test_heapify_all(bool min_heap) +static void test_heapify_all(struct kunit *test) { + const struct min_heap_test_case *params = test->param_value; int values[] = { 3, 1, 2, 4, 0x8000000, 0x7FFFFFF, 0, -3, -1, -2, -4, 0x8000000, 0x7FFFFFF }; struct min_heap_test heap = { @@ -63,15 +69,14 @@ static __init int test_heapify_all(bool min_heap) .size = 
ARRAY_SIZE(values), }; struct min_heap_callbacks funcs = { - .less = min_heap ? less_than : greater_than, + .less = params->min_heap ? less_than : greater_than, .swp = NULL, }; - int i, err; + int i; /* Test with known set of values. */ min_heapify_all_inline(&heap, &funcs, NULL); - err = pop_verify_heap(min_heap, &heap, &funcs); - + pop_verify_heap(test, params->min_heap, &heap, &funcs); /* Test with randomly generated values. */ heap.nr = ARRAY_SIZE(values); @@ -79,13 +84,12 @@ static __init int test_heapify_all(bool min_heap) values[i] = get_random_u32(); min_heapify_all_inline(&heap, &funcs, NULL); - err += pop_verify_heap(min_heap, &heap, &funcs); - - return err; + pop_verify_heap(test, params->min_heap, &heap, &funcs); } -static __init int test_heap_push(bool min_heap) +static void test_heap_push(struct kunit *test) { + const struct min_heap_test_case *params = test->param_value; const int data[] = { 3, 1, 2, 4, 0x80000000, 0x7FFFFFFF, 0, -3, -1, -2, -4, 0x80000000, 0x7FFFFFFF }; int values[ARRAY_SIZE(data)]; @@ -95,29 +99,28 @@ static __init int test_heap_push(bool min_heap) .size = ARRAY_SIZE(values), }; struct min_heap_callbacks funcs = { - .less = min_heap ? less_than : greater_than, + .less = params->min_heap ? less_than : greater_than, .swp = NULL, }; - int i, temp, err; + int i, temp; /* Test with known set of values copied from data. */ for (i = 0; i < ARRAY_SIZE(data); i++) min_heap_push_inline(&heap, &data[i], &funcs, NULL); - err = pop_verify_heap(min_heap, &heap, &funcs); + pop_verify_heap(test, params->min_heap, &heap, &funcs); /* Test with randomly generated values. 
*/ while (heap.nr < heap.size) { temp = get_random_u32(); min_heap_push_inline(&heap, &temp, &funcs, NULL); } - err += pop_verify_heap(min_heap, &heap, &funcs); - - return err; + pop_verify_heap(test, params->min_heap, &heap, &funcs); } -static __init int test_heap_pop_push(bool min_heap) +static void test_heap_pop_push(struct kunit *test) { + const struct min_heap_test_case *params = test->param_value; const int data[] = { 3, 1, 2, 4, 0x80000000, 0x7FFFFFFF, 0, -3, -1, -2, -4, 0x80000000, 0x7FFFFFFF }; int values[ARRAY_SIZE(data)]; @@ -127,13 +130,13 @@ static __init int test_heap_pop_push(bool min_heap) .size = ARRAY_SIZE(values), }; struct min_heap_callbacks funcs = { - .less = min_heap ? less_than : greater_than, + .less = params->min_heap ? less_than : greater_than, .swp = NULL, }; - int i, temp, err; + int i, temp; /* Fill values with data to pop and replace. */ - temp = min_heap ? 0x80000000 : 0x7FFFFFFF; + temp = params->min_heap ? 0x80000000 : 0x7FFFFFFF; for (i = 0; i < ARRAY_SIZE(data); i++) min_heap_push_inline(&heap, &temp, &funcs, NULL); @@ -141,7 +144,7 @@ static __init int test_heap_pop_push(bool min_heap) for (i = 0; i < ARRAY_SIZE(data); i++) min_heap_pop_push_inline(&heap, &data[i], &funcs, NULL); - err = pop_verify_heap(min_heap, &heap, &funcs); + pop_verify_heap(test, params->min_heap, &heap, &funcs); heap.nr = 0; for (i = 0; i < ARRAY_SIZE(data); i++) @@ -152,13 +155,12 @@ static __init int test_heap_pop_push(bool min_heap) temp = get_random_u32(); min_heap_pop_push_inline(&heap, &temp, &funcs, NULL); } - err += pop_verify_heap(min_heap, &heap, &funcs); - - return err; + pop_verify_heap(test, params->min_heap, &heap, &funcs); } -static __init int test_heap_del(bool min_heap) +static void test_heap_del(struct kunit *test) { + const struct min_heap_test_case *params = test->param_value; int values[] = { 3, 1, 2, 4, 0x8000000, 0x7FFFFFF, 0, -3, -1, -2, -4, 0x8000000, 0x7FFFFFF }; struct min_heap_test heap; @@ -166,17 +168,16 @@ static __init int 
test_heap_del(bool min_heap) min_heap_init_inline(&heap, values, ARRAY_SIZE(values)); heap.nr = ARRAY_SIZE(values); struct min_heap_callbacks funcs = { - .less = min_heap ? less_than : greater_than, + .less = params->min_heap ? less_than : greater_than, .swp = NULL, }; - int i, err; + int i; /* Test with known set of values. */ min_heapify_all_inline(&heap, &funcs, NULL); for (i = 0; i < ARRAY_SIZE(values) / 2; i++) min_heap_del_inline(&heap, get_random_u32() % heap.nr, &funcs, NULL); - err = pop_verify_heap(min_heap, &heap, &funcs); - + pop_verify_heap(test, params->min_heap, &heap, &funcs); /* Test with randomly generated values. */ heap.nr = ARRAY_SIZE(values); @@ -186,37 +187,23 @@ static __init int test_heap_del(bool min_heap) for (i = 0; i < ARRAY_SIZE(values) / 2; i++) min_heap_del_inline(&heap, get_random_u32() % heap.nr, &funcs, NULL); - err += pop_verify_heap(min_heap, &heap, &funcs); - - return err; + pop_verify_heap(test, params->min_heap, &heap, &funcs); } -static int __init test_min_heap_init(void) -{ - int err = 0; +static struct kunit_case min_heap_test_cases[] = { + KUNIT_CASE_PARAM(test_heapify_all, min_heap_gen_params), + KUNIT_CASE_PARAM(test_heap_push, min_heap_gen_params), + KUNIT_CASE_PARAM(test_heap_pop_push, min_heap_gen_params), + KUNIT_CASE_PARAM(test_heap_del, min_heap_gen_params), + {}, +}; - err += test_heapify_all(true); - err += test_heapify_all(false); - err += test_heap_push(true); - err += test_heap_push(false); - err += test_heap_pop_push(true); - err += test_heap_pop_push(false); - err += test_heap_del(true); - err += test_heap_del(false); - if (err) { - pr_err("test failed with %d errors\n", err); - return -EINVAL; - } - pr_info("test passed\n"); - return 0; -} -module_init(test_min_heap_init); +static struct kunit_suite min_heap_test_suite = { + .name = "min_heap", + .test_cases = min_heap_test_cases, +}; -static void __exit test_min_heap_exit(void) -{ - /* do nothing */ -} -module_exit(test_min_heap_exit); 
+kunit_test_suite(min_heap_test_suite); MODULE_DESCRIPTION("Test cases for the min max heap"); MODULE_LICENSE("GPL"); From 1965bbb8f3c72e5f1972b5eeb6f19a36664a676d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 22 Dec 2025 08:55:10 +0100 Subject: [PATCH 034/107] ipc/shm: uapi: remove dependency on libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. shm.h does not even use any symbols from the libc header as the usage of getpagesize() was removed a decade ago in commit 060028bac94b ("ipc/shm.c: increase the defaults for SHMALL, SHMMAX") Drop the unnecessary inclusion. Link: https://lkml.kernel.org/r/20251222-uapi-shm-v1-1-270bb7f75d97@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/uapi/linux/shm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/uapi/linux/shm.h b/include/uapi/linux/shm.h index 8d1f17a4e08e..7269f9f402e3 100644 --- a/include/uapi/linux/shm.h +++ b/include/uapi/linux/shm.h @@ -5,9 +5,6 @@ #include #include #include -#ifndef __KERNEL__ -#include -#endif /* * SHMMNI, SHMMAX and SHMALL are default upper limits which can be From ad533a740c7ccb801619ed962807605254fe7545 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Sat, 13 Dec 2025 12:53:09 +0100 Subject: [PATCH 035/107] resource: provide 0args DEFINE_RES variant for unset resource desc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide a variant of DEFINE_RES that takes 0 arguments to initialize an "unset" resource descriptor. This should be used for the improper case of struct resource res = {}; where DEFINE_RES() should be used. 
With this new helper variant, it would result in: struct resource res = DEFINE_RES(); instead of having to define the full 3 arguments: struct resource res = DEFINE_RES(0, 0, IORESOURCE_UNSET); DEFINE_RES() with no args, will set the flags to IORESOURCE_UNSET signaling the resource descriptor is UNSET and doesn't reflect an actual resource currently. Link: https://lkml.kernel.org/r/20251213115314.16700-1-ansuelsmth@gmail.com Signed-off-by: Christian Marangi Suggested-by: Ilpo Järvinen Reviewed-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Signed-off-by: Andrew Morton --- include/linux/ioport.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 9afa30f9346f..e974fc087059 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -10,6 +10,7 @@ #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ +#include #include #include #include @@ -165,8 +166,12 @@ enum { #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ DEFINE_RES_NAMED_DESC(_start, _size, _name, _flags, IORES_DESC_NONE) -#define DEFINE_RES(_start, _size, _flags) \ +#define __DEFINE_RES0() \ + DEFINE_RES_NAMED(0, 0, NULL, IORESOURCE_UNSET) +#define __DEFINE_RES3(_start, _size, _flags) \ DEFINE_RES_NAMED(_start, _size, NULL, _flags) +#define DEFINE_RES(...) \ + CONCATENATE(__DEFINE_RES, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) From 1921044eebf1d6861a6de1a76e3f63729a45e712 Mon Sep 17 00:00:00 2001 From: Chaitanya Mishra Date: Sat, 27 Dec 2025 14:52:29 +0530 Subject: [PATCH 036/107] lib/kstrtox: fix kstrtobool() docstring to mention enabled/disabled Commit ae5b3500856f ("kstrtox: add support for enabled and disabled in kstrtobool()") added support for 'e'/'E' (enabled) and 'd'/'D' (disabled) inputs, but did not update the docstring accordingly. 
Update the docstring to include 'Ee' (for true) and 'Dd' (for false) in the list of accepted first characters. Link: https://lkml.kernel.org/r/20251227092229.57330-1-chaitanyamishra.ai@gmail.com Fixes: ae5b3500856f ("kstrtox: add support for enabled and disabled in kstrtobool()") Signed-off-by: Chaitanya Mishra Cc: Mario Limonciello Signed-off-by: Andrew Morton --- lib/kstrtox.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kstrtox.c b/lib/kstrtox.c index bdde40cd69d7..97be2a39f537 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -340,8 +340,8 @@ EXPORT_SYMBOL(kstrtos8); * @s: input string * @res: result * - * This routine returns 0 iff the first character is one of 'YyTt1NnFf0', or - * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value + * This routine returns 0 iff the first character is one of 'EeYyTt1DdNnFf0', + * or [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. */ noinline From 998be0a4dbcaa796a05c7b52327f3a09c29d3662 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 30 Dec 2025 11:14:02 -0500 Subject: [PATCH 037/107] liveupdate: separate memfd support into LIVEUPDATE_MEMFD Decouple memfd preservation support from the core Live Update Orchestrator configuration. Previously, enabling CONFIG_LIVEUPDATE forced a dependency on CONFIG_SHMEM and unconditionally compiled memfd_luo.o. However, Live Update may be used for purposes that do not require memfd-backed memory preservation. Introduce CONFIG_LIVEUPDATE_MEMFD to gate memfd_luo.o. This moves the SHMEM and MEMFD_CREATE dependencies to the specific feature that needs them, allowing the base LIVEUPDATE option to be selected independently of shared memory support. 
Link: https://lkml.kernel.org/r/20251230161402.1542099-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- kernel/liveupdate/Kconfig | 17 ++++++++++++++++- mm/Makefile | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/liveupdate/Kconfig b/kernel/liveupdate/Kconfig index d2aeaf13c3ac..1a8513f16ef7 100644 --- a/kernel/liveupdate/Kconfig +++ b/kernel/liveupdate/Kconfig @@ -54,7 +54,6 @@ config KEXEC_HANDOVER_ENABLE_DEFAULT config LIVEUPDATE bool "Live Update Orchestrator" depends on KEXEC_HANDOVER - depends on SHMEM help Enable the Live Update Orchestrator. Live Update is a mechanism, typically based on kexec, that allows the kernel to be updated @@ -73,4 +72,20 @@ config LIVEUPDATE If unsure, say N. +config LIVEUPDATE_MEMFD + bool "Live update support for memfd" + depends on LIVEUPDATE + depends on MEMFD_CREATE + depends on SHMEM + default LIVEUPDATE + help + Enable live update support for memfd regions. This allows preserving + memfd-backed memory across kernel live updates. + + This can be used to back VM memory with memfds, allowing the guest + memory to persist, or for other user workloads needing to preserve + pages. + + If unsure, say N. 
+ endmenu diff --git a/mm/Makefile b/mm/Makefile index 2d0570a16e5b..e38fcfbb805c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -100,7 +100,7 @@ obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o -obj-$(CONFIG_LIVEUPDATE) += memfd_luo.o +obj-$(CONFIG_LIVEUPDATE_MEMFD) += memfd_luo.o obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP From e896c44aecfb7b3470470b4e63495dfa2b359060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 30 Dec 2025 08:13:15 +0100 Subject: [PATCH 038/107] types: drop definition of __EXPORTED_HEADERS__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This definition disarms the warning in uapi/linux/types.h about including kernel headers from user space. However the warning is already disarmed due to the fact that kernel code is built with -D__KERNEL__. Drop the pointless definition. Link: https://lkml.kernel.org/r/20251230-exported-headers-types-h-v1-1-947fc606f3d8@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Arnd Bergmann Signed-off-by: Andrew Morton --- include/linux/types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/types.h b/include/linux/types.h index d4437e9c452c..0cbb684eec5c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -2,7 +2,6 @@ #ifndef _LINUX_TYPES_H #define _LINUX_TYPES_H -#define __EXPORTED_HEADERS__ #include #ifndef __ASSEMBLY__ From 10d1c75ed4382a8e79874379caa2ead8952734f9 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 30 Dec 2025 22:16:07 -0800 Subject: [PATCH 039/107] ima: verify the previous kernel's IMA buffer lies in addressable RAM Patch series "Address page fault in ima_restore_measurement_list()", v3. 
When the second-stage kernel is booted via kexec with a limiting command line such as "mem=" we observe a page fault. BUG: unable to handle page fault for address: ffff97793ff47000 RIP: ima_restore_measurement_list+0xdc/0x45a #PF: error_code(0x0000) not-present page This happens on x86_64 only, as this is already fixed in aarch64 in commit: cbf9c4b9617b ("of: check previous kernel's ima-kexec-buffer against memory bounds") This patch (of 3): When the second-stage kernel is booted with a limiting command line (e.g. "mem="), the IMA measurement buffer handed over from the previous kernel may fall outside the addressable RAM of the new kernel. Accessing such a buffer can fault during early restore. Introduce a small generic helper, ima_validate_range(), which verifies that a physical [start, end] range for the previous-kernel IMA buffer lies within addressable memory: - On x86, use pfn_range_is_mapped(). - On OF based architectures, use page_is_ram(). Link: https://lkml.kernel.org/r/20251231061609.907170-1-harshit.m.mogalapalli@oracle.com Link: https://lkml.kernel.org/r/20251231061609.907170-2-harshit.m.mogalapalli@oracle.com Signed-off-by: Harshit Mogalapalli Reviewed-by: Mimi Zohar Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Borislav Betkov Cc: guoweikang Cc: Henry Willard Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Bohac Cc: Joel Granados Cc: Jonathan McDowell Cc: Mike Rapoport Cc: Paul Webb Cc: Sohil Mehta Cc: Sourabh Jain Cc: Thomas Gleinxer Cc: Yifei Liu Cc: Baoquan He Cc: Signed-off-by: Andrew Morton --- include/linux/ima.h | 1 + security/integrity/ima/ima_kexec.c | 35 ++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/include/linux/ima.h b/include/linux/ima.h index 8e29cb4e6a01..abf8923f8fc5 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -69,6 +69,7 @@ static inline int ima_measure_critical_data(const char *event_label, #ifdef CONFIG_HAVE_IMA_KEXEC int __init ima_free_kexec_buffer(void); int __init ima_get_kexec_buffer(void **addr, size_t *size); +int ima_validate_range(phys_addr_t phys, size_t size); #endif #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c index 5beb69edd12f..36a34c54de58 100644 --- a/security/integrity/ima/ima_kexec.c +++ b/security/integrity/ima/ima_kexec.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include "ima.h" @@ -294,3 +296,36 @@ void __init ima_load_kexec_buffer(void) pr_debug("Error restoring the measurement list: %d\n", rc); } } + +/* + * ima_validate_range - verify a physical buffer lies in addressable RAM + * @phys: physical start address of the buffer from previous kernel + * @size: size of the buffer + * + * On success return 0. On failure returns -EINVAL so callers can skip + * restoring. 
+ */ +int ima_validate_range(phys_addr_t phys, size_t size) +{ + unsigned long start_pfn, end_pfn; + phys_addr_t end_phys; + + if (check_add_overflow(phys, (phys_addr_t)size - 1, &end_phys)) + return -EINVAL; + + start_pfn = PHYS_PFN(phys); + end_pfn = PHYS_PFN(end_phys); + +#ifdef CONFIG_X86 + if (!pfn_range_is_mapped(start_pfn, end_pfn)) +#else + if (!page_is_ram(start_pfn) || !page_is_ram(end_pfn)) +#endif + { + pr_warn("IMA: previous kernel measurement buffer %pa (size 0x%zx) lies outside available memory\n", + &phys, size); + return -EINVAL; + } + + return 0; +} From 4d02233235ed0450de9c10fcdcf3484e3c9401ce Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 30 Dec 2025 22:16:08 -0800 Subject: [PATCH 040/107] of/kexec: refactor ima_get_kexec_buffer() to use ima_validate_range() Refactor the OF/DT ima_get_kexec_buffer() to use a generic helper to validate the address range. No functional change intended. Link: https://lkml.kernel.org/r/20251231061609.907170-3-harshit.m.mogalapalli@oracle.com Signed-off-by: Harshit Mogalapalli Reviewed-by: Mimi Zohar Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Betkov Cc: guoweikang Cc: Henry Willard Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Bohac Cc: Joel Granados Cc: Jonathan McDowell Cc: Mike Rapoport Cc: Paul Webb Cc: Sohil Mehta Cc: Sourabh Jain Cc: Thomas Gleinxer Cc: Yifei Liu Cc: Signed-off-by: Andrew Morton --- drivers/of/kexec.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c index 1ee2d31816ae..c4cf3552c018 100644 --- a/drivers/of/kexec.c +++ b/drivers/of/kexec.c @@ -128,7 +128,6 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size) { int ret, len; unsigned long tmp_addr; - unsigned long start_pfn, end_pfn; size_t tmp_size; const void *prop; @@ -144,17 +143,9 @@ int __init ima_get_kexec_buffer(void **addr, size_t *size) if (!tmp_size) return -ENOENT; - /* - * Calculate the PFNs for the buffer and ensure - * they are with in addressable memory. - */ - start_pfn = PHYS_PFN(tmp_addr); - end_pfn = PHYS_PFN(tmp_addr + tmp_size - 1); - if (!page_is_ram(start_pfn) || !page_is_ram(end_pfn)) { - pr_warn("IMA buffer at 0x%lx, size = 0x%zx beyond memory\n", - tmp_addr, tmp_size); - return -EINVAL; - } + ret = ima_validate_range(tmp_addr, tmp_size); + if (ret) + return ret; *addr = __va(tmp_addr); *size = tmp_size; From c5489d04337b47e93c0623e8145fcba3f5739efd Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 30 Dec 2025 22:16:09 -0800 Subject: [PATCH 041/107] x86/kexec: add a sanity check on previous kernel's ima kexec buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the second-stage kernel is booted via kexec with a limiting command line such as "mem=", the physical range that contains the carried over IMA measurement list may fall outside the truncated RAM leading to a kernel panic. 
BUG: unable to handle page fault for address: ffff97793ff47000 RIP: ima_restore_measurement_list+0xdc/0x45a #PF: error_code(0x0000) – not-present page Other architectures already validate the range with page_is_ram(), as done in commit cbf9c4b9617b ("of: check previous kernel's ima-kexec-buffer against memory bounds") do a similar check on x86. Without carrying the measurement list across kexec, the attestation would fail. Link: https://lkml.kernel.org/r/20251231061609.907170-4-harshit.m.mogalapalli@oracle.com Signed-off-by: Harshit Mogalapalli Fixes: b69a2afd5afc ("x86/kexec: Carry forward IMA measurement log on kexec") Reported-by: Paul Webb Reviewed-by: Mimi Zohar Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Betkov Cc: guoweikang Cc: Henry Willard Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Bohac Cc: Joel Granados Cc: Jonathan McDowell Cc: Mike Rapoport Cc: Sohil Mehta Cc: Sourabh Jain Cc: Thomas Gleinxer Cc: Yifei Liu Cc: Signed-off-by: Andrew Morton --- arch/x86/kernel/setup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 1b2edd07a3e1..383d4a4784f5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -439,9 +439,15 @@ int __init ima_free_kexec_buffer(void) int __init ima_get_kexec_buffer(void **addr, size_t *size) { + int ret; + if (!ima_kexec_buffer_size) return -ENOENT; + ret = ima_validate_range(ima_kexec_buffer_phys, ima_kexec_buffer_size); + if (ret) + return ret; + *addr = __va(ima_kexec_buffer_phys); *size = ima_kexec_buffer_size; From 77983f611fa61d749db4579d20908663e5a0895e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 30 Dec 2025 15:25:13 +0100 Subject: [PATCH 042/107] ocfs2: adjust function name reference There is no function dlm_mast_regions(). However, dlm_match_regions() is passed the buffer "local", which it uses internally, so it seems like dlm_match_regions() was intended. 
Link: https://lkml.kernel.org/r/20251230142513.95467-1-Julia.Lawall@inria.fr Signed-off-by: Julia Lawall Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2347a50f079b..cf3ca2f597c2 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1105,7 +1105,7 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len, mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, qr->qr_domain); - /* buffer used in dlm_mast_regions() */ + /* buffer used in dlm_match_regions() */ local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); if (!local) return -ENOMEM; From a7e53bfb43667dd0eaf046c1725105e2cfe3be7c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 5 Jan 2026 18:58:34 +0200 Subject: [PATCH 043/107] kho/abi: luo: make generated documentation more coherent Patch series "kho: ABI headers and Documentation updates". LUO started adding KHO ABI headers to include/linux/kho/abi, but the core parts of KHO and memblock are still using the old way for descriptions on their ABIs. Let's consolidate all things KHO in include/linux/kho/abi. And while on that, make some documentation updates to have more coherent KHO docs. This patch (of 6): LUO ABI description starts with "This header defines" which is fine in the header but reads weird in the generated html documentation. Update it to make the generated documentation coherent. 
Link: https://lkml.kernel.org/r/20260105165839.285270-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20260105165839.285270-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Jonathan Corbet Cc: Pasha Tatashin Cc: Jason Miu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index bb099c92e469..beb86847b544 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -8,10 +8,10 @@ /** * DOC: Live Update Orchestrator ABI * - * This header defines the stable Application Binary Interface used by the - * Live Update Orchestrator to pass state from a pre-update kernel to a - * post-update kernel. The ABI is built upon the Kexec HandOver framework - * and uses a Flattened Device Tree to describe the preserved data. + * Live Update Orchestrator uses the stable Application Binary Interface + * defined below to pass state from a pre-update kernel to a post-update + * kernel. The ABI is built upon the Kexec HandOver framework and uses a + * Flattened Device Tree to describe the preserved data. * * This interface is a contract. Any modification to the FDT structure, node * properties, compatible strings, or the layout of the `__packed` serialization From 32cb2729c956162e5ca96fe5509b38eb9561e8c0 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 5 Jan 2026 18:58:35 +0200 Subject: [PATCH 044/107] kho/abi: memfd: make generated documentation more coherent memfd preservation ABI description starts with "This header defines" which is fine in the header but reads weird in the generated html documentation. Update it to make the generated documentation coherent. 
Link: https://lkml.kernel.org/r/20260105165839.285270-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Jason Miu Cc: Jonathan Corbet Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/kho/abi/memfd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index da7d063474a1..c211c31334a3 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -17,8 +17,8 @@ /** * DOC: memfd Live Update ABI * - * This header defines the ABI for preserving the state of a memfd across a - * kexec reboot using the LUO. + * memfd uses the ABI defined below for preserving its state across a kexec + * reboot using the LUO. * * The state is serialized into a packed structure `struct memfd_luo_ser` * which is handed over to the next kernel via the KHO mechanism. From a6f4e56828029bc3b9a79910b38026fd2958915e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 5 Jan 2026 18:58:36 +0200 Subject: [PATCH 045/107] kho: docs: combine concepts and FDT documentation Currently index.rst in KHO documentation looks empty and sad as it only contains links to "Kexec Handover Concepts" and "KHO FDT" chapters. Inline contents of these chapters into index.rst to provide a single coherent chapter describing KHO. While on it, drop parts of the KHO FDT description that will be superseded by addition of KHO ABI documentation. 
[rppt@kernel.org: fix Documentation/core-api/kho/index.rst] Link: https://lkml.kernel.org/r/aV4bnHlBXGpT_FMc@kernel.org Link: https://lkml.kernel.org/r/20260105165839.285270-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Jason Miu Cc: Jonathan Corbet Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- Documentation/core-api/kho/concepts.rst | 74 ----------------------- Documentation/core-api/kho/fdt.rst | 80 ------------------------- Documentation/core-api/kho/index.rst | 77 ++++++++++++++++++++++-- Documentation/core-api/liveupdate.rst | 2 +- Documentation/mm/memfd_preservation.rst | 2 +- kernel/liveupdate/luo_core.c | 3 +- 6 files changed, 75 insertions(+), 163 deletions(-) delete mode 100644 Documentation/core-api/kho/concepts.rst delete mode 100644 Documentation/core-api/kho/fdt.rst diff --git a/Documentation/core-api/kho/concepts.rst b/Documentation/core-api/kho/concepts.rst deleted file mode 100644 index d626d1dbd678..000000000000 --- a/Documentation/core-api/kho/concepts.rst +++ /dev/null @@ -1,74 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0-or-later -.. _kho-concepts: - -======================= -Kexec Handover Concepts -======================= - -Kexec HandOver (KHO) is a mechanism that allows Linux to preserve memory -regions, which could contain serialized system states, across kexec. - -It introduces multiple concepts: - -KHO FDT -======= - -Every KHO kexec carries a KHO specific flattened device tree (FDT) blob -that describes preserved memory regions. These regions contain either -serialized subsystem states, or in-memory data that shall not be touched -across kexec. After KHO, subsystems can retrieve and restore preserved -memory regions from KHO FDT. 
- -KHO only uses the FDT container format and libfdt library, but does not -adhere to the same property semantics that normal device trees do: Properties -are passed in native endianness and standardized properties like ``regs`` and -``ranges`` do not exist, hence there are no ``#...-cells`` properties. - -KHO is still under development. The FDT schema is unstable and would change -in the future. - -Scratch Regions -=============== - -To boot into kexec, we need to have a physically contiguous memory range that -contains no handed over memory. Kexec then places the target kernel and initrd -into that region. The new kernel exclusively uses this region for memory -allocations before during boot up to the initialization of the page allocator. - -We guarantee that we always have such regions through the scratch regions: On -first boot KHO allocates several physically contiguous memory regions. Since -after kexec these regions will be used by early memory allocations, there is a -scratch region per NUMA node plus a scratch region to satisfy allocations -requests that do not require particular NUMA node assignment. -By default, size of the scratch region is calculated based on amount of memory -allocated during boot. The ``kho_scratch`` kernel command line option may be -used to explicitly define size of the scratch regions. -The scratch regions are declared as CMA when page allocator is initialized so -that their memory can be used during system lifetime. CMA gives us the -guarantee that no handover pages land in that region, because handover pages -must be at a static physical memory location and CMA enforces that only -movable pages can be located inside. - -After KHO kexec, we ignore the ``kho_scratch`` kernel command line option and -instead reuse the exact same region that was originally allocated. This allows -us to recursively execute any amount of KHO kexecs. 
Because we used this region -for boot memory allocations and as target memory for kexec blobs, some parts -of that memory region may be reserved. These reservations are irrelevant for -the next KHO, because kexec can overwrite even the original kernel. - -.. _kho-finalization-phase: - -KHO finalization phase -====================== - -To enable user space based kexec file loader, the kernel needs to be able to -provide the FDT that describes the current kernel's state before -performing the actual kexec. The process of generating that FDT is -called serialization. When the FDT is generated, some properties -of the system may become immutable because they are already written down -in the FDT. That state is called the KHO finalization phase. - -Public API -========== -.. kernel-doc:: kernel/liveupdate/kexec_handover.c - :export: diff --git a/Documentation/core-api/kho/fdt.rst b/Documentation/core-api/kho/fdt.rst deleted file mode 100644 index 62505285d60d..000000000000 --- a/Documentation/core-api/kho/fdt.rst +++ /dev/null @@ -1,80 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0-or-later - -======= -KHO FDT -======= - -KHO uses the flattened device tree (FDT) container format and libfdt -library to create and parse the data that is passed between the -kernels. The properties in KHO FDT are stored in native format. -It includes the physical address of an in-memory structure describing -all preserved memory regions, as well as physical addresses of KHO users' -own FDTs. Interpreting those sub FDTs is the responsibility of KHO users. - -KHO nodes and properties -======================== - -Property ``preserved-memory-map`` ---------------------------------- - -KHO saves a special property named ``preserved-memory-map`` under the root node. -This node contains the physical address of an in-memory structure for KHO to -preserve memory regions across kexec. 
- -Property ``compatible`` ------------------------ - -The ``compatible`` property determines compatibility between the kernel -that created the KHO FDT and the kernel that attempts to load it. -If the kernel that loads the KHO FDT is not compatible with it, the entire -KHO process will be bypassed. - -Property ``fdt`` ----------------- - -Generally, a KHO user serialize its state into its own FDT and instructs -KHO to preserve the underlying memory, such that after kexec, the new kernel -can recover its state from the preserved FDT. - -A KHO user thus can create a node in KHO root tree and save the physical address -of its own FDT in that node's property ``fdt`` . - -Examples -======== - -The following example demonstrates KHO FDT that preserves two memory -regions created with ``reserve_mem`` kernel command line parameter:: - - /dts-v1/; - - / { - compatible = "kho-v1"; - - preserved-memory-map = <0x40be16 0x1000000>; - - memblock { - fdt = <0x1517 0x1000000>; - }; - }; - -where the ``memblock`` node contains an FDT that is requested by the -subsystem memblock for preservation. The FDT contains the following -serialized data:: - - /dts-v1/; - - / { - compatible = "memblock-v1"; - - n1 { - compatible = "reserve-mem-v1"; - start = <0xc06b 0x4000000>; - size = <0x04 0x00>; - }; - - n2 { - compatible = "reserve-mem-v1"; - start = <0xc067 0x4000000>; - size = <0x04 0x00>; - }; - }; diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst index 0c63b0c5c143..1733b3c3e976 100644 --- a/Documentation/core-api/kho/index.rst +++ b/Documentation/core-api/kho/index.rst @@ -1,13 +1,80 @@ .. SPDX-License-Identifier: GPL-2.0-or-later +.. _kho-concepts: + ======================== Kexec Handover Subsystem ======================== -.. toctree:: - :maxdepth: 1 +Overview +======== - concepts - fdt +Kexec HandOver (KHO) is a mechanism that allows Linux to preserve memory +regions, which could contain serialized system states, across kexec. -.. 
only:: subproject and html +KHO uses :ref:`flattened device tree (FDT) ` to pass information about +the preserved state from pre-exec kernel to post-kexec kernel and :ref:`scratch +memory regions ` to ensure integrity of the preserved memory. + +.. _kho_fdt: + +KHO FDT +======= +Every KHO kexec carries a KHO specific flattened device tree (FDT) blob that +describes the preserved state. The FDT includes properties describing preserved +memory regions and nodes that hold subsystem specific state. + +The preserved memory regions contain either serialized subsystem states, or +in-memory data that shall not be touched across kexec. After KHO, subsystems +can retrieve and restore the preserved state from KHO FDT. + +Subsystems participating in KHO can define their own format for state +serialization and preservation. + +.. _kho_scratch: + +Scratch Regions +=============== + +To boot into kexec, we need to have a physically contiguous memory range that +contains no handed over memory. Kexec then places the target kernel and initrd +into that region. The new kernel exclusively uses this region for memory +allocations before during boot up to the initialization of the page allocator. + +We guarantee that we always have such regions through the scratch regions: On +first boot KHO allocates several physically contiguous memory regions. Since +after kexec these regions will be used by early memory allocations, there is a +scratch region per NUMA node plus a scratch region to satisfy allocations +requests that do not require particular NUMA node assignment. +By default, size of the scratch region is calculated based on amount of memory +allocated during boot. The ``kho_scratch`` kernel command line option may be +used to explicitly define size of the scratch regions. +The scratch regions are declared as CMA when page allocator is initialized so +that their memory can be used during system lifetime. 
CMA gives us the +guarantee that no handover pages land in that region, because handover pages +must be at a static physical memory location and CMA enforces that only +movable pages can be located inside. + +After KHO kexec, we ignore the ``kho_scratch`` kernel command line option and +instead reuse the exact same region that was originally allocated. This allows +us to recursively execute any amount of KHO kexecs. Because we used this region +for boot memory allocations and as target memory for kexec blobs, some parts +of that memory region may be reserved. These reservations are irrelevant for +the next KHO, because kexec can overwrite even the original kernel. + +.. _kho-finalization-phase: + +KHO finalization phase +====================== + +To enable user space based kexec file loader, the kernel needs to be able to +provide the FDT that describes the current kernel's state before +performing the actual kexec. The process of generating that FDT is +called serialization. When the FDT is generated, some properties +of the system may become immutable because they are already written down +in the FDT. That state is called the KHO finalization phase. 
+ +See Also +======== + +- :doc:`/admin-guide/mm/kho` diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst index 7960eb15a81f..e2aba13494cf 100644 --- a/Documentation/core-api/liveupdate.rst +++ b/Documentation/core-api/liveupdate.rst @@ -58,4 +58,4 @@ See Also ======== - :doc:`Live Update uAPI ` -- :doc:`/core-api/kho/concepts` +- :doc:`/core-api/kho/index` diff --git a/Documentation/mm/memfd_preservation.rst b/Documentation/mm/memfd_preservation.rst index 66e0fb6d5ef0..a8a5b476afd3 100644 --- a/Documentation/mm/memfd_preservation.rst +++ b/Documentation/mm/memfd_preservation.rst @@ -20,4 +20,4 @@ See Also ======== - :doc:`/core-api/liveupdate` -- :doc:`/core-api/kho/concepts` +- :doc:`/core-api/kho/index` diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index 944663d99dd9..a26c093eb8eb 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -35,8 +35,7 @@ * iommu, interrupts, vfio, participating filesystems, and memory management. * * LUO uses Kexec Handover to transfer memory state from the current kernel to - * the next kernel. For more details see - * Documentation/core-api/kho/concepts.rst. + * the next kernel. For more details see Documentation/core-api/kho/index.rst. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt From 5e1ea1e27b6ff237122ac6cb30e0b8ea4618f75f Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Mon, 5 Jan 2026 18:58:37 +0200 Subject: [PATCH 046/107] kho: introduce KHO FDT ABI header Introduce the `include/linux/kho/abi/kexec_handover.h` header file, which defines the stable ABI for the KHO mechanism. This header specifies how preserved data is passed between kernels using an FDT. The ABI contract includes the FDT structure, node properties, and the "kho-v1" compatible string. 
By centralizing these definitions, this header serves as the foundational agreement for inter-kernel communication of preserved states, ensuring forward compatibility and preventing misinterpretation of data across kexec transitions. Since the ABI definitions are now centralized in the header files, the YAML files that previously described the FDT interfaces are redundant. These redundant files have therefore been removed. Link: https://lkml.kernel.org/r/20260105165839.285270-5-rppt@kernel.org Signed-off-by: Jason Miu Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Jonathan Corbet Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- Documentation/core-api/kho/abi.rst | 16 ++++ Documentation/core-api/kho/bindings/kho.yaml | 43 ---------- .../core-api/kho/bindings/sub-fdt.yaml | 27 ------ Documentation/core-api/kho/index.rst | 9 ++ MAINTAINERS | 1 + include/linux/kho/abi/kexec_handover.h | 85 +++++++++++++++++++ kernel/liveupdate/kexec_handover.c | 19 ++--- 7 files changed, 120 insertions(+), 80 deletions(-) create mode 100644 Documentation/core-api/kho/abi.rst delete mode 100644 Documentation/core-api/kho/bindings/kho.yaml delete mode 100644 Documentation/core-api/kho/bindings/sub-fdt.yaml create mode 100644 include/linux/kho/abi/kexec_handover.h diff --git a/Documentation/core-api/kho/abi.rst b/Documentation/core-api/kho/abi.rst new file mode 100644 index 000000000000..a1ee0f481727 --- /dev/null +++ b/Documentation/core-api/kho/abi.rst @@ -0,0 +1,16 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +================== +Kexec Handover ABI +================== + +Core Kexec Handover ABI +======================== + +.. 
kernel-doc:: include/linux/kho/abi/kexec_handover.h + :doc: Kexec Handover ABI + +See Also +======== + +- :doc:`/admin-guide/mm/kho` diff --git a/Documentation/core-api/kho/bindings/kho.yaml b/Documentation/core-api/kho/bindings/kho.yaml deleted file mode 100644 index 11e8ab7b219d..000000000000 --- a/Documentation/core-api/kho/bindings/kho.yaml +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -%YAML 1.2 ---- -title: Kexec HandOver (KHO) root tree - -maintainers: - - Mike Rapoport - - Changyuan Lyu - -description: | - System memory preserved by KHO across kexec. - -properties: - compatible: - enum: - - kho-v1 - - preserved-memory-map: - description: | - physical address (u64) of an in-memory structure describing all preserved - folios and memory ranges. - -patternProperties: - "$[0-9a-f_]+^": - $ref: sub-fdt.yaml# - description: physical address of a KHO user's own FDT. - -required: - - compatible - - preserved-memory-map - -additionalProperties: false - -examples: - - | - kho { - compatible = "kho-v1"; - preserved-memory-map = <0xf0be16 0x1000000>; - - memblock { - fdt = <0x80cc16 0x1000000>; - }; - }; diff --git a/Documentation/core-api/kho/bindings/sub-fdt.yaml b/Documentation/core-api/kho/bindings/sub-fdt.yaml deleted file mode 100644 index b9a3d2d24850..000000000000 --- a/Documentation/core-api/kho/bindings/sub-fdt.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -%YAML 1.2 ---- -title: KHO users' FDT address - -maintainers: - - Mike Rapoport - - Changyuan Lyu - -description: | - Physical address of an FDT blob registered by a KHO user. - -properties: - fdt: - description: | - physical address (u64) of an FDT blob. 
- -required: - - fdt - -additionalProperties: false - -examples: - - | - memblock { - fdt = <0x80cc16 0x1000000>; - }; diff --git a/Documentation/core-api/kho/index.rst b/Documentation/core-api/kho/index.rst index 1733b3c3e976..dcc6a36cc134 100644 --- a/Documentation/core-api/kho/index.rst +++ b/Documentation/core-api/kho/index.rst @@ -31,6 +31,15 @@ can retrieve and restore the preserved state from KHO FDT. Subsystems participating in KHO can define their own format for state serialization and preservation. +KHO FDT and structures defined by the subsystems form an ABI between pre-kexec +and post-kexec kernels. This ABI is defined by header files in +``include/linux/kho/abi`` directory. + +.. toctree:: + :maxdepth: 1 + + abi.rst + .. _kho_scratch: Scratch Regions diff --git a/MAINTAINERS b/MAINTAINERS index 4dcbcb5c14f0..9d724a7ade71 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13968,6 +13968,7 @@ F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* F: include/linux/kexec_handover.h F: include/linux/kho/ +F: include/linux/kho/abi/ F: kernel/liveupdate/kexec_handover* F: lib/test_kho.c F: tools/testing/selftests/kho/ diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h new file mode 100644 index 000000000000..af9fa8c134c7 --- /dev/null +++ b/include/linux/kho/abi/kexec_handover.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* + * Copyright (C) 2023 Alexander Graf + * Copyright (C) 2025 Microsoft Corporation, Mike Rapoport + * Copyright (C) 2025 Google LLC, Changyuan Lyu + * Copyright (C) 2025 Google LLC, Jason Miu + */ + +#ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#define _LINUX_KHO_ABI_KEXEC_HANDOVER_H + +/** + * DOC: Kexec Handover ABI + * + * Kexec Handover uses the ABI defined below for passing preserved data from + * one kernel to the next. + * The ABI uses Flattened Device Tree (FDT) format. 
The first kernel creates an + * FDT which is then passed to the next kernel during a kexec handover. + * + * This interface is a contract. Any modification to the FDT structure, node + * properties, compatible string, or the layout of the data structures + * referenced here constitutes a breaking change. Such changes require + * incrementing the version number in KHO_FDT_COMPATIBLE to prevent a new kernel + * from misinterpreting data from an older kernel. Changes are allowed provided + * the compatibility version is incremented. However, backward/forward + * compatibility is only guaranteed for kernels supporting the same ABI version. + * + * FDT Structure Overview: + * The FDT serves as a central registry for physical + * addresses of preserved data structures and sub-FDTs. The first kernel + * populates this FDT with references to memory regions and other FDTs that + * need to persist across the kexec transition. The subsequent kernel then + * parses this FDT to locate and restore the preserved data.:: + * + * / { + * compatible = "kho-v1"; + * + * preserved-memory-map = <0x...>; + * + * { + * fdt = <0x...>; + * }; + * + * { + * fdt = <0x...>; + * }; + * ... ... + * { + * fdt = <0x...>; + * }; + * }; + * + * Root KHO Node (/): + * - compatible: "kho-v1" + * + * Indentifies the overall KHO ABI version. + * + * - preserved-memory-map: u64 + * + * Physical memory address pointing to the root of the + * preserved memory map data structure. + * + * Subnodes (): + * Subnodes can also be added to the root node to + * describe other preserved data blobs. The + * is provided by the subsystem that uses KHO for preserving its + * data. + * + * - fdt: u64 + * + * Physical address pointing to a subnode FDT blob that is also + * being preserved. + */ + +/* The compatible string for the KHO FDT root node. */ +#define KHO_FDT_COMPATIBLE "kho-v1" + +/* The FDT property for the preserved memory map. 
*/ +#define KHO_FDT_MEMORY_MAP_PROP_NAME "preserved-memory-map" + +/* The FDT property for sub-FDTs. */ +#define KHO_FDT_SUB_TREE_PROP_NAME "fdt" + +#endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index d4482b6e3cae..8f57d6e040af 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -33,10 +34,7 @@ #include "../kexec_internal.h" #include "kexec_handover_internal.h" -#define KHO_FDT_COMPATIBLE "kho-v1" -#define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" -#define PROP_SUB_FDT "fdt" - +/* The magic token for preserved pages */ #define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ /* @@ -378,7 +376,7 @@ static void kho_update_memory_map(struct khoser_mem_chunk *first_chunk) void *ptr; u64 phys; - ptr = fdt_getprop_w(kho_out.fdt, 0, PROP_PRESERVED_MEMORY_MAP, NULL); + ptr = fdt_getprop_w(kho_out.fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, NULL); /* Check and discard previous memory map */ phys = get_unaligned((u64 *)ptr); @@ -466,7 +464,7 @@ static phys_addr_t __init kho_get_mem_map_phys(const void *fdt) const void *mem_ptr; int len; - mem_ptr = fdt_getprop(fdt, 0, PROP_PRESERVED_MEMORY_MAP, &len); + mem_ptr = fdt_getprop(fdt, 0, KHO_FDT_MEMORY_MAP_PROP_NAME, &len); if (!mem_ptr || len != sizeof(u64)) { pr_err("failed to get preserved memory bitmaps\n"); return 0; @@ -727,7 +725,8 @@ int kho_add_subtree(const char *name, void *fdt) goto out_pack; } - err = fdt_setprop(root_fdt, off, PROP_SUB_FDT, &phys, sizeof(phys)); + err = fdt_setprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, + &phys, sizeof(phys)); if (err < 0) goto out_pack; @@ -758,7 +757,7 @@ void kho_remove_subtree(void *fdt) const u64 *val; int len; - val = fdt_getprop(root_fdt, off, PROP_SUB_FDT, &len); + val = fdt_getprop(root_fdt, off, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != 
sizeof(phys_addr_t)) continue; @@ -1305,7 +1304,7 @@ int kho_retrieve_subtree(const char *name, phys_addr_t *phys) if (offset < 0) return -ENOENT; - val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len); + val = fdt_getprop(fdt, offset, KHO_FDT_SUB_TREE_PROP_NAME, &len); if (!val || len != sizeof(*val)) return -EINVAL; @@ -1325,7 +1324,7 @@ static __init int kho_out_fdt_setup(void) err |= fdt_finish_reservemap(root); err |= fdt_begin_node(root, ""); err |= fdt_property_string(root, "compatible", KHO_FDT_COMPATIBLE); - err |= fdt_property(root, PROP_PRESERVED_MEMORY_MAP, &empty_mem_map, + err |= fdt_property(root, KHO_FDT_MEMORY_MAP_PROP_NAME, &empty_mem_map, sizeof(empty_mem_map)); err |= fdt_end_node(root); err |= fdt_finish(root); From ac2d8102c4b88713a8fa371d5d802fcff131d6ac Mon Sep 17 00:00:00 2001 From: Jason Miu Date: Mon, 5 Jan 2026 18:58:38 +0200 Subject: [PATCH 047/107] kho: relocate vmalloc preservation structure to KHO ABI header The `struct kho_vmalloc` defines the in-memory layout for preserving vmalloc regions across kexec. This layout is a contract between kernels and part of the KHO ABI. To reflect this relationship, the related structs and helper macros are relocated to the ABI header, `include/linux/kho/abi/kexec_handover.h`. This move places the structure's definition under the protection of the KHO_FDT_COMPATIBLE version string. The structure and its components are now also documented within the ABI header to describe the contract and prevent ABI breaks. 
[rppt@kernel.org: update comment, per Pratyush] Link: https://lkml.kernel.org/r/aW_Mqp6HcqLwQImS@kernel.org Link: https://lkml.kernel.org/r/20260105165839.285270-6-rppt@kernel.org Signed-off-by: Jason Miu Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Jonathan Corbet Cc: Pasha Tatashin Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- Documentation/core-api/kho/abi.rst | 6 ++ include/linux/kexec_handover.h | 27 +-------- include/linux/kho/abi/kexec_handover.h | 78 ++++++++++++++++++++++++++ include/linux/kho/abi/memfd.h | 2 +- kernel/liveupdate/kexec_handover.c | 15 ----- lib/test_kho.c | 1 + 6 files changed, 88 insertions(+), 41 deletions(-) diff --git a/Documentation/core-api/kho/abi.rst b/Documentation/core-api/kho/abi.rst index a1ee0f481727..1d9916adee23 100644 --- a/Documentation/core-api/kho/abi.rst +++ b/Documentation/core-api/kho/abi.rst @@ -10,6 +10,12 @@ Core Kexec Handover ABI .. kernel-doc:: include/linux/kho/abi/kexec_handover.h :doc: Kexec Handover ABI +vmalloc preservation ABI +======================== + +.. kernel-doc:: include/linux/kho/abi/kexec_handover.h + :doc: Kexec Handover ABI for vmalloc Preservation + See Also ======== diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 5f7b9de97e8d..a56ff3ffaf17 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -11,34 +11,11 @@ struct kho_scratch { phys_addr_t size; }; +struct kho_vmalloc; + struct folio; struct page; -#define DECLARE_KHOSER_PTR(name, type) \ - union { \ - phys_addr_t phys; \ - type ptr; \ - } name -#define KHOSER_STORE_PTR(dest, val) \ - ({ \ - typeof(val) v = val; \ - typecheck(typeof((dest).ptr), v); \ - (dest).phys = virt_to_phys(v); \ - }) -#define KHOSER_LOAD_PTR(src) \ - ({ \ - typeof(src) s = src; \ - (typeof((s).ptr))((s).phys ? 
phys_to_virt((s).phys) : NULL); \ - }) - -struct kho_vmalloc_chunk; -struct kho_vmalloc { - DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); - unsigned int total_pages; - unsigned short flags; - unsigned short order; -}; - #ifdef CONFIG_KEXEC_HANDOVER bool kho_is_enabled(void); bool is_kho_boot(void); diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index af9fa8c134c7..2201a0d2c159 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -10,6 +10,8 @@ #ifndef _LINUX_KHO_ABI_KEXEC_HANDOVER_H #define _LINUX_KHO_ABI_KEXEC_HANDOVER_H +#include + /** * DOC: Kexec Handover ABI * @@ -82,4 +84,80 @@ /* The FDT property for sub-FDTs. */ #define KHO_FDT_SUB_TREE_PROP_NAME "fdt" +/** + * DOC: Kexec Handover ABI for vmalloc Preservation + * + * The Kexec Handover ABI for preserving vmalloc'ed memory is defined by + * a set of structures and helper macros. The layout of these structures is a + * stable contract between kernels and is versioned by the KHO_FDT_COMPATIBLE + * string. + * + * The preservation is managed through a main descriptor &struct kho_vmalloc, + * which points to a linked list of &struct kho_vmalloc_chunk structures. These + * chunks contain the physical addresses of the preserved pages, allowing the + * next kernel to reconstruct the vmalloc area with the same content and layout. + * Helper macros are also defined for storing and loading pointers within + * these structures. + */ + +/* Helper macro to define a union for a serializable pointer. */ +#define DECLARE_KHOSER_PTR(name, type) \ + union { \ + u64 phys; \ + type ptr; \ + } name + +/* Stores the physical address of a serializable pointer. */ +#define KHOSER_STORE_PTR(dest, val) \ + ({ \ + typeof(val) v = val; \ + typecheck(typeof((dest).ptr), v); \ + (dest).phys = virt_to_phys(v); \ + }) + +/* Loads the stored physical address back to a pointer. 
*/ +#define KHOSER_LOAD_PTR(src) \ + ({ \ + typeof(src) s = src; \ + (typeof((s).ptr))((s).phys ? phys_to_virt((s).phys) : NULL); \ + }) + +/* + * This header is embedded at the beginning of each `kho_vmalloc_chunk` + * and contains a pointer to the next chunk in the linked list, + * stored as a physical address for handover. + */ +struct kho_vmalloc_hdr { + DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); +}; + +#define KHO_VMALLOC_SIZE \ + ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ + sizeof(u64)) + +/* + * Each chunk is a single page and is part of a linked list that describes + * a preserved vmalloc area. It contains the header with the link to the next + * chunk and a zero terminated array of physical addresses of the pages that + * make up the preserved vmalloc area. + */ +struct kho_vmalloc_chunk { + struct kho_vmalloc_hdr hdr; + u64 phys[KHO_VMALLOC_SIZE]; +}; + +static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); + +/* + * Describes a preserved vmalloc memory area, including the + * total number of pages, allocation flags, page order, and a pointer to the + * first chunk of physical page addresses. 
+ */ +struct kho_vmalloc { + DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); + unsigned int total_pages; + unsigned short flags; + unsigned short order; +}; + #endif /* _LINUX_KHO_ABI_KEXEC_HANDOVER_H */ diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h index c211c31334a3..68cb6303b846 100644 --- a/include/linux/kho/abi/memfd.h +++ b/include/linux/kho/abi/memfd.h @@ -12,7 +12,7 @@ #define _LINUX_KHO_ABI_MEMFD_H #include -#include +#include /** * DOC: memfd Live Update ABI diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 8f57d6e040af..66fcdda0ebdc 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -876,21 +876,6 @@ void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) } EXPORT_SYMBOL_GPL(kho_unpreserve_pages); -struct kho_vmalloc_hdr { - DECLARE_KHOSER_PTR(next, struct kho_vmalloc_chunk *); -}; - -#define KHO_VMALLOC_SIZE \ - ((PAGE_SIZE - sizeof(struct kho_vmalloc_hdr)) / \ - sizeof(phys_addr_t)) - -struct kho_vmalloc_chunk { - struct kho_vmalloc_hdr hdr; - phys_addr_t phys[KHO_VMALLOC_SIZE]; -}; - -static_assert(sizeof(struct kho_vmalloc_chunk) == PAGE_SIZE); - /* vmalloc flags KHO supports */ #define KHO_VMALLOC_SUPPORTED_FLAGS (VM_ALLOC | VM_ALLOW_HUGE_VMAP) diff --git a/lib/test_kho.c b/lib/test_kho.c index 47de56280795..3431daca6968 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -19,6 +19,7 @@ #include #include #include +#include #include From dd1e79ef6ca188678ece81a77d0076ae7403116c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 5 Jan 2026 18:58:39 +0200 Subject: [PATCH 048/107] kho/abi: add memblock ABI header Introduce KHO ABI header describing preservation ABI for memblock's reserve_mem regions and link the relevant documentation to KHO docs. 
[lukas.bulwahn@redhat.com: MAINTAINERS: adjust file entry in MEMBLOCK AND MEMORY MANAGEMENT INITIALIZATION] Link: https://lkml.kernel.org/r/20260107090438.22901-1-lukas.bulwahn@redhat.com [rppt@kernel.org: update reserved_mem node description, per Pratyush] Link: https://lkml.kernel.org/r/aW_M-HYZzx5SkbnZ@kernel.org Link: https://lkml.kernel.org/r/20260105165839.285270-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Jason Miu Cc: Jonathan Corbet Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- Documentation/core-api/kho/abi.rst | 6 ++ .../kho/bindings/memblock/memblock.yaml | 39 ---------- .../kho/bindings/memblock/reserve-mem.yaml | 40 ---------- MAINTAINERS | 2 +- include/linux/kho/abi/memblock.h | 73 +++++++++++++++++++ mm/memblock.c | 4 +- 6 files changed, 81 insertions(+), 83 deletions(-) delete mode 100644 Documentation/core-api/kho/bindings/memblock/memblock.yaml delete mode 100644 Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml create mode 100644 include/linux/kho/abi/memblock.h diff --git a/Documentation/core-api/kho/abi.rst b/Documentation/core-api/kho/abi.rst index 1d9916adee23..2e63be3486cf 100644 --- a/Documentation/core-api/kho/abi.rst +++ b/Documentation/core-api/kho/abi.rst @@ -16,6 +16,12 @@ vmalloc preservation ABI .. kernel-doc:: include/linux/kho/abi/kexec_handover.h :doc: Kexec Handover ABI for vmalloc Preservation +memblock preservation ABI +========================= + +.. 
kernel-doc:: include/linux/kho/abi/memblock.h + :doc: memblock kexec handover ABI + See Also ======== diff --git a/Documentation/core-api/kho/bindings/memblock/memblock.yaml b/Documentation/core-api/kho/bindings/memblock/memblock.yaml deleted file mode 100644 index d388c28eb91d..000000000000 --- a/Documentation/core-api/kho/bindings/memblock/memblock.yaml +++ /dev/null @@ -1,39 +0,0 @@ -# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -%YAML 1.2 ---- -title: Memblock reserved memory - -maintainers: - - Mike Rapoport - -description: | - Memblock can serialize its current memory reservations created with - reserve_mem command line option across kexec through KHO. - The post-KHO kernel can then consume these reservations and they are - guaranteed to have the same physical address. - -properties: - compatible: - enum: - - reserve-mem-v1 - -patternProperties: - "$[0-9a-f_]+^": - $ref: reserve-mem.yaml# - description: reserved memory regions - -required: - - compatible - -additionalProperties: false - -examples: - - | - memblock { - compatible = "memblock-v1"; - n1 { - compatible = "reserve-mem-v1"; - start = <0xc06b 0x4000000>; - size = <0x04 0x00>; - }; - }; diff --git a/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml b/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml deleted file mode 100644 index 10282d3d1bcd..000000000000 --- a/Documentation/core-api/kho/bindings/memblock/reserve-mem.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) -%YAML 1.2 ---- -title: Memblock reserved memory regions - -maintainers: - - Mike Rapoport - -description: | - Memblock can serialize its current memory reservations created with - reserve_mem command line option across kexec through KHO. - This object describes each such region. - -properties: - compatible: - enum: - - reserve-mem-v1 - - start: - description: | - physical address (u64) of the reserved memory region. 
- - size: - description: | - size (u64) of the reserved memory region. - -required: - - compatible - - start - - size - -additionalProperties: false - -examples: - - | - n1 { - compatible = "reserve-mem-v1"; - start = <0xc06b 0x4000000>; - size = <0x04 0x00>; - }; diff --git a/MAINTAINERS b/MAINTAINERS index 9d724a7ade71..92b377cd131b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16396,7 +16396,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git for-next T: git git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock.git fixes F: Documentation/core-api/boot-time-mm.rst -F: Documentation/core-api/kho/bindings/memblock/* +F: include/linux/kho/abi/memblock.h F: include/linux/memblock.h F: mm/bootmem_info.c F: mm/memblock.c diff --git a/include/linux/kho/abi/memblock.h b/include/linux/kho/abi/memblock.h new file mode 100644 index 000000000000..27b042f470e1 --- /dev/null +++ b/include/linux/kho/abi/memblock.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_KHO_ABI_MEMBLOCK_H +#define _LINUX_KHO_ABI_MEMBLOCK_H + +/** + * DOC: memblock kexec handover ABI + * + * Memblock can serialize its current memory reservations created with + * reserve_mem command line option across kexec through KHO. + * The post-KHO kernel can then consume these reservations and they are + * guaranteed to have the same physical address. + * + * The state is serialized using Flattened Device Tree (FDT) format. Any + * modification to the FDT structure, node properties, or the compatible + * strings constitutes a breaking change. Such changes require incrementing the + * version number in the relevant `_COMPATIBLE` string to prevent a new kernel + * from misinterpreting data from an old kernel. + * + * Changes are allowed provided the compatibility version is incremented. + * However, backward/forward compatibility is only guaranteed for kernels + * supporting the same ABI version. 
+ * - compatible: "memblock-v1"
+ *
+ * Identifies the overall memblock ABI version.
*/ +#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" + +#endif /* _LINUX_KHO_ABI_MEMBLOCK_H */ diff --git a/mm/memblock.c b/mm/memblock.c index 905d06b16348..6cff515d82f4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -21,6 +21,7 @@ #ifdef CONFIG_KEXEC_HANDOVER #include #include +#include #endif /* CONFIG_KEXEC_HANDOVER */ #include @@ -2442,9 +2443,6 @@ int reserve_mem_release_by_name(const char *name) } #ifdef CONFIG_KEXEC_HANDOVER -#define MEMBLOCK_KHO_FDT "memblock" -#define MEMBLOCK_KHO_NODE_COMPATIBLE "memblock-v1" -#define RESERVE_MEM_KHO_NODE_COMPATIBLE "reserve-mem-v1" static int __init reserved_mem_preserve(void) { From dbac35bee8fc844c2d8d6417af874a170a44d41f Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Tue, 6 Jan 2026 15:01:40 +0100 Subject: [PATCH 049/107] lib/Kconfig.debug: fix BOOTPARAM_HUNG_TASK_PANIC comment The comment for CONFIG_BOOTPARAM_HUNG_TASK_PANIC says: Say N if unsure. but since commit 9544f9e6947f ("hung_task: panic when there are more than N hung tasks at the same time"), N is not a valid value for the option, leading to a warning at build time: .config:11736:warning: symbol value 'n' invalid for BOOTPARAM_HUNG_TASK_PANIC as well as an error when given to menuconfig. Fix the comment to say '0' instead of 'N'. Link: https://lkml.kernel.org/r/20260106140140.136446-1-tglozar@redhat.com Fixes: 9544f9e6947f ("hung_task: panic when there are more than N hung tasks at the same time") Signed-off-by: Tomas Glozar Reported-by: Johnny Mnemonic Reviewed-by: Lance Yang Cc: Li RongQing Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3a31bbf53425..2122d5cec34d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1274,7 +1274,7 @@ config BOOTPARAM_HUNG_TASK_PANIC high-availability systems that have uptime guarantees and where a hung tasks must be resolved ASAP. - Say N if unsure. + Say 0 if unsure. 
config DETECT_HUNG_TASK_BLOCKER bool "Dump Hung Tasks Blocker" From c62e7e6444cd75dfea2609646f25c66f28b95082 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Sun, 4 Jan 2026 19:05:04 +0530 Subject: [PATCH 050/107] ocfs2: add check for free bits before allocation in ocfs2_move_extent() Add a check to verify the group descriptor has enough free bits before attempting allocation in ocfs2_move_extent(). This prevents a kernel BUG_ON crash in ocfs2_block_group_set_bits() when the move_extents ioctl is called on a crafted or corrupted filesystem. The existing validation in ocfs2_validate_gd_self() only checks static metadata consistency (bg_free_bits_count <= bg_bits) when the descriptor is first read from disk. However, during move_extents operations, multiple allocations can exhaust the free bits count below the requested allocation size, triggering BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits). The debug trace shows the issue clearly: - Block group 32 validated with bg_free_bits_count=427 - Repeated allocations decreased count: 427 -> 171 -> 43 -> ... -> 1 - Final request for 2 bits with only 1 available triggers BUG_ON By adding an early check in ocfs2_move_extent() right after ocfs2_find_victim_alloc_group(), we return -ENOSPC gracefully instead of crashing the kernel. This also avoids unnecessary work in ocfs2_probe_alloc_group() and __ocfs2_move_extent() when the allocation will fail. 
Link: https://lkml.kernel.org/r/20260104133504.14810-1-kartikey406@gmail.com Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+7960178e777909060224@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7960178e777909060224 Link: https://lore.kernel.org/all/20251231115801.293726-1-kartikey406@gmail.com/T/ [v1] Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Heming Zhao Signed-off-by: Andrew Morton --- fs/ocfs2/move_extents.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 99637e34d9da..e3cdf8788484 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -662,6 +662,12 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_commit; } + gd = (struct ocfs2_group_desc *)gd_bh->b_data; + if (le16_to_cpu(gd->bg_free_bits_count) < len) { + ret = -ENOSPC; + goto out_commit; + } + /* * probe the victim cluster group to find a proper * region to fit wanted movement, it even will perform @@ -682,7 +688,6 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_commit; } - gd = (struct ocfs2_group_desc *)gd_bh->b_data; ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, le16_to_cpu(gd->bg_chain)); if (ret) { From 2bbd9e1d14d6156180d21cc871a51a3bd1839c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mingzhu=2Ewang=28=E7=8E=8B=E6=98=8E=E7=8F=A0=29?= Date: Wed, 7 Jan 2026 08:15:32 +0000 Subject: [PATCH 051/107] kernel/fork: update obsolete use_mm references to kthread_use_mm The comment for get_task_mm() in kernel/fork.c incorrectly references the deprecated function `use_mm()`, which has been renamed to `kthread_use_mm()` in kernel/kthread.c. This patch updates the documentation to reflect the current function names, ensuring accuracy when developers refer to the kernel thread memory context API. No functional changes were introduced. 
Link: https://lkml.kernel.org/r/KUZPR04MB8965F954108B4DD7E8FFDB2B8F84A@KUZPR04MB8965.apcprd04.prod.outlook.com Signed-off-by: mingzhu.wang Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Jiazi Li Cc: Juri Lelli Cc: Kees Cook Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index b21eccc9e11c..f5ad5de49d68 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1356,7 +1356,7 @@ struct file *get_task_exe_file(struct task_struct *task) * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning - * this kernel workthread has transiently adopted a user mm with use_mm, + * this kernel workthread has transiently adopted a user mm with kthread_use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. From 105ddfb2d2b3acec7a7d9695463df48733d91e6c Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 7 Jan 2026 08:28:46 +0000 Subject: [PATCH 052/107] rust: task: restrict Task::group_leader() to current MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Task::group_leader() method currently allows you to access the group_leader() of any task, for example one you hold a refcount to. But this is not safe in general since the group leader could change when a task exits. See for example commit a15f37a40145c ("kernel/sys.c: fix the racy usage of task_lock(tsk->group_leader) in sys_prlimit64() paths"). All existing users of Task::group_leader() call this method on current, which is guaranteed running, so there's not an actual issue in Rust code today. 
But to prevent code in the future from making this mistake, restrict Task::group_leader() so that it can only be called on current. There are some other cases where accessing task->group_leader is okay. For example it can be safe if you hold tasklist_lock or rcu_read_lock(). However, only supporting current->group_leader is sufficient for all in-tree Rust users of group_leader right now. Safe Rust functionality for accessing it under rcu or while holding tasklist_lock may be added in the future if required by any future Rust module. This patch is a bugfix in that it prevents users of this API from writing incorrect code. It doesn't change behavior of correct code. Link: https://lkml.kernel.org/r/20260107-task-group-leader-v2-1-8fbf816f2a2f@google.com Signed-off-by: Alice Ryhl Fixes: 313c4281bc9d ("rust: add basic `Task`") Reported-by: Oleg Nesterov Closes: https://lore.kernel.org/all/aTLnV-5jlgfk1aRK@redhat.com/ Reviewed-by: Boqun Feng Reviewed-by: Gary Guo Cc: Andreas Hindborg Cc: Benno Lossin Cc: "Björn Roy Baron" Cc: Björn Roy Baron Cc: Christian Brauner Cc: Danilo Krummrich Cc: FUJITA Tomonori Cc: Miguel Ojeda Cc: Panagiotis Foliadis Cc: Shankari Anand Cc: Trevor Gross Signed-off-by: Andrew Morton --- rust/kernel/task.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 49fad6de0674..cc907fb531bc 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -204,18 +204,6 @@ impl Task { self.0.get() } - /// Returns the group leader of the given task. - pub fn group_leader(&self) -> &Task { - // SAFETY: The group leader of a task never changes after initialization, so reading this - // field is not a data race. 
- let ptr = unsafe { *ptr::addr_of!((*self.as_ptr()).group_leader) }; - - // SAFETY: The lifetime of the returned task reference is tied to the lifetime of `self`, - // and given that a task has a reference to its group leader, we know it must be valid for - // the lifetime of the returned task reference. - unsafe { &*ptr.cast() } - } - /// Returns the PID of the given task. pub fn pid(&self) -> Pid { // SAFETY: The pid of a task never changes after initialization, so reading this field is @@ -345,6 +333,18 @@ impl CurrentTask { // `release_task()` call. Some(unsafe { PidNamespace::from_ptr(active_ns) }) } + + /// Returns the group leader of the current task. + pub fn group_leader(&self) -> &Task { + // SAFETY: The group leader of a task never changes while the task is running, and `self` + // is the current task, which is guaranteed running. + let ptr = unsafe { (*self.as_ptr()).group_leader }; + + // SAFETY: `current->group_leader` stays valid for at least the duration in which `current` + // is running, and the signature of this function ensures that the returned `&Task` can + // only be used while `current` is still valid, thus still running. + unsafe { &*ptr.cast() } + } } // SAFETY: The type invariants guarantee that `Task` is always refcounted. From bf45794244ca1fb1c135754f36ff765eea01f9e6 Mon Sep 17 00:00:00 2001 From: Kir Chou Date: Thu, 8 Jan 2026 21:07:53 +0900 Subject: [PATCH 053/107] lib/glob: convert selftest to KUnit This patch converts the existing glob selftest (lib/globtest.c) to use the KUnit framework (lib/tests/glob_kunit.c). The new test: - Migrates all 64 test cases from the original test to the KUnit suite. - Removes the custom 'verbose' module parameter as KUnit handles logging. - Updates Kconfig.debug and Makefile to support the new KUnit test. - Updates Kconfig and Makefile to remove the original selftest. - Updates GLOB_SELFTEST to GLOB_KUNIT_TEST for arch/m68k/configs. 
This commit is verified by `./tools/testing/kunit/kunit.py run' with the .kunit/.kunitconfig: CONFIG_KUNIT=y CONFIG_GLOB_KUNIT_TEST=y Link: https://lkml.kernel.org/r/20260108120753.27339-1-note351@hotmail.com Signed-off-by: Kir Chou Acked-by: Geert Uytterhoeven Reviewed-by: David Gow Reviewed-by: Kuan-Wei Chiu Cc: Signed-off-by: Andrew Morton --- arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - lib/Kconfig | 13 --- lib/Kconfig.debug | 13 +++ lib/Makefile | 1 - lib/globtest.c | 167 --------------------------- lib/tests/Makefile | 1 + lib/tests/glob_kunit.c | 125 ++++++++++++++++++++ 18 files changed, 139 insertions(+), 193 deletions(-) delete mode 100644 lib/globtest.c create mode 100644 lib/tests/glob_kunit.c diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 46598efbea54..3c87c1d181a6 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -600,7 +600,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 63bef7a6d858..03eaace46fe7 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -557,7 +557,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set 
CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 1342adfbd855..61228b9d2c2a 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -577,7 +577,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 484f21a2da37..83fcc12916c5 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -549,7 +549,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index ce97c816aa21..84d477e95fe8 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -559,7 +559,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index f5b57ea2d681..b1e911a138a0 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -576,7 +576,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 85efdb31c898..0a2c3ac6dc7f 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -663,7 +663,6 
@@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 7102579b83d3..2087fe4af3d6 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -549,7 +549,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 18c0493ed0ff..4af83b643da1 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -550,7 +550,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 1b3a34ab1c74..56c303097050 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -566,7 +566,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 1a41a1c6bde1..de2a5b27d408 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -547,7 +547,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/arch/m68k/configs/sun3x_defconfig 
b/arch/m68k/configs/sun3x_defconfig index 8f182684e54b..297b8edcff6d 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -547,7 +547,6 @@ CONFIG_CRYPTO_USER_API_AEAD=m CONFIG_PRIME_NUMBERS=m CONFIG_CRC_BENCHMARK=y CONFIG_XZ_DEC_TEST=m -CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set CONFIG_MAGIC_SYSRQ=y CONFIG_TEST_LOCKUP=m diff --git a/lib/Kconfig b/lib/Kconfig index 2923924bea78..0f2fb9610647 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -430,19 +430,6 @@ config GLOB are compiling an out-of tree driver which tells you that it depends on this. -config GLOB_SELFTEST - tristate "glob self-test on init" - depends on GLOB - help - This option enables a simple self-test of the glob_match - function on startup. It is primarily useful for people - working on the code to ensure they haven't introduced any - regressions. - - It only adds a little bit of code and slows kernel boot (or - module load) by a small amount, so you're welcome to play with - it, but you probably don't need it. - # # Netlink attribute parsing support is select'ed if needed # diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2122d5cec34d..17d759a04021 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3364,6 +3364,19 @@ config PRIME_NUMBERS_KUNIT_TEST If unsure, say N +config GLOB_KUNIT_TEST + tristate "Glob matching test" if !KUNIT_ALL_TESTS + depends on GLOB + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the glob functions at runtime. + + This test suite verifies the correctness of glob_match() across various + scenarios, including edge cases. 
+ + If unsure, say N + endif # RUNTIME_TESTING_MENU config ARCH_USE_MEMTEST diff --git a/lib/Makefile b/lib/Makefile index 1f87a174a317..9839f40af5dc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -224,7 +224,6 @@ obj-$(CONFIG_CLOSURES) += closure.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o -obj-$(CONFIG_GLOB_SELFTEST) += globtest.o obj-$(CONFIG_DIMLIB) += dim/ obj-$(CONFIG_SIGNATURE) += digsig.o diff --git a/lib/globtest.c b/lib/globtest.c deleted file mode 100644 index d8e97d43b905..000000000000 --- a/lib/globtest.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Extracted fronm glob.c - */ - -#include -#include -#include -#include - -/* Boot with "glob.verbose=1" to show successful tests, too */ -static bool verbose = false; -module_param(verbose, bool, 0); - -struct glob_test { - char const *pat, *str; - bool expected; -}; - -static bool __pure __init test(char const *pat, char const *str, bool expected) -{ - bool match = glob_match(pat, str); - bool success = match == expected; - - /* Can't get string literals into a particular section, so... */ - static char const msg_error[] __initconst = - KERN_ERR "glob: \"%s\" vs. \"%s\": %s *** ERROR ***\n"; - static char const msg_ok[] __initconst = - KERN_DEBUG "glob: \"%s\" vs. \"%s\": %s OK\n"; - static char const mismatch[] __initconst = "mismatch"; - char const *message; - - if (!success) - message = msg_error; - else if (verbose) - message = msg_ok; - else - return success; - - printk(message, pat, str, mismatch + 3*match); - return success; -} - -/* - * The tests are all jammed together in one array to make it simpler - * to place that array in the .init.rodata section. The obvious - * "array of structures containing char *" has no way to force the - * pointed-to strings to be in a particular section. - * - * Anyway, a test consists of: - * 1. Expected glob_match result: '1' or '0'. - * 2. Pattern to match: null-terminated string - * 3. 
String to match against: null-terminated string - * - * The list of tests is terminated with a final '\0' instead of - * a glob_match result character. - */ -static char const glob_tests[] __initconst = - /* Some basic tests */ - "1" "a\0" "a\0" - "0" "a\0" "b\0" - "0" "a\0" "aa\0" - "0" "a\0" "\0" - "1" "\0" "\0" - "0" "\0" "a\0" - /* Simple character class tests */ - "1" "[a]\0" "a\0" - "0" "[a]\0" "b\0" - "0" "[!a]\0" "a\0" - "1" "[!a]\0" "b\0" - "1" "[ab]\0" "a\0" - "1" "[ab]\0" "b\0" - "0" "[ab]\0" "c\0" - "1" "[!ab]\0" "c\0" - "1" "[a-c]\0" "b\0" - "0" "[a-c]\0" "d\0" - /* Corner cases in character class parsing */ - "1" "[a-c-e-g]\0" "-\0" - "0" "[a-c-e-g]\0" "d\0" - "1" "[a-c-e-g]\0" "f\0" - "1" "[]a-ceg-ik[]\0" "a\0" - "1" "[]a-ceg-ik[]\0" "]\0" - "1" "[]a-ceg-ik[]\0" "[\0" - "1" "[]a-ceg-ik[]\0" "h\0" - "0" "[]a-ceg-ik[]\0" "f\0" - "0" "[!]a-ceg-ik[]\0" "h\0" - "0" "[!]a-ceg-ik[]\0" "]\0" - "1" "[!]a-ceg-ik[]\0" "f\0" - /* Simple wild cards */ - "1" "?\0" "a\0" - "0" "?\0" "aa\0" - "0" "??\0" "a\0" - "1" "?x?\0" "axb\0" - "0" "?x?\0" "abx\0" - "0" "?x?\0" "xab\0" - /* Asterisk wild cards (backtracking) */ - "0" "*??\0" "a\0" - "1" "*??\0" "ab\0" - "1" "*??\0" "abc\0" - "1" "*??\0" "abcd\0" - "0" "??*\0" "a\0" - "1" "??*\0" "ab\0" - "1" "??*\0" "abc\0" - "1" "??*\0" "abcd\0" - "0" "?*?\0" "a\0" - "1" "?*?\0" "ab\0" - "1" "?*?\0" "abc\0" - "1" "?*?\0" "abcd\0" - "1" "*b\0" "b\0" - "1" "*b\0" "ab\0" - "0" "*b\0" "ba\0" - "1" "*b\0" "bb\0" - "1" "*b\0" "abb\0" - "1" "*b\0" "bab\0" - "1" "*bc\0" "abbc\0" - "1" "*bc\0" "bc\0" - "1" "*bc\0" "bbc\0" - "1" "*bc\0" "bcbc\0" - /* Multiple asterisks (complex backtracking) */ - "1" "*ac*\0" "abacadaeafag\0" - "1" "*ac*ae*ag*\0" "abacadaeafag\0" - "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" - "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" - "1" "*abcd*\0" "abcabcabcabcdefg\0" - "1" "*ab*cd*\0" "abcabcabcabcdefg\0" - "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" - "0" "*abcd*\0" "abcabcabcabcefg\0" - "0" "*ab*cd*\0" 
"abcabcabcabcefg\0"; - -static int __init glob_init(void) -{ - unsigned successes = 0; - unsigned n = 0; - char const *p = glob_tests; - static char const message[] __initconst = - KERN_INFO "glob: %u self-tests passed, %u failed\n"; - - /* - * Tests are jammed together in a string. The first byte is '1' - * or '0' to indicate the expected outcome, or '\0' to indicate the - * end of the tests. Then come two null-terminated strings: the - * pattern and the string to match it against. - */ - while (*p) { - bool expected = *p++ & 1; - char const *pat = p; - - p += strlen(p) + 1; - successes += test(pat, p, expected); - p += strlen(p) + 1; - n++; - } - - n -= successes; - printk(message, successes, n); - - /* What's the errno for "kernel bug detected"? Guess... */ - return n ? -ECANCELED : 0; -} - -/* We need a dummy exit function to allow unload */ -static void __exit glob_fini(void) { } - -module_init(glob_init); -module_exit(glob_fini); - -MODULE_DESCRIPTION("glob(7) matching tests"); -MODULE_LICENSE("Dual MIT/GPL"); diff --git a/lib/tests/Makefile b/lib/tests/Makefile index 088b80d16383..ab3e74d0da9e 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -19,6 +19,7 @@ CFLAGS_fortify_kunit.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE) obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o +obj-$(CONFIG_GLOB_KUNIT_TEST) += glob_kunit.o obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o diff --git a/lib/tests/glob_kunit.c b/lib/tests/glob_kunit.c new file mode 100644 index 000000000000..362b1eda8e5b --- /dev/null +++ b/lib/tests/glob_kunit.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT OR GPL-2.0 +/* + * Test cases for glob functions. + */ + +#include +#include +#include + +/** + * struct glob_test_case - Test case for glob matching. + * @pat: Pattern to match. 
+ * @str: String to match against. + * @expected: Expected glob_match result, true if matched. + */ +struct glob_test_case { + const char *pat; + const char *str; + bool expected; +}; + +static const struct glob_test_case glob_test_cases[] = { + /* Some basic tests */ + { .pat = "a", .str = "a", .expected = true }, + { .pat = "a", .str = "b", .expected = false }, + { .pat = "a", .str = "aa", .expected = false }, + { .pat = "a", .str = "", .expected = false }, + { .pat = "", .str = "", .expected = true }, + { .pat = "", .str = "a", .expected = false }, + /* Simple character class tests */ + { .pat = "[a]", .str = "a", .expected = true }, + { .pat = "[a]", .str = "b", .expected = false }, + { .pat = "[!a]", .str = "a", .expected = false }, + { .pat = "[!a]", .str = "b", .expected = true }, + { .pat = "[ab]", .str = "a", .expected = true }, + { .pat = "[ab]", .str = "b", .expected = true }, + { .pat = "[ab]", .str = "c", .expected = false }, + { .pat = "[!ab]", .str = "c", .expected = true }, + { .pat = "[a-c]", .str = "b", .expected = true }, + { .pat = "[a-c]", .str = "d", .expected = false }, + /* Corner cases in character class parsing */ + { .pat = "[a-c-e-g]", .str = "-", .expected = true }, + { .pat = "[a-c-e-g]", .str = "d", .expected = false }, + { .pat = "[a-c-e-g]", .str = "f", .expected = true }, + { .pat = "[]a-ceg-ik[]", .str = "a", .expected = true }, + { .pat = "[]a-ceg-ik[]", .str = "]", .expected = true }, + { .pat = "[]a-ceg-ik[]", .str = "[", .expected = true }, + { .pat = "[]a-ceg-ik[]", .str = "h", .expected = true }, + { .pat = "[]a-ceg-ik[]", .str = "f", .expected = false }, + { .pat = "[!]a-ceg-ik[]", .str = "h", .expected = false }, + { .pat = "[!]a-ceg-ik[]", .str = "]", .expected = false }, + { .pat = "[!]a-ceg-ik[]", .str = "f", .expected = true }, + /* Simple wild cards */ + { .pat = "?", .str = "a", .expected = true }, + { .pat = "?", .str = "aa", .expected = false }, + { .pat = "??", .str = "a", .expected = false }, + { .pat = "?x?", 
.str = "axb", .expected = true }, + { .pat = "?x?", .str = "abx", .expected = false }, + { .pat = "?x?", .str = "xab", .expected = false }, + /* Asterisk wild cards (backtracking) */ + { .pat = "*??", .str = "a", .expected = false }, + { .pat = "*??", .str = "ab", .expected = true }, + { .pat = "*??", .str = "abc", .expected = true }, + { .pat = "*??", .str = "abcd", .expected = true }, + { .pat = "??*", .str = "a", .expected = false }, + { .pat = "??*", .str = "ab", .expected = true }, + { .pat = "??*", .str = "abc", .expected = true }, + { .pat = "??*", .str = "abcd", .expected = true }, + { .pat = "?*?", .str = "a", .expected = false }, + { .pat = "?*?", .str = "ab", .expected = true }, + { .pat = "?*?", .str = "abc", .expected = true }, + { .pat = "?*?", .str = "abcd", .expected = true }, + { .pat = "*b", .str = "b", .expected = true }, + { .pat = "*b", .str = "ab", .expected = true }, + { .pat = "*b", .str = "ba", .expected = false }, + { .pat = "*b", .str = "bb", .expected = true }, + { .pat = "*b", .str = "abb", .expected = true }, + { .pat = "*b", .str = "bab", .expected = true }, + { .pat = "*bc", .str = "abbc", .expected = true }, + { .pat = "*bc", .str = "bc", .expected = true }, + { .pat = "*bc", .str = "bbc", .expected = true }, + { .pat = "*bc", .str = "bcbc", .expected = true }, + /* Multiple asterisks (complex backtracking) */ + { .pat = "*ac*", .str = "abacadaeafag", .expected = true }, + { .pat = "*ac*ae*ag*", .str = "abacadaeafag", .expected = true }, + { .pat = "*a*b*[bc]*[ef]*g*", .str = "abacadaeafag", .expected = true }, + { .pat = "*a*b*[ef]*[cd]*g*", .str = "abacadaeafag", .expected = false }, + { .pat = "*abcd*", .str = "abcabcabcabcdefg", .expected = true }, + { .pat = "*ab*cd*", .str = "abcabcabcabcdefg", .expected = true }, + { .pat = "*abcd*abcdef*", .str = "abcabcdabcdeabcdefg", .expected = true }, + { .pat = "*abcd*", .str = "abcabcabcabcefg", .expected = false }, + { .pat = "*ab*cd*", .str = "abcabcabcabcefg", .expected = false }, 
+}; + +static void glob_case_to_desc(const struct glob_test_case *t, char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "pat:\"%s\" str:\"%s\"", t->pat, t->str); +} + +KUNIT_ARRAY_PARAM(glob, glob_test_cases, glob_case_to_desc); + +static void glob_test_match(struct kunit *test) +{ + const struct glob_test_case *params = test->param_value; + + KUNIT_EXPECT_EQ_MSG(test, + glob_match(params->pat, params->str), + params->expected, + "Pattern: \"%s\", String: \"%s\", Expected: %d", + params->pat, params->str, params->expected); +} + +static struct kunit_case glob_kunit_test_cases[] = { + KUNIT_CASE_PARAM(glob_test_match, glob_gen_params), + {} +}; + +static struct kunit_suite glob_test_suite = { + .name = "glob", + .test_cases = glob_kunit_test_cases, +}; + +kunit_test_suite(glob_test_suite); +MODULE_DESCRIPTION("Test cases for glob functions"); +MODULE_LICENSE("Dual MIT/GPL"); From 77ce1b4cd08fcdd049001fdf5f59c014fb4b7711 Mon Sep 17 00:00:00 2001 From: Long Wei Date: Wed, 7 Jan 2026 10:24:27 +0800 Subject: [PATCH 054/107] kho: test: clean up residual memory upon test_kho module unload During the initialization phase, the test_kho module invokes the kho_preserve_folio function, which internally configures bitmaps within kho_mem_track and establishes chunk linked lists in KHO. Upon unloading the test_kho module, it is necessary to clean up these states. 
Link: https://lkml.kernel.org/r/20260107022427.4114424-1-longwei27@huawei.com Signed-off-by: Long Wei Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: hewenliang Cc: Pasha Tatashin Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- lib/test_kho.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/test_kho.c b/lib/test_kho.c index 3431daca6968..a20fafaf9846 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -340,11 +340,15 @@ module_init(kho_test_init); static void kho_test_cleanup(void) { + /* unpreserve and free the data stored in folios */ + kho_test_unpreserve_data(&kho_test_state); for (int i = 0; i < kho_test_state.nr_folios; i++) folio_put(kho_test_state.folios[i]); kvfree(kho_test_state.folios); - vfree(kho_test_state.folios_info); + + /* Unpreserve and release the FDT folio */ + kho_unpreserve_folio(kho_test_state.fdt); folio_put(kho_test_state.fdt); } From 25929dae28f528d7d74992edabd38bf3c374e485 Mon Sep 17 00:00:00 2001 From: Long Wei Date: Tue, 16 Dec 2025 19:44:00 +0800 Subject: [PATCH 055/107] kho: remove duplicate header file references kexec_handover_internal.h is included twice in kexec_handover.c. Remove the redundant first inclusion to eliminate the duplication. Link: https://lkml.kernel.org/r/20251216114400.2677311-1-longwei27@huawei.com Signed-off-by: Long Wei Reviewed-by: Pasha Tatashin Cc: Alexander Graf Cc: hewenliang Cc: Mike Rapoport Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 66fcdda0ebdc..fbe109a0d858 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -25,7 +25,6 @@ #include -#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. 
From 8cafcb881364af5ef3a8b9fed4db254054033d8a Mon Sep 17 00:00:00 2001 From: Zhiyu Zhang Date: Thu, 1 Jan 2026 19:11:48 +0800 Subject: [PATCH 056/107] fat: avoid parent link count underflow in rmdir Corrupted FAT images can leave a directory inode with an incorrect i_nlink (e.g. 2 even though subdirectories exist). rmdir then unconditionally calls drop_nlink(dir) and can drive i_nlink to 0, triggering the WARN_ON in drop_nlink(). Add a sanity check in vfat_rmdir() and msdos_rmdir(): only drop the parent link count when it is at least 3, otherwise report a filesystem error. Link: https://lkml.kernel.org/r/20260101111148.1437-1-zhiyuzhang999@gmail.com Fixes: 9a53c3a783c2 ("[PATCH] r/o bind mounts: unlink: monitor i_nlink") Signed-off-by: Zhiyu Zhang Reported-by: Zhiyu Zhang Closes: https://lore.kernel.org/linux-fsdevel/aVN06OKsKxZe6-Kv@casper.infradead.org/T/#t Tested-by: Zhiyu Zhang Acked-by: OGAWA Hirofumi Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- fs/fat/namei_msdos.c | 7 ++++++- fs/fat/namei_vfat.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 0b920ee40a7f..262ec1b790b5 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -325,7 +325,12 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry) err = fat_remove_entries(dir, &sinfo); /* and releases bh */ if (err) goto out; - drop_nlink(dir); + if (dir->i_nlink >= 3) + drop_nlink(dir); + else { + fat_fs_error(sb, "parent dir link count too low (%u)", + dir->i_nlink); + } clear_nlink(inode); fat_truncate_time(inode, NULL, S_CTIME); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 4f3cc2b3089e..8bf5f7a9fd23 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -804,7 +804,12 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) err = fat_remove_entries(dir, &sinfo); /* and releases bh */ if (err) goto out; - drop_nlink(dir); + if (dir->i_nlink >= 3) 
+ drop_nlink(dir); + else { + fat_fs_error(sb, "parent dir link count too low (%u)", + dir->i_nlink); + } clear_nlink(inode); fat_truncate_time(inode, NULL, S_ATIME|S_MTIME); From e8eef69a99f185e75909adb24ab93d706e07bf27 Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Mon, 12 Jan 2026 10:08:53 -0800 Subject: [PATCH 057/107] once: don't use a work queue to reset sleepable static key Pointless overhead to use a work queue to reset the static key for a DO_ONCE_SLEEPABLE() invocation. Note that the previous code path included a BUG_ON() if the static key was already disabled. Dropped that as part of this change because: 1) Use of BUG_ON() is highly discouraged. 2) There is a WARN_ON() in the static_branch_disable() code path that would provide adequate breadcrumbs to debug any issue. Link: https://lkml.kernel.org/r/aWU4tfTju1l3oZCu@agluck-desk3 Signed-off-by: Tony Luck Reported-by: Reinette Chatre Cc: Eric Dumazet Signed-off-by: Andrew Morton --- lib/once.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/once.c b/lib/once.c index 2c306f0e891e..8557eb489f34 100644 --- a/lib/once.c +++ b/lib/once.c @@ -93,6 +93,6 @@ void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, { *done = true; mutex_unlock(&once_mutex); - once_disable_jump(once_key, mod); + static_branch_disable(once_key); } EXPORT_SYMBOL(__do_once_sleepable_done); From 5e65b5ca7d4e1f5d18e03ada94f549086ceb6500 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:38:10 -0500 Subject: [PATCH 058/107] tsacct: skip all kernel threads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch is a preparation step for HPCC, for the OOM killer improvements. I suspect that this patch is useful on its own, because it really makes no sense to sum up accounting statistics of use_mm within kernel threads which are only temporarily using those mm. 
When we hit acct_account_cputime within an irq handler over a kthread that happens to use a userspace mm, we end up summing up the mm's RSS into the tsk acct_rss_mem1, which eventually decays. I don't see a good rationale behind tracking the mm's rss in that way when a kthread uses a userspace mm temporarily through use_mm. It causes issues with init_mm and efi_mm which only partially initialize their mm_struct when introducing the new hierarchical percpu counters to replace RSS counters, which requires a pointer dereference when reading the approximate counter sum. The current percpu counters simply load a zeroed atomic counter, which happen to work. Skip all kernel threads in acct_account_cputime(), not just those that happen to have a NULL mm. This is a preparation step before introducing the hierarchical percpu counters. Link: https://lkml.kernel.org/r/20251224173810.648699-2-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christian König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E.
McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Signed-off-by: Andrew Morton --- kernel/tsacct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 6ea2f6363b90..5c153106e642 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -125,7 +125,7 @@ static void __acct_update_integrals(struct task_struct *tsk, { u64 time, delta; - if (!likely(tsk->mm)) + if (unlikely(!tsk->mm || (tsk->flags & PF_KTHREAD))) return; time = stime + utime; From 4cc67b048459bebb7a60b693044ec83fb853eba1 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Sun, 11 Jan 2026 21:21:57 +0000 Subject: [PATCH 059/107] linux/log2.h: reduce instruction count for is_power_of_2() Follow an observation that (n ^ (n - 1)) will only ever retain the most significant bit set in the word operated on if that is the only bit set in the first place, and use it to determine whether a number is a whole power of 2, avoiding the need for an explicit check for nonzero. This reduces the sequence produced to 3 instructions only across Alpha, MIPS, and RISC-V targets, down from 4, 5, and 4 respectively, removing a branch in the two latter cases. And it's 5 instructions on POWER and x86-64 vs 8 and 9 respectively. There are no branches now emitted here for targets that have a suitable conditional set operation, although an inline expansion will often end with one, depending on what code a call to this function is used in. Credit goes to GCC authors for coming up with this optimisation used as the fallback for (__builtin_popcountl(n) == 1), equivalent to this code, for targets where the hardware population count operation is considered expensive. Link: https://lkml.kernel.org/r/alpine.DEB.2.21.2601111836250.30566@angie.orcam.me.uk Signed-off-by: Maciej W. Rozycki Cc: Jens Axboe Cc: John Garry Cc: "Martin K. 
Petersen" Cc: Su Hui Signed-off-by: Andrew Morton --- include/linux/log2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/log2.h b/include/linux/log2.h index 2eac3fc9303d..e17ceb32e0c9 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -44,7 +44,7 @@ int __ilog2_u64(u64 n) static __always_inline __attribute__((const)) bool is_power_of_2(unsigned long n) { - return (n != 0 && ((n & (n - 1)) == 0)); + return n - 1 < (n ^ (n - 1)); } /** From a906f3ae4423d35c9804c8ec3a0db96ce9b54d44 Mon Sep 17 00:00:00 2001 From: Lillian Berry Date: Sun, 11 Jan 2026 07:56:35 -0500 Subject: [PATCH 060/107] init/main.c: check if rdinit was explicitly set before printing warning The rdinit parameter is set by default, and attempted during boot even if not specified in the command line. Only print the warning about rdinit being inaccessible if the rdinit value was found in command line; it's just noise otherwise. [akpm@linux-foundation.org: move ramdisk_execute_command_set into __initdata] Link: https://lkml.kernel.org/r/20260111125635.53682-1-lillian@star-ark.net Signed-off-by: Lillian Berry Cc: Ahmad Fatoum Cc: Alexander Shishkin Cc: Al Viro Cc: Douglas Anderson Cc: Francesco Valla Cc: Guo Weikang Cc: Huacai Chen Cc: Huan Yang Cc: Ingo Molnar Cc: "Mike Rapoport (Microsoft)" Cc: Sascha Hauer Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- init/main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index b84818ad9685..4773f3bad49c 100644 --- a/init/main.c +++ b/init/main.c @@ -162,6 +162,7 @@ static size_t initargs_offs; static char *execute_command; static char *ramdisk_execute_command = "/init"; +static bool __initdata ramdisk_execute_command_set; /* * Used to generate warnings if static_key manipulation functions are used @@ -623,6 +624,7 @@ static int __init rdinit_setup(char *str) unsigned int i; ramdisk_execute_command = str; + ramdisk_execute_command_set = true; /* See "auto" 
comment in init_setup */ for (i = 1; i < MAX_INIT_ARGS; i++) argv_init[i] = NULL; @@ -1699,8 +1701,9 @@ static noinline void __init kernel_init_freeable(void) int ramdisk_command_access; ramdisk_command_access = init_eaccess(ramdisk_execute_command); if (ramdisk_command_access != 0) { - pr_warn("check access for rdinit=%s failed: %i, ignoring\n", - ramdisk_execute_command, ramdisk_command_access); + if (ramdisk_execute_command_set) + pr_warn("check access for rdinit=%s failed: %i, ignoring\n", + ramdisk_execute_command, ramdisk_command_access); ramdisk_execute_command = NULL; prepare_namespace(); } From 499f86de4f8c34e19a57daf2b6f0cba848e91994 Mon Sep 17 00:00:00 2001 From: Sun Jian Date: Tue, 13 Jan 2026 18:15:32 +0800 Subject: [PATCH 061/107] init/main: read bootconfig header with get_unaligned_le32() get_boot_config_from_initrd() scans up to 3 bytes before initrd_end to handle GRUB 4-byte alignment. As a result, the bootconfig header immediately preceding the magic may be unaligned. Read the size and checksum fields with get_unaligned_le32() instead of casting to u32 * and using le32_to_cpu(), avoiding potential unaligned access and silencing sparse "cast to restricted __le32" warnings. Sparse warnings (gcc + C=1): init/main.c:292:16: warning: cast to restricted __le32 init/main.c:293:16: warning: cast to restricted __le32 No functional change intended. 
Link: https://lkml.kernel.org/r/20260113101532.1630770-1-sun.jian.kdev@gmail.com Signed-off-by: Sun Jian Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- init/main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/init/main.c b/init/main.c index 4773f3bad49c..29741049aa79 100644 --- a/init/main.c +++ b/init/main.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include @@ -270,7 +271,7 @@ static void * __init get_boot_config_from_initrd(size_t *_size) { u32 size, csum; char *data; - u32 *hdr; + u8 *hdr; int i; if (!initrd_end) @@ -289,9 +290,9 @@ static void * __init get_boot_config_from_initrd(size_t *_size) return NULL; found: - hdr = (u32 *)(data - 8); - size = le32_to_cpu(hdr[0]); - csum = le32_to_cpu(hdr[1]); + hdr = (u8 *)(data - 8); + size = get_unaligned_le32(hdr); + csum = get_unaligned_le32(hdr + 4); data = ((void *)hdr) - size; if ((unsigned long)data < initrd_start) { From 3bb83c910971c47989aa439849265600fa67b42a Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 13 Jan 2026 16:22:28 +1100 Subject: [PATCH 062/107] bpf: explicitly align bpf_res_spin_lock Patch series "Align atomic storage", v7. This series adds the __aligned attribute to atomic_t and atomic64_t definitions in include/linux and include/asm-generic (respectively) to get natural alignment of both types on csky, m68k, microblaze, nios2, openrisc and sh. This series also adds Kconfig options to enable a new run-time warning to help reveal misaligned atomic accesses on platforms which don't trap that. The performance impact is expected to vary across platforms and workloads. The measurements I made on m68k show that some workloads run faster and others slower. This patch (of 4): Align bpf_res_spin_lock to avoid a BUILD_BUG_ON() when the alignment changes, as it will do on m68k when, in a subsequent patch, the minimum alignment of the atomic_t member of struct rqspinlock gets increased from 2 to 4. 
Drop the BUILD_BUG_ON() as it becomes redundant. Link: https://lkml.kernel.org/r/cover.1768281748.git.fthain@linux-m68k.org Link: https://lkml.kernel.org/r/8a83876b07d1feacc024521e44059ae89abbb1ea.1768281748.git.fthain@linux-m68k.org Signed-off-by: Finn Thain Acked-by: Alexei Starovoitov Reviewed-by: Arnd Bergmann Cc: Geert Uytterhoeven Cc: Andrii Nakryiko Cc: Ard Biesheuvel Cc: Boqun Feng Cc: "Borislav Petkov (AMD)" Cc: Daniel Borkman Cc: Dinh Nguyen Cc: Eduard Zingerman Cc: Gary Guo Cc: Guo Ren Cc: Hao Luo Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: KP Singh Cc: Marc Rutland Cc: Martin KaFai Lau Cc: Peter Zijlstra Cc: Rich Felker Cc: Sasha Levin (Microsoft) Cc: Song Liu Cc: Stafford Horne Cc: Stanislav Fomichev Cc: Stefan Kristiansson Cc: Thomas Gleixner Cc: Will Deacon Cc: Yonghong Song Cc: Yoshinori Sato Cc: Dave Hansen Signed-off-by: Andrew Morton --- include/asm-generic/rqspinlock.h | 2 +- kernel/bpf/rqspinlock.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 0f2dcbbfee2f..dd36ac96bf66 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -28,7 +28,7 @@ struct rqspinlock { */ struct bpf_res_spin_lock { u32 val; -}; +} __aligned(__alignof__(struct rqspinlock)); struct qspinlock; #ifdef CONFIG_QUEUED_SPINLOCKS diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index f7d0c8d4644e..8d892fb099ac 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -694,7 +694,6 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) int ret; BUILD_BUG_ON(sizeof(rqspinlock_t) != sizeof(struct bpf_res_spin_lock)); - BUILD_BUG_ON(__alignof__(rqspinlock_t) != __alignof__(struct bpf_res_spin_lock)); preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); From e428b013d9dff30f7a65509e33047ba975cce8ba Mon Sep 17 00:00:00 2001 From: Finn 
Thain Date: Tue, 13 Jan 2026 16:22:28 +1100 Subject: [PATCH 063/107] atomic: specify alignment for atomic_t and atomic64_t Some recent commits incorrectly assumed 4-byte alignment of locks. That assumption fails on Linux/m68k (and, interestingly, would have failed on Linux/cris also). The jump label implementation makes a similar alignment assumption. The expectation that atomic_t and atomic64_t variables will be naturally aligned seems reasonable, as indeed they are on 64-bit architectures. But atomic64_t isn't naturally aligned on csky, m68k, microblaze, nios2, openrisc and sh. Neither atomic_t nor atomic64_t are naturally aligned on m68k. This patch brings a little uniformity by specifying natural alignment for atomic types. One benefit is that atomic64_t variables do not get split across a page boundary. The cost is that some structs grow which leads to cache misses and wasted memory. See also, commit bbf2a330d92c ("x86: atomic64: The atomic64_t data type should be 8 bytes aligned on 32-bit too"). Link: https://lkml.kernel.org/r/a76bc24a4e7c1d8112d7d5fa8d14e4b694a0e90c.1768281748.git.fthain@linux-m68k.org Link: https://lore.kernel.org/lkml/CAFr9PX=MYUDGJS2kAvPMkkfvH+0-SwQB_kxE4ea0J_wZ_pk=7w@mail.gmail.com Link: https://lore.kernel.org/lkml/CAMuHMdW7Ab13DdGs2acMQcix5ObJK0O2dG_Fxzr8_g58Rc1_0g@mail.gmail.com/ Signed-off-by: Finn Thain Acked-by: Guo Ren Reviewed-by: Arnd Bergmann Cc: Guo Ren Cc: Geert Uytterhoeven Cc: Dinh Nguyen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Ard Biesheuvel Cc: Boqun Feng Cc: "Borislav Petkov (AMD)" Cc: Daniel Borkman Cc: Dave Hansen Cc: Eduard Zingerman Cc: Gary Guo Cc: Hao Luo Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Marc Rutland Cc: Martin KaFai Lau Cc: Peter Zijlstra Cc: Sasha Levin (Microsoft) Cc: Song Liu Cc: Stanislav Fomichev Cc: Thomas Gleixner Cc: Will Deacon Cc: Yonghong Song Signed-off-by: Andrew Morton --- include/asm-generic/atomic64.h | 2 +- include/linux/types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h index 100d24b02e52..f22ccfc0df98 100644 --- a/include/asm-generic/atomic64.h +++ b/include/asm-generic/atomic64.h @@ -10,7 +10,7 @@ #include typedef struct { - s64 counter; + s64 __aligned(sizeof(s64)) counter; } atomic64_t; #define ATOMIC64_INIT(i) { (i) } diff --git a/include/linux/types.h b/include/linux/types.h index 0cbb684eec5c..f69be881369f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -179,7 +179,7 @@ typedef phys_addr_t resource_size_t; typedef unsigned long irq_hw_number_t; typedef struct { - int counter; + int __aligned(sizeof(int)) counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } From 80047d84eed25e9c92cfb9169980a0dfec110246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Jan 2026 16:22:28 +1100 Subject: [PATCH 064/107] atomic: add alignment check to instrumented atomic operations Add a Kconfig option for debug builds which logs a warning when an instrumented atomic operation takes place that's misaligned. Some platforms don't trap for this. 
[fthain@linux-m68k.org: added __DISABLE_EXPORTS conditional and refactored as helper function] Link: https://lkml.kernel.org/r/51ebf844e006ca0de408f5d3a831e7b39d7fc31c.1768281748.git.fthain@linux-m68k.org Link: https://lore.kernel.org/lkml/20250901093600.GF4067720@noisy.programming.kicks-ass.net/ Link: https://lore.kernel.org/linux-next/df9fbd22-a648-ada4-fee0-68fe4325ff82@linux-m68k.org/ Signed-off-by: Finn Thain Signed-off-by: Peter Zijlstra (Intel) Suggested-by: Geert Uytterhoeven Cc: Sasha Levin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Ard Biesheuvel Cc: "H. Peter Anvin" Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Arnd Bergmann Cc: Boqun Feng Cc: Daniel Borkman Cc: Dinh Nguyen Cc: Eduard Zingerman Cc: Gary Guo Cc: Guo Ren Cc: Hao Luo Cc: Jiri Olsa Cc: John Fastabend Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: KP Singh Cc: Marc Rutland Cc: Martin KaFai Lau Cc: Rich Felker Cc: Song Liu Cc: Stafford Horne Cc: Stanislav Fomichev Cc: Stefan Kristiansson Cc: Will Deacon Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 11 +++++++++++ lib/Kconfig.debug | 10 ++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 711a1f0d1a73..e34b6a557e0a 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -7,6 +7,7 @@ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H +#include #include #include #include @@ -55,6 +56,13 @@ static __always_inline void instrument_read_write(const volatile void *v, size_t kcsan_check_read_write(v, size); } +static __always_inline void instrument_atomic_check_alignment(const volatile void *v, size_t size) +{ +#ifndef __DISABLE_EXPORTS + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ATOMIC) && ((unsigned long)v & (size - 1))); +#endif +} + /** * instrument_atomic_read - instrument atomic read access * @v: address of access @@ -67,6 +75,7 @@ static __always_inline void 
instrument_atomic_read(const volatile void *v, size_ { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); + instrument_atomic_check_alignment(v, size); } /** @@ -81,6 +90,7 @@ static __always_inline void instrument_atomic_write(const volatile void *v, size { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); + instrument_atomic_check_alignment(v, size); } /** @@ -95,6 +105,7 @@ static __always_inline void instrument_atomic_read_write(const volatile void *v, { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); + instrument_atomic_check_alignment(v, size); } /** diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 17d759a04021..9eb685d1ec44 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1360,6 +1360,16 @@ config DEBUG_PREEMPT depending on workload as it triggers debugging routines for each this_cpu operation. It should only be used for debugging purposes. +config DEBUG_ATOMIC + bool "Debug atomic variables" + depends on DEBUG_KERNEL + help + If you say Y here then the kernel will add a runtime alignment check + to atomic accesses. Useful for architectures that do not have trap on + mis-aligned access. + + This option has potentially significant overhead. + menu "Lock Debugging (spinlocks, mutexes, etc...)" config LOCK_DEBUGGING_SUPPORT From 9a229ae249e0a24276901ad6807f31b32124f5c5 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 13 Jan 2026 16:22:28 +1100 Subject: [PATCH 065/107] atomic: add option for weaker alignment check Add a new Kconfig symbol to make CONFIG_DEBUG_ATOMIC more useful on those architectures which do not align dynamic allocations to 8-byte boundaries. Without this, CONFIG_DEBUG_ATOMIC produces excessive WARN splats. 
Link: https://lkml.kernel.org/r/6d25a12934fe9199332f4d65d17c17de450139a8.1768281748.git.fthain@linux-m68k.org Signed-off-by: Finn Thain Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Boqun Feng Cc: "Borislav Petkov (AMD)" Cc: Daniel Borkman Cc: Dave Hansen Cc: Dinh Nguyen Cc: Eduard Zingerman Cc: Gary Guo Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Hao Luo Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Fastabend Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: KP Singh Cc: Marc Rutland Cc: Martin KaFai Lau Cc: Peter Zijlstra Cc: Rich Felker Cc: Sasha Levin (Microsoft) Cc: Song Liu Cc: Stafford Horne Cc: Stanislav Fomichev Cc: Stefan Kristiansson Cc: Thomas Gleixner Cc: Will Deacon Cc: Yonghong Song Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 8 +++++++- lib/Kconfig.debug | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index e34b6a557e0a..a1b4cf81adc2 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -59,7 +59,13 @@ static __always_inline void instrument_read_write(const volatile void *v, size_t static __always_inline void instrument_atomic_check_alignment(const volatile void *v, size_t size) { #ifndef __DISABLE_EXPORTS - WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ATOMIC) && ((unsigned long)v & (size - 1))); + if (IS_ENABLED(CONFIG_DEBUG_ATOMIC)) { + unsigned int mask = size - 1; + + if (IS_ENABLED(CONFIG_DEBUG_ATOMIC_LARGEST_ALIGN)) + mask &= sizeof(struct { long x; } __aligned_largest) - 1; + WARN_ON_ONCE((unsigned long)v & mask); + } #endif } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9eb685d1ec44..7eed3b197ca9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1370,6 +1370,14 @@ config DEBUG_ATOMIC This option has potentially significant overhead. 
+config DEBUG_ATOMIC_LARGEST_ALIGN + bool "Check alignment only up to __aligned_largest" + depends on DEBUG_ATOMIC + help + If you say Y here then the check for natural alignment of + atomic accesses will be constrained to the compiler's largest + alignment for scalar types. + menu "Lock Debugging (spinlocks, mutexes, etc...)" config LOCK_DEBUGGING_SUPPORT From 89802ca36c96b324829996ef05013f82ecc9b68a Mon Sep 17 00:00:00 2001 From: Wangyang Guo Date: Tue, 13 Jan 2026 10:29:58 +0800 Subject: [PATCH 066/107] lib/group_cpus: make group CPU cluster aware As CPU core counts increase, the number of NVMe IRQs may be smaller than the total number of CPUs. This forces multiple CPUs to share the same IRQ. If the IRQ affinity and the CPU's cluster do not align, a performance penalty can be observed on some platforms. This patch improves IRQ affinity by grouping CPUs by cluster within each NUMA domain, ensuring better locality between CPUs and their assigned NVMe IRQs. Details: Intel Xeon E platform packs 4 CPU cores as 1 module (cluster) and share the L2 cache. Let's say, if there are 40 CPUs in 1 NUMA domain and 11 IRQs to dispatch. The existing algorithm will map the first 7 IRQs each with 4 CPUs and the remaining 4 IRQs each with 3 CPUs. The last 4 IRQs may have a cross cluster issue. For example, the 9th IRQ, which is pinned to CPU32, then for CPU31, it will have cross L2 memory access. CPU |28 29 30 31|32 33 34 35|36 ... -------- -------- -------- IRQ 8 9 10 If this patch is applied, then the first 2 IRQs are each mapped with 2 CPUs and the remaining 9 IRQs each mapped with 4 CPUs, which avoids the cross cluster memory access. CPU |00 01 02 03|04 05 06 07|08 09 10 11| ... ----- ----- ----------- ----------- IRQ 1 2 3 4 As a result, 15%+ performance difference is observed in FIO libaio/randread/bs=8k. Changes since V1: - Add more performance details in commit messages. - Fix endless loop when topology_cluster_cpumask returns an invalid mask.
History: v1: https://lore.kernel.org/all/20251024023038.872616-1-wangyang.guo@intel.com/ v1 [RESEND]: https://lore.kernel.org/all/20251111020608.1501543-1-wangyang.guo@intel.com/ Link: https://lkml.kernel.org/r/20260113022958.3379650-1-wangyang.guo@intel.com Signed-off-by: Wangyang Guo Reviewed-by: Tianyou Li Reviewed-by: Tim Chen Tested-by: Dan Liang Cc: Christoph Hellwig Cc: Jens Axboe Cc: Keith Busch Cc: Ming Lei Cc: Radu Rendec Cc: Sagi Grimberg Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- lib/group_cpus.c | 271 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 206 insertions(+), 65 deletions(-) diff --git a/lib/group_cpus.c b/lib/group_cpus.c index 6d08ac05f371..a93df70919df 100644 --- a/lib/group_cpus.c +++ b/lib/group_cpus.c @@ -114,48 +114,15 @@ static int ncpus_cmp_func(const void *l, const void *r) return ln->ncpus - rn->ncpus; } -/* - * Allocate group number for each node, so that for each node: - * - * 1) the allocated number is >= 1 - * - * 2) the allocated number is <= active CPU number of this node - * - * The actual allocated total groups may be less than @numgrps when - * active total CPU number is less than @numgrps. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. 
- */ -static void alloc_nodes_groups(unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_groups *node_groups) +static void alloc_groups_to_nodes(unsigned int numgrps, + unsigned int numcpus, + struct node_groups *node_groups, + unsigned int num_nodes) { - unsigned n, remaining_ncpus = 0; + unsigned int n, remaining_ncpus = numcpus; + unsigned int ngroups, ncpus; - for (n = 0; n < nr_node_ids; n++) { - node_groups[n].id = n; - node_groups[n].ncpus = UINT_MAX; - } - - for_each_node_mask(n, nodemsk) { - unsigned ncpus; - - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - ncpus = cpumask_weight(nmsk); - - if (!ncpus) - continue; - remaining_ncpus += ncpus; - node_groups[n].ncpus = ncpus; - } - - numgrps = min_t(unsigned, remaining_ncpus, numgrps); - - sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + sort(node_groups, num_nodes, sizeof(node_groups[0]), ncpus_cmp_func, NULL); /* @@ -226,9 +193,8 @@ static void alloc_nodes_groups(unsigned int numgrps, * finally for each node X: grps(X) <= ncpu(X). * */ - for (n = 0; n < nr_node_ids; n++) { - unsigned ngroups, ncpus; + for (n = 0; n < num_nodes; n++) { if (node_groups[n].ncpus == UINT_MAX) continue; @@ -246,12 +212,201 @@ static void alloc_nodes_groups(unsigned int numgrps, } } +/* + * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * + * 2) the allocated number is <= active CPU number of this node + * + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. 
+ */ +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) +{ + unsigned int n, numcpus = 0; + + for (n = 0; n < nr_node_ids; n++) { + node_groups[n].id = n; + node_groups[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned int ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus = cpumask_weight(nmsk); + + if (!ncpus) + continue; + numcpus += ncpus; + node_groups[n].ncpus = ncpus; + } + + numgrps = min_t(unsigned int, numcpus, numgrps); + alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids); +} + +static void assign_cpus_to_groups(unsigned int ncpus, + struct cpumask *nmsk, + struct node_groups *nv, + struct cpumask *masks, + unsigned int *curgrp, + unsigned int last_grp) +{ + unsigned int v, cpus_per_grp, extra_grps; + /* Account for rounding errors */ + extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); + + /* Spread allocated groups on CPUs of the current node */ + for (v = 0; v < nv->ngroups; v++, *curgrp += 1) { + cpus_per_grp = ncpus / nv->ngroups; + + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; + } + + /* + * wrapping has to be considered given 'startgrp' + * may start anywhere + */ + if (*curgrp >= last_grp) + *curgrp = 0; + grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp); + } +} + +static int alloc_cluster_groups(unsigned int ncpus, + unsigned int ngroups, + struct cpumask *node_cpumask, + cpumask_var_t msk, + const struct cpumask ***clusters_ptr, + struct node_groups **cluster_groups_ptr) +{ + unsigned int ncluster = 0; + unsigned int cpu, nc, n; + const struct cpumask *cluster_mask; + const struct cpumask **clusters; + struct node_groups *cluster_groups; + + cpumask_copy(msk, node_cpumask); + + /* Probe how many clusters in this node. 
*/ + while (1) { + cpu = cpumask_first(msk); + if (cpu >= nr_cpu_ids) + break; + + cluster_mask = topology_cluster_cpumask(cpu); + if (!cpumask_weight(cluster_mask)) + goto no_cluster; + /* Clean out CPUs on the same cluster. */ + cpumask_andnot(msk, msk, cluster_mask); + ncluster++; + } + + /* If ngroups < ncluster, cross cluster is inevitable, skip. */ + if (ncluster == 0 || ncluster > ngroups) + goto no_cluster; + + /* Allocate memory based on cluster number. */ + clusters = kcalloc(ncluster, sizeof(struct cpumask *), GFP_KERNEL); + if (!clusters) + goto no_cluster; + cluster_groups = kcalloc(ncluster, sizeof(struct node_groups), GFP_KERNEL); + if (!cluster_groups) + goto fail_cluster_groups; + + /* Filling cluster info for later process. */ + cpumask_copy(msk, node_cpumask); + for (n = 0; n < ncluster; n++) { + cpu = cpumask_first(msk); + cluster_mask = topology_cluster_cpumask(cpu); + nc = cpumask_weight_and(cluster_mask, node_cpumask); + clusters[n] = cluster_mask; + cluster_groups[n].id = n; + cluster_groups[n].ncpus = nc; + cpumask_andnot(msk, msk, cluster_mask); + } + + alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster); + + *clusters_ptr = clusters; + *cluster_groups_ptr = cluster_groups; + return ncluster; + + fail_cluster_groups: + kfree(clusters); + no_cluster: + return 0; +} + +/* + * Try group CPUs evenly for cluster locality within a NUMA node. + * + * Return: true if success, false otherwise. 
+ */ +static bool __try_group_cluster_cpus(unsigned int ncpus, + unsigned int ngroups, + struct cpumask *node_cpumask, + struct cpumask *masks, + unsigned int *curgrp, + unsigned int last_grp) +{ + struct node_groups *cluster_groups; + const struct cpumask **clusters; + unsigned int ncluster; + bool ret = false; + cpumask_var_t nmsk; + unsigned int i, nc; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + goto fail_nmsk_alloc; + + ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk, + &clusters, &cluster_groups); + + if (ncluster == 0) + goto fail_no_clusters; + + for (i = 0; i < ncluster; i++) { + struct node_groups *nv = &cluster_groups[i]; + + /* Get the cpus on this cluster. */ + cpumask_and(nmsk, node_cpumask, clusters[nv->id]); + nc = cpumask_weight(nmsk); + if (!nc) + continue; + WARN_ON_ONCE(nv->ngroups > nc); + + assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp); + } + + ret = true; + kfree(cluster_groups); + kfree(clusters); + fail_no_clusters: + free_cpumask_var(nmsk); + fail_nmsk_alloc: + return ret; +} + static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct cpumask *masks) { - unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; + unsigned int i, n, nodes, done = 0; unsigned int last_grp = numgrps; unsigned int curgrp = startgrp; nodemask_t nodemsk = NODE_MASK_NONE; @@ -287,7 +442,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, nodemsk, nmsk, node_groups); for (i = 0; i < nr_node_ids; i++) { - unsigned int ncpus, v; + unsigned int ncpus; struct node_groups *nv = &node_groups[i]; if (nv->ngroups == UINT_MAX) @@ -301,28 +456,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, WARN_ON_ONCE(nv->ngroups > ncpus); - /* Account for rounding errors */ - extra_grps = ncpus - nv->ngroups * (ncpus / 
nv->ngroups); - - /* Spread allocated groups on CPUs of the current node */ - for (v = 0; v < nv->ngroups; v++, curgrp++) { - cpus_per_grp = ncpus / nv->ngroups; - - /* Account for extra groups to compensate rounding errors */ - if (extra_grps) { - cpus_per_grp++; - --extra_grps; - } - - /* - * wrapping has to be considered given 'startgrp' - * may start anywhere - */ - if (curgrp >= last_grp) - curgrp = 0; - grp_spread_init_one(&masks[curgrp], nmsk, - cpus_per_grp); + if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk, + masks, &curgrp, last_grp)) { + done += nv->ngroups; + continue; } + + assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp, + last_grp); done += nv->ngroups; } kfree(node_groups); From 08e8f1ef3df270daef4ffc9c4bb15669f72d5d2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 14 Jan 2026 22:47:56 -0800 Subject: [PATCH 067/107] kernel-chktaint: add reporting for tainted modules Check all loaded modules and report any that have their 'taint' flags set. The tainted module output format is: * () Example output: Kernel is "tainted" for the following reasons: * externally-built ('out-of-tree') module was loaded (#12) * unsigned module was loaded (#13) Raw taint value as int/string: 12288/'G OE ' Tainted modules: * dump_test (OE) Link: https://lkml.kernel.org/r/20260115064756.531592-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Thorsten Leemhuis Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- tools/debugging/kernel-chktaint | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index e7da0909d097..e1571c04afb5 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -211,9 +211,25 @@ else addout "J" echo " * fwctl's mutating debug interface was used (#19)" fi +echo "Raw taint value as int/string: $taint/'$out'" +# report on any tainted loadable modules +[ "$1" = "" ] && [ -r /sys/module/ ] && \ + 
cnt=`grep [A-Z] /sys/module/*/taint | wc -l` || cnt=0 + +if [ $cnt -ne 0 ]; then + echo + echo "Tainted modules:" + for dir in `ls /sys/module` ; do + if [ -r /sys/module/$dir/taint ]; then + modtnt=`cat /sys/module/$dir/taint` + [ "$modtnt" = "" ] || echo " * $dir ($modtnt)" + fi + done +fi + +echo echo "For a more detailed explanation of the various taint flags see" echo " Documentation/admin-guide/tainted-kernels.rst in the Linux kernel sources" echo " or https://kernel.org/doc/html/latest/admin-guide/tainted-kernels.html" -echo "Raw taint value as int/string: $taint/'$out'" #EOF# From 6ca9de3600f482b74723dc13b5e345e4bc3fb3fa Mon Sep 17 00:00:00 2001 From: "Pratyush Yadav (Google)" Date: Fri, 16 Jan 2026 16:54:11 +0000 Subject: [PATCH 068/107] kho: print which scratch buffer failed to be reserved When scratch area fails to reserve, KHO prints a message indicating that. But it doesn't say which scratch failed to allocate. This can be useful information for debugging. Even more so when the failure is hard to reproduce. Along with the current message, also print which exact scratch area failed to be reserved. 
Link: https://lkml.kernel.org/r/20260116165416.1262531-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav (Google) Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: David Matlack Cc: Pasha Tatashin Cc: Pratyush Yadav Cc: Samiullah Khawaja Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index fbe109a0d858..b0be06c41d92 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -637,8 +637,10 @@ static void __init kho_reserve_scratch(void) kho_scratch_cnt = num_online_nodes() + 2; size = kho_scratch_cnt * sizeof(*kho_scratch); kho_scratch = memblock_alloc(size, PAGE_SIZE); - if (!kho_scratch) + if (!kho_scratch) { + pr_err("Failed to reserve scratch array\n"); goto err_disable_kho; + } /* * reserve scratch area in low memory for lowmem allocations in the @@ -647,8 +649,10 @@ static void __init kho_reserve_scratch(void) size = scratch_size_lowmem; addr = memblock_phys_alloc_range(size, CMA_MIN_ALIGNMENT_BYTES, 0, ARCH_LOW_ADDRESS_LIMIT); - if (!addr) + if (!addr) { + pr_err("Failed to reserve lowmem scratch buffer\n"); goto err_free_scratch_desc; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -657,8 +661,10 @@ static void __init kho_reserve_scratch(void) /* reserve large contiguous area for allocations without nid */ size = scratch_size_global; addr = memblock_phys_alloc(size, CMA_MIN_ALIGNMENT_BYTES); - if (!addr) + if (!addr) { + pr_err("Failed to reserve global scratch buffer\n"); goto err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; @@ -669,8 +675,10 @@ static void __init kho_reserve_scratch(void) addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid, true); - if (!addr) + if (!addr) { + pr_err("Failed to reserve nid %d scratch buffer\n", nid); goto 
err_free_scratch_areas; + } kho_scratch[i].addr = addr; kho_scratch[i].size = size; From 931d5c36c7369b65adb9e3d197a8d3a8a913db8c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 16 Jan 2026 09:42:52 -0800 Subject: [PATCH 069/107] checkpatch: add an invalid patch separator test Some versions of tools that apply patches incorrectly allow lines that start with 3 dashes and have additional content on the same line. Checkpatch will now emit an ERROR on these lines and optionally convert those lines from dashes to equals with --fix. Link: https://lkml.kernel.org/r/6ec1ed08328340db42655287afd5fa4067316b11.camel@perches.com Signed-off-by: Joe Perches Suggested-by: Ian Rogers Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Kuan-Wei Chiu Cc: Lukas Bulwahn Cc: Namhyung kim Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 5 +++++ scripts/checkpatch.pl | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index deb3f67a633c..baf0b42ebba9 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -601,6 +601,11 @@ Commit message See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes + **BAD_COMMIT_SEPARATOR** + The commit separator is a single line with 3 dashes. + The regex match is '^---$' + Lines that start with 3 dashes and have more content on the same line + may confuse tools that apply patches. Comparison style ---------------- diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c0250244cf7a..3932f07e6ada 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3031,6 +3031,16 @@ sub process { } } +# Check for invalid patch separator + if ($in_commit_log && + $line =~ /^---.+/) { + if (ERROR("BAD_COMMIT_SEPARATOR", + "Invalid commit separator - some tools may have problems applying this\n" . 
$herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/-/=/g; + } + } + # Check for patch separator if ($line =~ /^---$/) { $has_patch_separator = 1; From 840fe43d371fc59ef2da6b6bb88a4d480eed9a38 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Fri, 16 Jan 2026 11:22:14 +0000 Subject: [PATCH 070/107] kho: use unsigned long for nr_pages Patch series "kho: clean up page initialization logic", v2. This series simplifies the page initialization logic in kho_restore_page(). It was originally only a single patch [0], but on Pasha's suggestion, I added another patch to use unsigned long for nr_pages. Technically speaking, the patches aren't related and can be applied independently, but bundling them together since patch 2 relies on 1 and it is easier to manage them this way. This patch (of 2): With 4k pages, a 32-bit nr_pages can span up to 16 TiB. While it is a lot, there exist systems with terabytes of RAM. gup is also moving to using long for nr_pages. Use unsigned long and make KHO future-proof. 
Link: https://lkml.kernel.org/r/20260116112217.915803-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20260116112217.915803-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav Suggested-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Alexander Graf Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 6 +++--- kernel/liveupdate/kexec_handover.c | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index a56ff3ffaf17..ac4129d1d741 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -22,15 +22,15 @@ bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); void kho_unpreserve_folio(struct folio *folio); -int kho_preserve_pages(struct page *page, unsigned int nr_pages); -void kho_unpreserve_pages(struct page *page, unsigned int nr_pages); +int kho_preserve_pages(struct page *page, unsigned long nr_pages); +void kho_unpreserve_pages(struct page *page, unsigned long nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); void *kho_alloc_preserve(size_t size); void kho_unpreserve_free(void *mem); void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); -struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); +struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); int kho_add_subtree(const char *name, void *fdt); void kho_remove_subtree(void *fdt); diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index e44fd7ceff2e..56cc1aad5c5c 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -219,7 +219,8 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long 
pfn, static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned int nr_pages, ref_cnt; + unsigned long nr_pages; + unsigned int ref_cnt; union kho_page_info info; if (!page) @@ -246,7 +247,7 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) * count of 1 */ ref_cnt = is_folio ? 0 : 1; - for (unsigned int i = 1; i < nr_pages; i++) + for (unsigned long i = 1; i < nr_pages; i++) set_page_count(page + i, ref_cnt); if (is_folio && info.order) @@ -288,7 +289,7 @@ EXPORT_SYMBOL_GPL(kho_restore_folio); * * Return: 0 on success, error code on failure */ -struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages) +struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) { const unsigned long start_pfn = PHYS_PFN(phys); const unsigned long end_pfn = start_pfn + nr_pages; @@ -837,7 +838,7 @@ EXPORT_SYMBOL_GPL(kho_unpreserve_folio); * * Return: 0 on success, error code on failure */ -int kho_preserve_pages(struct page *page, unsigned int nr_pages) +int kho_preserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); @@ -881,7 +882,7 @@ EXPORT_SYMBOL_GPL(kho_preserve_pages); * kho_preserve_pages() call. Unpreserving arbitrary sub-ranges of larger * preserved blocks is not supported. */ -void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +void kho_unpreserve_pages(struct page *page, unsigned long nr_pages) { struct kho_mem_track *track = &kho_out.track; const unsigned long start_pfn = page_to_pfn(page); From 8f1081892d6218d23bf8afb4246217c41f5a9b21 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Fri, 16 Jan 2026 11:22:15 +0000 Subject: [PATCH 071/107] kho: simplify page initialization in kho_restore_page() When restoring a page (from kho_restore_pages()) or folio (from kho_restore_folio()), KHO must initialize the struct page. 
The initialization differs slightly depending on if a folio is requested or a set of 0-order pages is requested. Conceptually, it is quite simple to understand. When restoring 0-order pages, each page gets a refcount of 1 and that's it. When restoring a folio, head page gets a refcount of 1 and tail pages get 0. kho_restore_page() tries to combine the two separate initialization flow into one piece of code. While it works fine, it is more complicated to read than it needs to be. Make the code simpler by splitting the two initialization paths into two separate functions. This improves readability by clearly showing how each type must be initialized. Link: https://lkml.kernel.org/r/20260116112217.915803-3-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Alexander Graf Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 40 +++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 56cc1aad5c5c..fbfa5a04faed 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -216,11 +216,32 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } +/* For physically contiguous 0-order pages. */ +static void kho_init_pages(struct page *page, unsigned long nr_pages) +{ + for (unsigned long i = 0; i < nr_pages; i++) + set_page_count(page + i, 1); +} + +static void kho_init_folio(struct page *page, unsigned int order) +{ + unsigned long nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. 
*/ + for (unsigned long i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); +} + static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); unsigned long nr_pages; - unsigned int ref_cnt; union kho_page_info info; if (!page) @@ -238,20 +259,11 @@ static struct page *kho_restore_page(phys_addr_t phys, bool is_folio) /* Clear private to make sure later restores on this page error out. */ page->private = 0; - /* Head page gets refcount of 1. */ - set_page_count(page, 1); - /* - * For higher order folios, tail pages get a page count of zero. - * For physically contiguous order-0 pages every pages gets a page - * count of 1 - */ - ref_cnt = is_folio ? 0 : 1; - for (unsigned long i = 1; i < nr_pages; i++) - set_page_count(page + i, ref_cnt); - - if (is_folio && info.order) - prep_compound_page(page, info.order); + if (is_folio) + kho_init_folio(page, info.order); + else + kho_init_pages(page, nr_pages); /* Always mark headpage's codetag as empty to avoid accounting mismatch */ clear_page_tag_ref(page); From e8d899d301346a5591c9d1af06c3c9b3501cf84b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 16 Jan 2026 16:26:27 -0700 Subject: [PATCH 072/107] compiler-clang.h: require LLVM 19.1.0 or higher for __typeof_unqual__ When building the kernel using a version of LLVM between llvmorg-19-init (the first commit of the LLVM 19 development cycle) and the change in LLVM that actually added __typeof_unqual__ for all C modes [1], which might happen during a bisect of LLVM, there is a build failure: In file included from arch/x86/kernel/asm-offsets.c:9: In file included from include/linux/crypto.h:15: In file included from include/linux/completion.h:12: In file included from include/linux/swait.h:7: In file included from include/linux/spinlock.h:56: In file included from include/linux/preempt.h:79: arch/x86/include/asm/preempt.h:61:2: error: call 
to undeclared function '__typeof_unqual__'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 61 | raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED); | ^ arch/x86/include/asm/percpu.h:478:36: note: expanded from macro 'raw_cpu_and_4' 478 | #define raw_cpu_and_4(pcp, val) percpu_binary_op(4, , "and", (pcp), val) | ^ arch/x86/include/asm/percpu.h:210:3: note: expanded from macro 'percpu_binary_op' 210 | TYPEOF_UNQUAL(_var) pto_tmp__; \ | ^ include/linux/compiler.h:248:29: note: expanded from macro 'TYPEOF_UNQUAL' 248 | # define TYPEOF_UNQUAL(exp) __typeof_unqual__(exp) | ^ The current logic of CC_HAS_TYPEOF_UNQUAL just checks for a major version of 19 but half of the 19 development cycle did not have support for __typeof_unqual__. Harden the logic of CC_HAS_TYPEOF_UNQUAL to avoid this error by only using __typeof_unqual__ with a released version of LLVM 19, which is greater than or equal to 19.1.0 with LLVM's versioning scheme that matches GCC's [2]. Link: https://github.com/llvm/llvm-project/commit/cc308f60d41744b5920ec2e2e5b25e1273c8704b [1] Link: https://github.com/llvm/llvm-project/commit/4532617ae420056bf32f6403dde07fb99d276a49 [2] Link: https://lkml.kernel.org/r/20260116-require-llvm-19-1-for-typeof_unqual-v1-1-3b9a4a4b212b@kernel.org Fixes: ac053946f5c4 ("compiler.h: introduce TYPEOF_UNQUAL() macro") Signed-off-by: Nathan Chancellor Cc: Bill Wendling Cc: Justin Stitt Cc: Uros Bizjak Cc: Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 7edf1a07b535..e1123dd28486 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -153,4 +153,4 @@ * Bindgen uses LLVM even if our C compiler is GCC, so we cannot * rely on the auto-detected CONFIG_CC_HAS_TYPEOF_UNQUAL. 
*/ -#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ >= 19) +#define CC_HAS_TYPEOF_UNQUAL (__clang_major__ > 19 || (__clang_major__ == 19 && __clang_minor__ > 0)) From f2e0abdc88ce68cdba0a66ccc05a3e96b688a2c7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 15 Jan 2026 23:25:04 -0500 Subject: [PATCH 073/107] kernel.h: drop STACK_MAGIC macro Patch series "Unload linux/kernel.h", v5. kernel.h hosts declarations that can be placed better. This series decouples kernel.h with some explicit and implicit dependencies; also, moves tracing functionality to a new independent header. This patch (of 6): The macro was introduced in 1994, v1.0.4, for stacks protection. Since that, people found better ways to protect stacks, and now the macro is only used by i915 selftests. Move it to a local header and drop from the kernel.h. Link: https://lkml.kernel.org/r/20260116042510.241009-1-ynorov@nvidia.com Link: https://lkml.kernel.org/r/20260116042510.241009-2-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Acked-by: Randy Dunlap Acked-by: Jani Nikula Reviewed-by: Christophe Leroy (CS GROUP) Reviewed-by: Aaron Tomlin Reviewed-by: Andi Shyti Reviewed-by: Joel Fernandes Cc: Greg Kroah-Hartman Cc: Petr Pavlu Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gt/selftest_ring_submission.c | 1 + drivers/gpu/drm/i915/i915_selftest.h | 2 ++ include/linux/kernel.h | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/selftest_ring_submission.c b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c index 87ceb0f374b6..600333ae6c8c 100644 --- a/drivers/gpu/drm/i915/gt/selftest_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/selftest_ring_submission.c @@ -3,6 +3,7 @@ * Copyright © 2020 Intel Corporation */ +#include "i915_selftest.h" #include "intel_engine_pm.h" #include "selftests/igt_flush_test.h" diff --git a/drivers/gpu/drm/i915/i915_selftest.h b/drivers/gpu/drm/i915/i915_selftest.h index 
bdf3e22c0a34..72922028f4ba 100644 --- a/drivers/gpu/drm/i915/i915_selftest.h +++ b/drivers/gpu/drm/i915/i915_selftest.h @@ -26,6 +26,8 @@ #include +#define STACK_MAGIC 0xdeadbeef + struct pci_dev; struct drm_i915_private; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 35b8f2a5aca5..cefe733a0c10 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -39,8 +39,6 @@ #include -#define STACK_MAGIC 0xdeadbeef - struct completion; struct user; From 25b66674b1036c1eb3069bf62329a9c60850d782 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 15 Jan 2026 23:25:05 -0500 Subject: [PATCH 074/107] moduleparam: include required headers explicitly The following patch drops moduleparam.h dependency on kernel.h. In preparation to it, list all the required headers explicitly. Link: https://lkml.kernel.org/r/20260116042510.241009-3-ynorov@nvidia.com Signed-off-by: Yury Norov Suggested-by: Petr Pavlu Reviewed-by: Petr Pavlu Reviewed-by: Andy Shevchenko Reviewed-by: Joel Fernandes Cc: Aaron Tomlin Cc: Andi Shyti Cc: Christophe Leroy (CS GROUP) Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Randy Dunlap Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/moduleparam.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 915f32f7d888..03a977168c52 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -2,9 +2,14 @@ #ifndef _LINUX_MODULE_PARAMS_H #define _LINUX_MODULE_PARAMS_H /* (C) Copyright 2001, 2002 Rusty Russell IBM Corporation */ + +#include +#include +#include #include #include #include +#include /* * The maximum module name length, including the NUL byte. From 90ddd39b881df74b14918cee031154f6ddb7af33 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 15 Jan 2026 23:25:06 -0500 Subject: [PATCH 075/107] kernel.h: move VERIFY_OCTAL_PERMISSIONS() to sysfs.h The macro is related to sysfs, but is defined in kernel.h. 
Move it to the proper header, and unload the generic kernel.h. Now that the macro is removed from kernel.h, linux/moduleparam.h is decoupled, and kernel.h inclusion can be removed. Link: https://lkml.kernel.org/r/20260116042510.241009-4-ynorov@nvidia.com Signed-off-by: Yury Norov Acked-by: Randy Dunlap Tested-by: Randy Dunlap Reviewed-by: Andy Shevchenko Reviewed-by: Petr Pavlu Acked-by: Greg Kroah-Hartman Reviewed-by: Joel Fernandes Cc: Aaron Tomlin Cc: Andi Shyti Cc: Christophe Leroy (CS GROUP) Cc: Jani Nikula Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/filesystems/sysfs.rst | 2 +- include/linux/kernel.h | 12 ------------ include/linux/moduleparam.h | 2 +- include/linux/sysfs.h | 13 +++++++++++++ 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Documentation/filesystems/sysfs.rst b/Documentation/filesystems/sysfs.rst index 2703c04af7d0..ffcef4d6bc8d 100644 --- a/Documentation/filesystems/sysfs.rst +++ b/Documentation/filesystems/sysfs.rst @@ -120,7 +120,7 @@ is equivalent to doing:: .store = store_foo, }; -Note as stated in include/linux/kernel.h "OTHER_WRITABLE? Generally +Note as stated in include/linux/sysfs.h "OTHER_WRITABLE? Generally considered a bad idea." so trying to set a sysfs file writable for everyone will fail reverting to RO mode for "Others". diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cefe733a0c10..09850b26061c 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -388,16 +388,4 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } # define REBUILD_DUE_TO_DYNAMIC_FTRACE #endif -/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? 
*/ -#define VERIFY_OCTAL_PERMISSIONS(perms) \ - (BUILD_BUG_ON_ZERO((perms) < 0) + \ - BUILD_BUG_ON_ZERO((perms) > 0777) + \ - /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ - BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ - BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ - /* USER_WRITABLE >= GROUP_WRITABLE */ \ - BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ - /* OTHER_WRITABLE? Generally considered a bad idea. */ \ - BUILD_BUG_ON_ZERO((perms) & 2) + \ - (perms)) #endif diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 03a977168c52..281a006dc284 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index c33a96b7391a..99b775f3ff46 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -808,4 +808,17 @@ static inline void sysfs_put(struct kernfs_node *kn) kernfs_put(kn); } +/* Permissions on a sysfs file: you didn't miss the 0 prefix did you? */ +#define VERIFY_OCTAL_PERMISSIONS(perms) \ + (BUILD_BUG_ON_ZERO((perms) < 0) + \ + BUILD_BUG_ON_ZERO((perms) > 0777) + \ + /* USER_READABLE >= GROUP_READABLE >= OTHER_READABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 4) < (((perms) >> 3) & 4)) + \ + BUILD_BUG_ON_ZERO((((perms) >> 3) & 4) < ((perms) & 4)) + \ + /* USER_WRITABLE >= GROUP_WRITABLE */ \ + BUILD_BUG_ON_ZERO((((perms) >> 6) & 2) < (((perms) >> 3) & 2)) + \ + /* OTHER_WRITABLE? Generally considered a bad idea. 
*/ \ + BUILD_BUG_ON_ZERO((perms) & 2) + \ + (perms)) + #endif /* _SYSFS_H_ */ From 269586d68994ca307ded058255e243692e3bf753 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 15 Jan 2026 23:25:07 -0500 Subject: [PATCH 076/107] kernel.h: include linux/instruction_pointer.h explicitly In preparation for decoupling linux/instruction_pointer.h and linux/kernel.h, include instruction_pointer.h explicitly where needed. Link: https://lkml.kernel.org/r/20260116042510.241009-5-ynorov@nvidia.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Reviewed-by: Joel Fernandes Cc: Aaron Tomlin Cc: Andi Shyti Cc: Christophe Leroy (CS GROUP) Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Petr Pavlu Cc: Randy Dunlap Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- arch/s390/include/asm/processor.h | 1 + include/linux/ww_mutex.h | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 3affba95845b..cc187afa07b3 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 45ff6f7a872b..9b30fa2ec508 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -17,6 +17,7 @@ #ifndef __LINUX_WW_MUTEX_H #define __LINUX_WW_MUTEX_H +#include #include #include From 86e685ff364394b477cd1c476029480a2a1960c5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 15 Jan 2026 23:25:08 -0500 Subject: [PATCH 077/107] tracing: remove size parameter in __trace_puts() The __trace_puts() function takes a string pointer and the size of the string itself. All users currently simply pass in the strlen() of the string it is also passing in. There's no reason to pass in the size. Instead have the __trace_puts() function do the strlen() within the function itself. 
This fixes a header recursion issue where using strlen() in the macro calling __trace_puts() requires adding #include in order to use strlen(). Removing the use of strlen() from the header fixes the recursion issue. Link: https://lore.kernel.org/all/aUN8Hm377C5A0ILX@yury/ Link: https://lkml.kernel.org/r/20260116042510.241009-6-ynorov@nvidia.com Signed-off-by: Steven Rostedt (Google) Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Reviewed-by: Joel Fernandes Cc: Aaron Tomlin Cc: Andi Shyti Cc: Christophe Leroy (CS GROUP) Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Petr Pavlu Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/kernel.h | 4 ++-- kernel/trace/trace.c | 7 +++---- kernel/trace/trace.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 09850b26061c..5838c419ed37 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -328,10 +328,10 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); if (__builtin_constant_p(str)) \ __trace_bputs(_THIS_IP_, trace_printk_fmt); \ else \ - __trace_puts(_THIS_IP_, str, strlen(str)); \ + __trace_puts(_THIS_IP_, str); \ }) extern int __trace_bputs(unsigned long ip, const char *str); -extern int __trace_puts(unsigned long ip, const char *str, int size); +extern int __trace_puts(unsigned long ip, const char *str); extern void trace_dump_stack(int skip); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index baec63134ab6..e18005807395 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1178,11 +1178,10 @@ EXPORT_SYMBOL_GPL(__trace_array_puts); * __trace_puts - write a constant string into the trace buffer. * @ip: The address of the caller * @str: The constant string to write - * @size: The size of the string. 
*/ -int __trace_puts(unsigned long ip, const char *str, int size) +int __trace_puts(unsigned long ip, const char *str) { - return __trace_array_puts(printk_trace, ip, str, size); + return __trace_array_puts(printk_trace, ip, str, strlen(str)); } EXPORT_SYMBOL_GPL(__trace_puts); @@ -1201,7 +1200,7 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); if (!printk_binsafe(tr)) - return __trace_puts(ip, str, strlen(str)); + return __trace_puts(ip, str); if (!(tr->trace_flags & TRACE_ITER(PRINTK))) return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b6d42fe06115..de4e6713b84e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2116,7 +2116,7 @@ extern void tracing_log_err(struct trace_array *tr, * about performance). The internal_trace_puts() is for such * a purpose. */ -#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ From bec261fec6d41318e414c4064f2b67c6db628acd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 15 Jan 2026 23:25:09 -0500 Subject: [PATCH 078/107] tracing: move tracing declarations from kernel.h to a dedicated header Tracing is a half of the kernel.h in terms of LOCs, although it's a self-consistent part. It is intended for quick debugging purposes and isn't used by the normal tracing utilities. Move it to a separate header. If someone needs to just throw a trace_printk() in their driver, they will not have to pull all the heavy tracing machinery. This is a pure move. 
Link: https://lkml.kernel.org/r/20260116042510.241009-7-ynorov@nvidia.com Signed-off-by: Yury Norov Acked-by: Steven Rostedt Reviewed-by: Andy Shevchenko Reviewed-by: Joel Fernandes Cc: Aaron Tomlin Cc: Andi Shyti Cc: Christophe Leroy (CS GROUP) Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Petr Pavlu Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/kernel.h | 196 +-------------------------------- include/linux/trace_printk.h | 204 +++++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+), 195 deletions(-) create mode 100644 include/linux/trace_printk.h diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5838c419ed37..e5570a16cbb1 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include @@ -189,200 +189,6 @@ enum system_states { }; extern enum system_states system_state; -/* - * General tracing related utility functions - trace_printk(), - * tracing_on/tracing_off and tracing_start()/tracing_stop - * - * Use tracing_on/tracing_off when you want to quickly turn on or off - * tracing. It simply enables or disables the recording of the trace events. - * This also corresponds to the user space /sys/kernel/tracing/tracing_on - * file, which gives a means for the kernel and userspace to interact. - * Place a tracing_off() in the kernel where you want tracing to end. - * From user space, examine the trace, and then echo 1 > tracing_on - * to continue tracing. - * - * tracing_stop/tracing_start has slightly more overhead. It is used - * by things like suspend to ram where disabling the recording of the - * trace is not enough, but tracing must actually stop because things - * like calling smp_processor_id() may crash the system. - * - * Most likely, you want to use tracing_on/tracing_off. 
- */ - -enum ftrace_dump_mode { - DUMP_NONE, - DUMP_ALL, - DUMP_ORIG, - DUMP_PARAM, -}; - -#ifdef CONFIG_TRACING -void tracing_on(void); -void tracing_off(void); -int tracing_is_on(void); -void tracing_snapshot(void); -void tracing_snapshot_alloc(void); - -extern void tracing_start(void); -extern void tracing_stop(void); - -static inline __printf(1, 2) -void ____trace_printk_check_format(const char *fmt, ...) -{ -} -#define __trace_printk_check_format(fmt, args...) \ -do { \ - if (0) \ - ____trace_printk_check_format(fmt, ##args); \ -} while (0) - -/** - * trace_printk - printf formatting in the ftrace buffer - * @fmt: the printf format for printing - * - * Note: __trace_printk is an internal function for trace_printk() and - * the @ip is passed in via the trace_printk() macro. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_printks scattered around in - * your code. (Extra memory is used for special buffers that are - * allocated when trace_printk() is used.) - * - * A little optimization trick is done here. If there's only one - * argument, there's no need to scan the string for printf formats. - * The trace_puts() will suffice. But how can we take advantage of - * using trace_puts() when trace_printk() has only one argument? - * By stringifying the args and checking the size we can tell - * whether or not there are args. __stringify((__VA_ARGS__)) will - * turn into "()\0" with a size of 3 when there are no args, anything - * else will be bigger. All we need to do is define a string to this, - * and then take its size and compare to 3. If it's bigger, use - * do_trace_printk() otherwise, optimize it to trace_puts(). Then just - * let gcc optimize the rest. 
- */ - -#define trace_printk(fmt, ...) \ -do { \ - char _______STR[] = __stringify((__VA_ARGS__)); \ - if (sizeof(_______STR) > 3) \ - do_trace_printk(fmt, ##__VA_ARGS__); \ - else \ - trace_puts(fmt); \ -} while (0) - -#define do_trace_printk(fmt, args...) \ -do { \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_printk_check_format(fmt, ##args); \ - \ - if (__builtin_constant_p(fmt)) \ - __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ - else \ - __trace_printk(_THIS_IP_, fmt, ##args); \ -} while (0) - -extern __printf(2, 3) -int __trace_bprintk(unsigned long ip, const char *fmt, ...); - -extern __printf(2, 3) -int __trace_printk(unsigned long ip, const char *fmt, ...); - -/** - * trace_puts - write a string into the ftrace buffer - * @str: the string to record - * - * Note: __trace_bputs is an internal function for trace_puts and - * the @ip is passed in via the trace_puts macro. - * - * This is similar to trace_printk() but is made for those really fast - * paths that a developer wants the least amount of "Heisenbug" effects, - * where the processing of the print format is still too much. - * - * This function allows a kernel developer to debug fast path sections - * that printk is not appropriate for. By scattering in various - * printk like tracing in the code, a developer can quickly see - * where problems are occurring. - * - * This is intended as a debugging tool for the developer only. - * Please refrain from leaving trace_puts scattered around in - * your code. (Extra memory is used for special buffers that are - * allocated when trace_puts() is used.) - * - * Returns: 0 if nothing was written, positive # if string was. - * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) - */ - -#define trace_puts(str) ({ \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(str) ? 
str : NULL; \ - \ - if (__builtin_constant_p(str)) \ - __trace_bputs(_THIS_IP_, trace_printk_fmt); \ - else \ - __trace_puts(_THIS_IP_, str); \ -}) -extern int __trace_bputs(unsigned long ip, const char *str); -extern int __trace_puts(unsigned long ip, const char *str); - -extern void trace_dump_stack(int skip); - -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement. - */ -#define ftrace_vprintk(fmt, vargs) \ -do { \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt __used \ - __section("__trace_printk_fmt") = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ - } else \ - __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ -} while (0) - -extern __printf(2, 0) int -__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); - -extern __printf(2, 0) int -__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); - -extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); -#else -static inline void tracing_start(void) { } -static inline void tracing_stop(void) { } -static inline void trace_dump_stack(int skip) { } - -static inline void tracing_on(void) { } -static inline void tracing_off(void) { } -static inline int tracing_is_on(void) { return 0; } -static inline void tracing_snapshot(void) { } -static inline void tracing_snapshot_alloc(void) { } - -static inline __printf(1, 2) -int trace_printk(const char *fmt, ...) 
-{ - return 0; -} -static __printf(1, 0) inline int -ftrace_vprintk(const char *fmt, va_list ap) -{ - return 0; -} -static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } -#endif /* CONFIG_TRACING */ - /* Rebuild everything on CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE # define REBUILD_DUE_TO_DYNAMIC_FTRACE diff --git a/include/linux/trace_printk.h b/include/linux/trace_printk.h new file mode 100644 index 000000000000..bb5874097f24 --- /dev/null +++ b/include/linux/trace_printk.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_PRINTK_H +#define _LINUX_TRACE_PRINTK_H + +#include +#include +#include +#include + +/* + * General tracing related utility functions - trace_printk(), + * tracing_on/tracing_off and tracing_start()/tracing_stop + * + * Use tracing_on/tracing_off when you want to quickly turn on or off + * tracing. It simply enables or disables the recording of the trace events. + * This also corresponds to the user space /sys/kernel/tracing/tracing_on + * file, which gives a means for the kernel and userspace to interact. + * Place a tracing_off() in the kernel where you want tracing to end. + * From user space, examine the trace, and then echo 1 > tracing_on + * to continue tracing. + * + * tracing_stop/tracing_start has slightly more overhead. It is used + * by things like suspend to ram where disabling the recording of the + * trace is not enough, but tracing must actually stop because things + * like calling smp_processor_id() may crash the system. + * + * Most likely, you want to use tracing_on/tracing_off. 
+ */ + +enum ftrace_dump_mode { + DUMP_NONE, + DUMP_ALL, + DUMP_ORIG, + DUMP_PARAM, +}; + +#ifdef CONFIG_TRACING +void tracing_on(void); +void tracing_off(void); +int tracing_is_on(void); +void tracing_snapshot(void); +void tracing_snapshot_alloc(void); + +extern void tracing_start(void); +extern void tracing_stop(void); + +static inline __printf(1, 2) +void ____trace_printk_check_format(const char *fmt, ...) +{ +} +#define __trace_printk_check_format(fmt, args...) \ +do { \ + if (0) \ + ____trace_printk_check_format(fmt, ##args); \ +} while (0) + +/** + * trace_printk - printf formatting in the ftrace buffer + * @fmt: the printf format for printing + * + * Note: __trace_printk is an internal function for trace_printk() and + * the @ip is passed in via the trace_printk() macro. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_printks scattered around in + * your code. (Extra memory is used for special buffers that are + * allocated when trace_printk() is used.) + * + * A little optimization trick is done here. If there's only one + * argument, there's no need to scan the string for printf formats. + * The trace_puts() will suffice. But how can we take advantage of + * using trace_puts() when trace_printk() has only one argument? + * By stringifying the args and checking the size we can tell + * whether or not there are args. __stringify((__VA_ARGS__)) will + * turn into "()\0" with a size of 3 when there are no args, anything + * else will be bigger. All we need to do is define a string to this, + * and then take its size and compare to 3. If it's bigger, use + * do_trace_printk() otherwise, optimize it to trace_puts(). Then just + * let gcc optimize the rest. 
+ */ + +#define trace_printk(fmt, ...) \ +do { \ + char _______STR[] = __stringify((__VA_ARGS__)); \ + if (sizeof(_______STR) > 3) \ + do_trace_printk(fmt, ##__VA_ARGS__); \ + else \ + trace_puts(fmt); \ +} while (0) + +#define do_trace_printk(fmt, args...) \ +do { \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_printk_check_format(fmt, ##args); \ + \ + if (__builtin_constant_p(fmt)) \ + __trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args); \ + else \ + __trace_printk(_THIS_IP_, fmt, ##args); \ +} while (0) + +extern __printf(2, 3) +int __trace_bprintk(unsigned long ip, const char *fmt, ...); + +extern __printf(2, 3) +int __trace_printk(unsigned long ip, const char *fmt, ...); + +/** + * trace_puts - write a string into the ftrace buffer + * @str: the string to record + * + * Note: __trace_bputs is an internal function for trace_puts and + * the @ip is passed in via the trace_puts macro. + * + * This is similar to trace_printk() but is made for those really fast + * paths that a developer wants the least amount of "Heisenbug" effects, + * where the processing of the print format is still too much. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_puts scattered around in + * your code. (Extra memory is used for special buffers that are + * allocated when trace_puts() is used.) + * + * Returns: 0 if nothing was written, positive # if string was. + * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) + */ + +#define trace_puts(str) ({ \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(str) ? 
str : NULL; \ + \ + if (__builtin_constant_p(str)) \ + __trace_bputs(_THIS_IP_, trace_printk_fmt); \ + else \ + __trace_puts(_THIS_IP_, str); \ +}) +extern int __trace_bputs(unsigned long ip, const char *str); +extern int __trace_puts(unsigned long ip, const char *str); + +extern void trace_dump_stack(int skip); + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement. + */ +#define ftrace_vprintk(fmt, vargs) \ +do { \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt __used \ + __section("__trace_printk_fmt") = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs); \ + } else \ + __ftrace_vprintk(_THIS_IP_, fmt, vargs); \ +} while (0) + +extern __printf(2, 0) int +__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); + +extern __printf(2, 0) int +__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); + +extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); +#else +static inline void tracing_start(void) { } +static inline void tracing_stop(void) { } +static inline void trace_dump_stack(int skip) { } + +static inline void tracing_on(void) { } +static inline void tracing_off(void) { } +static inline int tracing_is_on(void) { return 0; } +static inline void tracing_snapshot(void) { } +static inline void tracing_snapshot_alloc(void) { } + +static inline __printf(1, 2) +int trace_printk(const char *fmt, ...) 
+{ + return 0; +} +static __printf(1, 0) inline int +ftrace_vprintk(const char *fmt, va_list ap) +{ + return 0; +} +static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } +#endif /* CONFIG_TRACING */ + +#endif From cc20650a096370469919be0eb3b041fc5aa47b39 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 17 Jan 2026 08:34:47 +0000 Subject: [PATCH 079/107] scripts/bloat-o-meter: ignore __noinstr_text_start __noinstr_text_start is adding noise to the script, ignore it. For instance using __always_inline on __skb_incr_checksum_unnecessary and CC=clang build. Before this patch, __noinstr_text_start can show up and confuse us. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/2 grow/shrink: 3/0 up/down: 212/-206 (6) Function old new delta tcp6_gro_complete 208 283 +75 tcp4_gro_complete 376 449 +73 __noinstr_text_start 3536 3600 +64 __pfx___skb_incr_checksum_unnecessary 32 - -32 __skb_incr_checksum_unnecessary 174 - -174 Total: Before=25509464, After=25509470, chg +0.00% After this patch we have a more precise result. 
$ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/2 grow/shrink: 2/0 up/down: 148/-206 (-58) Function old new delta tcp6_gro_complete 208 283 +75 tcp4_gro_complete 376 449 +73 __pfx___skb_incr_checksum_unnecessary 32 - -32 __skb_incr_checksum_unnecessary 174 - -174 Total: Before=25505928, After=25505870, chg -0.00% Link: https://lkml.kernel.org/r/20260117083448.3877418-1-edumazet@google.com Signed-off-by: Eric Dumazet Signed-off-by: Andrew Morton --- scripts/bloat-o-meter | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 888ce286a351..db5dd18dc2d5 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -42,6 +42,7 @@ def getsizes(file, format): if name.startswith("__se_sys"): continue if name.startswith("__se_compat_sys"): continue if name.startswith("__addressable_"): continue + if name.startswith("__noinstr_text_start"): continue if name == "linux_banner": continue if name == "vermagic": continue # statics and some other optimizations adds random .NUMBER From 503efe850c7463a1e59df133b84461ef53c0361f Mon Sep 17 00:00:00 2001 From: Wang Yaxin Date: Mon, 19 Jan 2026 10:02:41 +0800 Subject: [PATCH 080/107] delayacct: add timestamp of delay max Problem ======= Commit 658eb5ab916d ("delayacct: add delay max to record delay peak") introduced the delay max for getdelays, which records abnormal latency peaks and helps us understand the magnitude of such delays. However, the peak latency value alone is insufficient for effective root cause analysis. Without the precise timestamp of when the peak occurred, we still lack the critical context needed to correlate it with other system events. Solution ======== To address this, we need to additionally record a precise timestamp when the maximum latency occurs. By correlating this timestamp with system logs and monitoring metrics, we can identify processes with abnormal resource usage at the same moment, which can help us to pinpoint root causes. 
Use Case ======== bash-4.4# ./getdelays -d -t 227 print delayacct stats ON TGID 227 CPU count real total virtual total delay total delay average delay max delay min delay max timestamp 46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58 IO count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A SWAP count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A RECLAIM count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A THRAS HING count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A COMPACT count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A WPCOPY count delay total delay average delay max delay min delay max timestamp 182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24 IRQ count delay total delay average delay max delay min delay max timestamp 0 0 0.000ms 0.000000ms 0.000000ms N/A Link: https://lkml.kernel.org/r/20260119100241520gWubW8-5QfhSf9gjqcc_E@zte.com.cn Signed-off-by: Wang Yaxin Cc: Fan Yu Cc: Jonathan Corbet Cc: xu xin Cc: Yang Yang Signed-off-by: Andrew Morton --- Documentation/accounting/delay-accounting.rst | 32 ++-- include/linux/delayacct.h | 8 + include/linux/sched.h | 5 + include/uapi/linux/taskstats.h | 22 ++- kernel/delayacct.c | 31 +++- kernel/sched/stats.h | 8 +- tools/accounting/getdelays.c | 172 +++++++++++++++--- 7 files changed, 223 insertions(+), 55 deletions(-) diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 86d7902a657f..e209c46241b0 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -107,22 +107,22 @@ Get sum and peak of delays, since system boot, for all pids with tgid 242:: TGID 242 - CPU count real 
total virtual total delay total delay average delay max delay min - 39 156000000 156576579 2111069 0.054ms 0.212296ms 0.031307ms - IO count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - SWAP count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - RECLAIM count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - THRASHING count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - COMPACT count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms - WPCOPY count delay total delay average delay max delay min - 156 11215873 0.072ms 0.207403ms 0.033913ms - IRQ count delay total delay average delay max delay min - 0 0 0.000ms 0.000000ms 0.000000ms + CPU count real total virtual total delay total delay average delay max delay min delay max timestamp + 46 188000000 192348334 4098012 0.089ms 0.429260ms 0.051205ms 2026-01-15T15:06:58 + IO count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + SWAP count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + RECLAIM count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + THRASHING count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + COMPACT count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A + WPCOPY count delay total delay average delay max delay min delay max timestamp + 182 19413338 0.107ms 0.547353ms 0.022462ms 2026-01-15T15:05:24 + IRQ count delay total delay average delay max delay min delay max timestamp + 0 0 0.000ms 0.000000ms 0.000000ms N/A Get IO accounting for pid 1, it works only with -p:: diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 
800dcc360db2..ecb06f16d22c 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -69,6 +69,14 @@ struct task_delay_info { u32 compact_count; /* total count of memory compact */ u32 wpcopy_count; /* total count of write-protect copy */ u32 irq_count; /* total count of IRQ/SOFTIRQ */ + + struct timespec64 blkio_delay_max_ts; + struct timespec64 swapin_delay_max_ts; + struct timespec64 freepages_delay_max_ts; + struct timespec64 thrashing_delay_max_ts; + struct timespec64 compact_delay_max_ts; + struct timespec64 wpcopy_delay_max_ts; + struct timespec64 irq_delay_max_ts; }; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index da0133524d08..1d22b6229b95 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -49,6 +49,7 @@ #include #include #include +#include #ifndef COMPILE_OFFSETS #include #endif @@ -86,6 +87,7 @@ struct signal_struct; struct task_delay_info; struct task_group; struct task_struct; +struct timespec64; struct user_event_mm; #include @@ -435,6 +437,9 @@ struct sched_info { /* When were we last queued to run? 
*/ unsigned long long last_queued; + /* Timestamp of max time spent waiting on a runqueue: */ + struct timespec64 max_run_delay_ts; + #endif /* CONFIG_SCHED_INFO */ }; diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 5929030d4e8b..1b31e8e14d2f 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -18,6 +18,16 @@ #define _LINUX_TASKSTATS_H #include +#ifdef __KERNEL__ +#include +#else +#ifndef _LINUX_TIME64_H +struct timespec64 { + __s64 tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; +#endif +#endif /* Format for per-task data returned to userland when * - a task exits @@ -34,7 +44,7 @@ */ -#define TASKSTATS_VERSION 16 +#define TASKSTATS_VERSION 17 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -230,6 +240,16 @@ struct taskstats { __u64 irq_delay_max; __u64 irq_delay_min; + + /*v17: delay max timestamp record*/ + struct timespec64 cpu_delay_max_ts; + struct timespec64 blkio_delay_max_ts; + struct timespec64 swapin_delay_max_ts; + struct timespec64 freepages_delay_max_ts; + struct timespec64 thrashing_delay_max_ts; + struct timespec64 compact_delay_max_ts; + struct timespec64 wpcopy_delay_max_ts; + struct timespec64 irq_delay_max_ts; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 30e7912ebb0d..d58ffc63bcba 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,6 +18,7 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = tsk->delays->type##_delay_min; \ + d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 
0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -104,7 +105,8 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumulator (@total) and @count */ -static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min) +static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, + u64 *max, u64 *min, struct timespec64 *ts) { s64 ns = local_clock() - *start; unsigned long flags; @@ -113,8 +115,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou raw_spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - if (ns > *max) + if (ns > *max) { *max = ns; + ktime_get_real_ts64(ts); + } if (*min == 0 || ns < *min) *min = ns; raw_spin_unlock_irqrestore(lock, flags); @@ -137,7 +141,8 @@ void __delayacct_blkio_end(struct task_struct *p) &p->delays->blkio_delay, &p->delays->blkio_count, &p->delays->blkio_delay_max, - &p->delays->blkio_delay_min); + &p->delays->blkio_delay_min, + &p->delays->blkio_delay_max_ts); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -170,6 +175,7 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; + d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 
0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; @@ -217,7 +223,8 @@ void __delayacct_freepages_end(void) ¤t->delays->freepages_delay, ¤t->delays->freepages_count, ¤t->delays->freepages_delay_max, - ¤t->delays->freepages_delay_min); + ¤t->delays->freepages_delay_min, + ¤t->delays->freepages_delay_max_ts); } void __delayacct_thrashing_start(bool *in_thrashing) @@ -241,7 +248,8 @@ void __delayacct_thrashing_end(bool *in_thrashing) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count, ¤t->delays->thrashing_delay_max, - ¤t->delays->thrashing_delay_min); + ¤t->delays->thrashing_delay_min, + ¤t->delays->thrashing_delay_max_ts); } void __delayacct_swapin_start(void) @@ -256,7 +264,8 @@ void __delayacct_swapin_end(void) ¤t->delays->swapin_delay, ¤t->delays->swapin_count, ¤t->delays->swapin_delay_max, - ¤t->delays->swapin_delay_min); + ¤t->delays->swapin_delay_min, + ¤t->delays->swapin_delay_max_ts); } void __delayacct_compact_start(void) @@ -271,7 +280,8 @@ void __delayacct_compact_end(void) ¤t->delays->compact_delay, ¤t->delays->compact_count, ¤t->delays->compact_delay_max, - ¤t->delays->compact_delay_min); + ¤t->delays->compact_delay_min, + ¤t->delays->compact_delay_max_ts); } void __delayacct_wpcopy_start(void) @@ -286,7 +296,8 @@ void __delayacct_wpcopy_end(void) ¤t->delays->wpcopy_delay, ¤t->delays->wpcopy_count, ¤t->delays->wpcopy_delay_max, - ¤t->delays->wpcopy_delay_min); + ¤t->delays->wpcopy_delay_min, + ¤t->delays->wpcopy_delay_max_ts); } void __delayacct_irq(struct task_struct *task, u32 delta) @@ -296,8 +307,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta) raw_spin_lock_irqsave(&task->delays->lock, flags); task->delays->irq_delay += delta; task->delays->irq_count++; - if (delta > task->delays->irq_delay_max) + if (delta > task->delays->irq_delay_max) { task->delays->irq_delay_max = delta; + ktime_get_real_ts64(&task->delays->irq_delay_max_ts); + } if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min)) 
task->delays->irq_delay_min = delta; raw_spin_unlock_irqrestore(&task->delays->lock, flags); diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c903f1a42891..a612cf253c87 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -253,8 +253,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t) delta = rq_clock(rq) - t->sched_info.last_queued; t->sched_info.last_queued = 0; t->sched_info.run_delay += delta; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; rq_sched_info_dequeue(rq, delta); @@ -278,8 +280,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) t->sched_info.run_delay += delta; t->sched_info.last_arrival = now; t->sched_info.pcount++; - if (delta > t->sched_info.max_run_delay) + if (delta > t->sched_info.max_run_delay) { t->sched_info.max_run_delay = delta; + ktime_get_real_ts64(&t->sched_info.max_run_delay_ts); + } if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay)) t->sched_info.min_run_delay = delta; diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 21cb3c3d1331..64796c0223be 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -194,6 +195,37 @@ static int get_family_id(int sd) #define average_ms(t, c) (t / 1000000ULL / (c ? 
c : 1)) #define delay_ms(t) (t / 1000000ULL) +/* + * Format timespec64 to human readable string (YYYY-MM-DD HH:MM:SS) + * Returns formatted string or "N/A" if timestamp is zero + */ +static const char *format_timespec64(struct timespec64 *ts) +{ + static char buffer[32]; + struct tm tm_info; + time_t time_sec; + + /* Check if timestamp is zero (not set) */ + if (ts->tv_sec == 0 && ts->tv_nsec == 0) + return "N/A"; + + time_sec = (time_t)ts->tv_sec; + + /* Use thread-safe localtime_r */ + if (localtime_r(&time_sec, &tm_info) == NULL) + return "N/A"; + + snprintf(buffer, sizeof(buffer), "%04d-%02d-%02dT%02d:%02d:%02d", + tm_info.tm_year + 1900, + tm_info.tm_mon + 1, + tm_info.tm_mday, + tm_info.tm_hour, + tm_info.tm_min, + tm_info.tm_sec); + + return buffer; +} + /* * Version compatibility note: * Field availability depends on taskstats version (t->version), @@ -205,13 +237,28 @@ static int get_family_id(int sd) * version >= 13 - supports WPCOPY statistics * version >= 14 - supports IRQ statistics * version >= 16 - supports *_max and *_min delay statistics + * version >= 17 - supports delay max timestamp statistics * * Always verify version before accessing version-dependent fields * to maintain backward compatibility. 
*/ #define PRINT_CPU_DELAY(version, t) \ do { \ - if (version >= 16) { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%15s%15s%25s\n", \ + "CPU", "count", "real total", "virtual total", \ + "delay total", "delay average", "delay max", \ + "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->cpu_count, \ + (unsigned long long)(t)->cpu_run_real_total, \ + (unsigned long long)(t)->cpu_run_virtual_total, \ + (unsigned long long)(t)->cpu_delay_total, \ + average_ms((double)(t)->cpu_delay_total, (t)->cpu_count), \ + delay_ms((double)(t)->cpu_delay_max), \ + delay_ms((double)(t)->cpu_delay_min), \ + format_timespec64(&(t)->cpu_delay_max_ts)); \ + } else if (version >= 16) { \ printf("%-10s%15s%15s%15s%15s%15s%15s%15s\n", \ "CPU", "count", "real total", "virtual total", \ "delay total", "delay average", "delay max", "delay min"); \ @@ -257,44 +304,115 @@ static int get_family_id(int sd) } \ } while (0) +#define PRINT_FILED_DELAY_WITH_TS(name, version, t, count, total, max, min, max_ts) \ + do { \ + if (version >= 17) { \ + printf("%-10s%15s%15s%15s%15s%15s%25s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min", "delay max timestamp"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms%23s\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min), \ + format_timespec64(&(t)->max_ts)); \ + } else if (version >= 16) { \ + printf("%-10s%15s%15s%15s%15s%15s\n", \ + name, "count", "delay total", "delay average", \ + "delay max", "delay min"); \ + printf(" %15llu%15llu%15.3fms%13.6fms%13.6fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count), \ + delay_ms((double)(t)->max), \ + delay_ms((double)(t)->min)); \ + } else { \ + printf("%-10s%15s%15s%15s\n", \ 
+ name, "count", "delay total", "delay average"); \ + printf(" %15llu%15llu%15.3fms\n", \ + (unsigned long long)(t)->count, \ + (unsigned long long)(t)->total, \ + average_ms((double)(t)->total, (t)->count)); \ + } \ + } while (0) + static void print_delayacct(struct taskstats *t) { printf("\n\n"); PRINT_CPU_DELAY(t->version, t); - PRINT_FILED_DELAY("IO", t->version, t, - blkio_count, blkio_delay_total, - blkio_delay_max, blkio_delay_min); + /* Use new macro with timestamp support for version >= 17 */ + if (t->version >= 17) { + PRINT_FILED_DELAY_WITH_TS("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min, blkio_delay_max_ts); - PRINT_FILED_DELAY("SWAP", t->version, t, - swapin_count, swapin_delay_total, - swapin_delay_max, swapin_delay_min); + PRINT_FILED_DELAY_WITH_TS("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min, swapin_delay_max_ts); - PRINT_FILED_DELAY("RECLAIM", t->version, t, - freepages_count, freepages_delay_total, - freepages_delay_max, freepages_delay_min); + PRINT_FILED_DELAY_WITH_TS("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min, freepages_delay_max_ts); - PRINT_FILED_DELAY("THRASHING", t->version, t, - thrashing_count, thrashing_delay_total, - thrashing_delay_max, thrashing_delay_min); + PRINT_FILED_DELAY_WITH_TS("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min, thrashing_delay_max_ts); - if (t->version >= 11) { - PRINT_FILED_DELAY("COMPACT", t->version, t, - compact_count, compact_delay_total, - compact_delay_max, compact_delay_min); - } + if (t->version >= 11) { + PRINT_FILED_DELAY_WITH_TS("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min, compact_delay_max_ts); + } - if (t->version >= 13) { - PRINT_FILED_DELAY("WPCOPY", t->version, t, - wpcopy_count, wpcopy_delay_total, - 
wpcopy_delay_max, wpcopy_delay_min); - } + if (t->version >= 13) { + PRINT_FILED_DELAY_WITH_TS("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min, wpcopy_delay_max_ts); + } - if (t->version >= 14) { - PRINT_FILED_DELAY("IRQ", t->version, t, - irq_count, irq_delay_total, - irq_delay_max, irq_delay_min); + if (t->version >= 14) { + PRINT_FILED_DELAY_WITH_TS("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min, irq_delay_max_ts); + } + } else { + /* Use original macro for older versions */ + PRINT_FILED_DELAY("IO", t->version, t, + blkio_count, blkio_delay_total, + blkio_delay_max, blkio_delay_min); + + PRINT_FILED_DELAY("SWAP", t->version, t, + swapin_count, swapin_delay_total, + swapin_delay_max, swapin_delay_min); + + PRINT_FILED_DELAY("RECLAIM", t->version, t, + freepages_count, freepages_delay_total, + freepages_delay_max, freepages_delay_min); + + PRINT_FILED_DELAY("THRASHING", t->version, t, + thrashing_count, thrashing_delay_total, + thrashing_delay_max, thrashing_delay_min); + + if (t->version >= 11) { + PRINT_FILED_DELAY("COMPACT", t->version, t, + compact_count, compact_delay_total, + compact_delay_max, compact_delay_min); + } + + if (t->version >= 13) { + PRINT_FILED_DELAY("WPCOPY", t->version, t, + wpcopy_count, wpcopy_delay_total, + wpcopy_delay_max, wpcopy_delay_min); + } + + if (t->version >= 14) { + PRINT_FILED_DELAY("IRQ", t->version, t, + irq_count, irq_delay_total, + irq_delay_max, irq_delay_min); + } } } From 666183dcdd9ad3b8156a1df7f204f728f720380f Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Wed, 21 Jan 2026 09:35:08 +0800 Subject: [PATCH 081/107] rapidio: replace rio_free_net() with kfree() in rio_scan_alloc_net() When idtab allocation fails, net is not registered with rio_add_net() yet, so kfree(net) is sufficient to release the memory. Set mport->net to NULL to avoid dangling pointer. 
Link: https://lkml.kernel.org/r/20260121013508.195836-1-lihaoxiang@isrc.iscas.ac.cn Fixes: e6b585ca6e81 ("rapidio: move net allocation into core code") Signed-off-by: Haoxiang Li Reviewed-by: Andrew Morton Cc: Alexandre Bounine Cc: Matt Porter Cc: Signed-off-by: Andrew Morton --- drivers/rapidio/rio-scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index c12941f71e2c..dcd6619a4b02 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -854,7 +854,8 @@ static struct rio_net *rio_scan_alloc_net(struct rio_mport *mport, if (idtab == NULL) { pr_err("RIO: failed to allocate destID table\n"); - rio_free_net(net); + kfree(net); + mport->net = NULL; net = NULL; } else { net->enum_data = idtab; From 5138c936c2c82c9be8883921854bc6f7e1177d8c Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Wed, 10 Dec 2025 09:57:24 +0800 Subject: [PATCH 082/107] ocfs2: fix reflink preserve cleanup issue commit c06c303832ec ("ocfs2: fix xattr array entry __counted_by error") doesn't handle all cases and the cleanup job for preserved xattr entries still has bug: - the 'last' pointer should be shifted by one unit after cleanup an array entry. - current code logic doesn't cleanup the first entry when xh_count is 1. Note, commit c06c303832ec is also a bug fix for 0fe9b66c65f3. 
Link: https://lkml.kernel.org/r/20251210015725.8409-2-heming.zhao@suse.com Fixes: 0fe9b66c65f3 ("ocfs2: Add preserve to reflink.") Signed-off-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5fd85f517868..e434a62dd69f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6395,6 +6395,10 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, (void *)last - (void *)xe); memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)] - 1; + } else { + memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); + last = NULL; } /* From 480e1d5c64bb14441f79f2eb9421d5e26f91ea3d Mon Sep 17 00:00:00 2001 From: Li Chen Date: Tue, 20 Jan 2026 20:40:04 +0800 Subject: [PATCH 083/107] kexec: derive purgatory entry from symbol kexec_load_purgatory() derives image->start by locating e_entry inside an SHF_EXECINSTR section. If the purgatory object contains multiple executable sections with overlapping sh_addr, the entrypoint check can match more than once and trigger a WARN. Derive the entry section from the purgatory_start symbol when present and compute image->start from its final placement. Keep the existing e_entry fallback for purgatories that do not expose the symbol. 
WARNING: kernel/kexec_file.c:1009 at kexec_load_purgatory+0x395/0x3c0, CPU#10: kexec/1784 Call Trace: bzImage64_load+0x133/0xa00 __do_sys_kexec_file_load+0x2b3/0x5c0 do_syscall_64+0x81/0x610 entry_SYSCALL_64_after_hwframe+0x76/0x7e [me@linux.beauty: move helper to avoid forward declaration, per Baoquan] Link: https://lkml.kernel.org/r/20260128043511.316860-1-me@linux.beauty Link: https://lkml.kernel.org/r/20260120124005.148381-1-me@linux.beauty Fixes: 8652d44f466a ("kexec: support purgatories with .text.hot sections") Signed-off-by: Li Chen Acked-by: Baoquan He Cc: Alexander Graf Cc: Eric Biggers Cc: Li Chen Cc: Philipp Rudo Cc: Ricardo Ribalda Delgado Cc: Ross Zwisler Cc: Sourabh Jain Cc: Steven Rostedt Cc: Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 131 +++++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 57 deletions(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index eb62a9794242..2bfbb2d144e6 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -882,6 +882,60 @@ out_free_sha_regions: } #ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY +/* + * kexec_purgatory_find_symbol - find a symbol in the purgatory + * @pi: Purgatory to search in. + * @name: Name of the symbol. + * + * Return: pointer to symbol in read-only symtab on success, NULL on error. 
+ */ +static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + const Elf_Shdr *sechdrs; + const Elf_Ehdr *ehdr; + const Elf_Sym *syms; + const char *strtab; + int i, k; + + if (!pi->ehdr) + return NULL; + + ehdr = pi->ehdr; + sechdrs = (void *)ehdr + ehdr->e_shoff; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (void *)ehdr + sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} /* * kexec_purgatory_setup_kbuf - prepare buffer to load purgatory. * @pi: Purgatory to be loaded. 
@@ -960,6 +1014,10 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, unsigned long offset; size_t sechdrs_size; Elf_Shdr *sechdrs; + const Elf_Sym *entry_sym; + u16 entry_shndx = 0; + unsigned long entry_off = 0; + bool start_fixed = false; int i; /* @@ -977,6 +1035,12 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, bss_addr = kbuf->mem + kbuf->bufsz; kbuf->image->start = pi->ehdr->e_entry; + entry_sym = kexec_purgatory_find_symbol(pi, "purgatory_start"); + if (entry_sym) { + entry_shndx = entry_sym->st_shndx; + entry_off = entry_sym->st_value; + } + for (i = 0; i < pi->ehdr->e_shnum; i++) { unsigned long align; void *src, *dst; @@ -994,6 +1058,13 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, offset = ALIGN(offset, align); + if (!start_fixed && entry_sym && i == entry_shndx && + (sechdrs[i].sh_flags & SHF_EXECINSTR) && + entry_off < sechdrs[i].sh_size) { + kbuf->image->start = kbuf->mem + offset + entry_off; + start_fixed = true; + } + /* * Check if the segment contains the entry point, if so, * calculate the value of image->start based on it. @@ -1004,13 +1075,14 @@ static int kexec_purgatory_setup_sechdrs(struct purgatory_info *pi, * is not set to the initial value, and warn the user so they * have a chance to fix their purgatory's linker script. */ - if (sechdrs[i].sh_flags & SHF_EXECINSTR && + if (!start_fixed && sechdrs[i].sh_flags & SHF_EXECINSTR && pi->ehdr->e_entry >= sechdrs[i].sh_addr && pi->ehdr->e_entry < (sechdrs[i].sh_addr + sechdrs[i].sh_size) && - !WARN_ON(kbuf->image->start != pi->ehdr->e_entry)) { + kbuf->image->start == pi->ehdr->e_entry) { kbuf->image->start -= sechdrs[i].sh_addr; kbuf->image->start += kbuf->mem + offset; + start_fixed = true; } src = (void *)pi->ehdr + sechdrs[i].sh_offset; @@ -1128,61 +1200,6 @@ out_free_kbuf: return ret; } -/* - * kexec_purgatory_find_symbol - find a symbol in the purgatory - * @pi: Purgatory to search in. - * @name: Name of the symbol. 
- * - * Return: pointer to symbol in read-only symtab on success, NULL on error. - */ -static const Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, - const char *name) -{ - const Elf_Shdr *sechdrs; - const Elf_Ehdr *ehdr; - const Elf_Sym *syms; - const char *strtab; - int i, k; - - if (!pi->ehdr) - return NULL; - - ehdr = pi->ehdr; - sechdrs = (void *)ehdr + ehdr->e_shoff; - - for (i = 0; i < ehdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_SYMTAB) - continue; - - if (sechdrs[i].sh_link >= ehdr->e_shnum) - /* Invalid strtab section number */ - continue; - strtab = (void *)ehdr + sechdrs[sechdrs[i].sh_link].sh_offset; - syms = (void *)ehdr + sechdrs[i].sh_offset; - - /* Go through symbols for a match */ - for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { - if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) - continue; - - if (strcmp(strtab + syms[k].st_name, name) != 0) - continue; - - if (syms[k].st_shndx == SHN_UNDEF || - syms[k].st_shndx >= ehdr->e_shnum) { - pr_debug("Symbol: %s has bad section index %d.\n", - name, syms[k].st_shndx); - return NULL; - } - - /* Found the symbol we are looking for */ - return &syms[k]; - } - } - - return NULL; -} - void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) { struct purgatory_info *pi = &image->purgatory_info; From 8924336531e21b187d724b5fdf5277269c9ec22c Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Thu, 22 Jan 2026 15:13:03 +0100 Subject: [PATCH 084/107] ipc: don't audit capability check in ipc_permissions() The IPC sysctls implement the ctl_table_root::permissions hook and they override the file access mode based on the CAP_CHECKPOINT_RESTORE capability, which is being checked regardless of whether any access is actually denied or not, so if an LSM denies the capability, an audit record may be logged even when access is in fact granted. 
It wouldn't be viable to restructure the sysctl permission logic to only check the capability when the access would be actually denied if it's not granted. Thus, do the same as in net_ctl_permissions() (net/sysctl_net.c) - switch from ns_capable() to ns_capable_noaudit(), so that the check never emits an audit record. Link: https://lkml.kernel.org/r/20260122141303.241133-1-omosnace@redhat.com Fixes: 0889f44e2810 ("ipc: Check permissions for checkpoint_restart sysctls at open time") Signed-off-by: Ondrej Mosnacek Acked-by: Alexey Gladkov Acked-by: Serge Hallyn Cc: Eric Biederman Cc: Paul Moore Signed-off-by: Andrew Morton --- include/linux/capability.h | 6 ++++++ ipc/ipc_sysctl.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/capability.h b/include/linux/capability.h index 1fb08922552c..37db92b3d6f8 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -203,6 +203,12 @@ static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns) ns_capable(ns, CAP_SYS_ADMIN); } +static inline bool checkpoint_restore_ns_capable_noaudit(struct user_namespace *ns) +{ + return ns_capable_noaudit(ns, CAP_CHECKPOINT_RESTORE) || + ns_capable_noaudit(ns, CAP_SYS_ADMIN); +} + /* audit system wants to get cap info from files as well */ int get_vfs_caps_from_disk(struct mnt_idmap *idmap, const struct dentry *dentry, diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 15b17e86e198..9b087ebeb643 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -214,7 +214,7 @@ static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || (table->data == &ns->ids[IPC_MSG_IDS].next_id) || (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && - checkpoint_restore_ns_capable(ns->user_ns)) + checkpoint_restore_ns_capable_noaudit(ns->user_ns)) mode = 0666; else #endif From 0895a000e4fff9e950a7894210db45973e485c35 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: 
Thu, 22 Jan 2026 15:07:45 +0100 Subject: [PATCH 085/107] ucount: check for CAP_SYS_RESOURCE using ns_capable_noaudit() The user.* sysctls implement the ctl_table_root::permissions hook and they override the file access mode based on the CAP_SYS_RESOURCE capability (at most rwx if capable, at most r-- if not). The capability is being checked unconditionally, so if an LSM denies the capability, an audit record may be logged even when access is in fact granted. Given the logic in the set_permissions() function in kernel/ucount.c and the unfortunate way the permission checking is implemented, it doesn't seem viable to avoid false positive denials by deferring the capability check. Thus, do the same as in net_ctl_permissions() (net/sysctl_net.c) - switch from ns_capable() to ns_capable_noaudit(), so that the check never logs an audit record. Link: https://lkml.kernel.org/r/20260122140745.239428-1-omosnace@redhat.com Fixes: dbec28460a89 ("userns: Add per user namespace sysctls.") Signed-off-by: Ondrej Mosnacek Reviewed-by: Paul Moore Acked-by: Serge Hallyn Cc: Eric Biederman Cc: Alexey Gladkov Signed-off-by: Andrew Morton --- kernel/ucount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/ucount.c b/kernel/ucount.c index 586af49fc03e..fc4a8f2d3096 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -47,7 +47,7 @@ static int set_permissions(struct ctl_table_header *head, int mode; /* Allow users with CAP_SYS_RESOURCE unrestrained access */ - if (ns_capable(user_ns, CAP_SYS_RESOURCE)) + if (ns_capable_noaudit(user_ns, CAP_SYS_RESOURCE)) mode = (table->mode & S_IRWXU) >> 6; else /* Allow all others at most read-only access */ From b50634c5e84a7a57c20b03e367a43f1b63b7ea23 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 22 Jan 2026 14:17:57 +0200 Subject: [PATCH 086/107] kho: cleanup error handling in kho_populate() * use dedicated labels for error handling instead of checking if a pointer is not null to decide if it should be 
unmapped * drop assignment of values to err that are only used to print a numeric error code, there are pr_warn()s for each failure already so printing a numeric error code in the next line does not add anything useful Link: https://lkml.kernel.org/r/20260122121757.575987-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Mike Rapoport Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 39 +++++++++++++----------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index fbfa5a04faed..e0a50b012ba3 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -1455,46 +1455,40 @@ void __init kho_memory_init(void) void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) { + unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); struct kho_scratch *scratch = NULL; phys_addr_t mem_map_phys; void *fdt = NULL; - int err = 0; - unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch); + int err; /* Validate the input FDT */ fdt = early_memremap(fdt_phys, fdt_len); if (!fdt) { pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys); - err = -EFAULT; - goto out; + goto err_report; } err = fdt_check_header(fdt); if (err) { pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n", fdt_phys, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE); if (err) { pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n", fdt_phys, KHO_FDT_COMPATIBLE, err); - err = -EINVAL; - goto out; + goto err_unmap_fdt; } mem_map_phys = kho_get_mem_map_phys(fdt); - if (!mem_map_phys) { - err = -ENOENT; - goto out; - } + if (!mem_map_phys) + goto err_unmap_fdt; scratch = early_memremap(scratch_phys, scratch_len); if (!scratch) { 
pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n", scratch_phys, scratch_len); - err = -EFAULT; - goto out; + goto err_unmap_fdt; } /* @@ -1511,7 +1505,7 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, if (WARN_ON(err)) { pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %pe", &area->addr, &size, ERR_PTR(err)); - goto out; + goto err_unmap_scratch; } pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size); } @@ -1533,13 +1527,14 @@ void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len, kho_scratch_cnt = scratch_cnt; pr_info("found kexec handover data.\n"); -out: - if (fdt) - early_memunmap(fdt, fdt_len); - if (scratch) - early_memunmap(scratch, scratch_len); - if (err) - pr_warn("disabling KHO revival: %d\n", err); + return; + +err_unmap_scratch: + early_memunmap(scratch, scratch_len); +err_unmap_fdt: + early_memunmap(fdt, fdt_len); +err_report: + pr_warn("disabling KHO revival\n"); } /* Helper functions for kexec_file_load */ From 96a54b8ffc8c4567c32fe0b6996669f1132b026d Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 26 Jan 2026 12:20:46 +0100 Subject: [PATCH 087/107] crash_dump: fix dm_crypt keys locking and ref leak crash_load_dm_crypt_keys() reads dm-crypt volume keys from the user keyring. It uses user_key_payload_locked() without holding key->sem, which makes lockdep complain when kexec_file_load() assembles the crash image: ============================= WARNING: suspicious RCU usage ----------------------------- ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 no locks held by kexec/4875. stack backtrace: Call Trace: dump_stack_lvl+0x5d/0x80 lockdep_rcu_suspicious.cold+0x4e/0x96 crash_load_dm_crypt_keys+0x314/0x390 bzImage64_load+0x116/0x9a0 ? 
__lock_acquire+0x464/0x1ba0 __do_sys_kexec_file_load+0x26a/0x4f0 do_syscall_64+0xbd/0x430 entry_SYSCALL_64_after_hwframe+0x77/0x7f In addition, the key returned by request_key() is never key_put()'d, leaking a key reference on each load attempt. Take key->sem while copying the payload and drop the key reference afterwards. Link: https://lkml.kernel.org/r/patch.git-2d4d76083a5c.your-ad-here.call-01769426386-ext-2560@work.hours Fixes: 479e58549b0f ("crash_dump: store dm crypt keys in kdump reserved memory") Signed-off-by: Vasily Gorbik Cc: Baoquan He Cc: Coiby Xu Cc: Dave Young Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton --- kernel/crash_dump_dm_crypt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/kernel/crash_dump_dm_crypt.c b/kernel/crash_dump_dm_crypt.c index 0d23dc1de67c..37129243054d 100644 --- a/kernel/crash_dump_dm_crypt.c +++ b/kernel/crash_dump_dm_crypt.c @@ -143,6 +143,7 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) { const struct user_key_payload *ukp; struct key *key; + int ret = 0; kexec_dprintk("Requesting logon key %s", dm_key->key_desc); key = request_key(&key_type_logon, dm_key->key_desc, NULL); @@ -152,20 +153,28 @@ static int read_key_from_user_keying(struct dm_crypt_key *dm_key) return PTR_ERR(key); } + down_read(&key->sem); ukp = user_key_payload_locked(key); - if (!ukp) - return -EKEYREVOKED; + if (!ukp) { + ret = -EKEYREVOKED; + goto out; + } if (ukp->datalen > KEY_SIZE_MAX) { pr_err("Key size %u exceeds maximum (%u)\n", ukp->datalen, KEY_SIZE_MAX); - return -EINVAL; + ret = -EINVAL; + goto out; } memcpy(dm_key->data, ukp->data, ukp->datalen); dm_key->key_size = ukp->datalen; kexec_dprintk("Get dm crypt key (size=%u) %s: %8ph\n", dm_key->key_size, dm_key->key_desc, dm_key->data); - return 0; + +out: + up_read(&key->sem); + key_put(key); + return ret; } struct config_key { From 427b2535f51342de3156babc6bdc3f3b7dd2c707 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: 
Tue, 20 Jan 2026 17:59:11 +0000 Subject: [PATCH 088/107] kho: skip memoryless NUMA nodes when reserving scratch areas kho_reserve_scratch() iterates over all online NUMA nodes to allocate per-node scratch memory. On systems with memoryless NUMA nodes (nodes that have CPUs but no memory), memblock_alloc_range_nid() fails because there is no memory available on that node. This causes KHO initialization to fail and kho_enable to be set to false. Some ARM64 systems have NUMA topologies where certain nodes contain only CPUs without any associated memory. These configurations are valid and should not prevent KHO from functioning. Fix this by only counting nodes that have memory (N_MEMORY state) and skip memoryless nodes in the per-node scratch allocation loop. Link: https://lkml.kernel.org/r/20260120175913.34368-1-epetron@amazon.de Fixes: 3dc92c311498 ("kexec: add Kexec HandOver (KHO) generation helpers"). Signed-off-by: Evangelos Petrongonas Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pasha Tatashin Cc: Alexander Graf Cc: Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index e0a50b012ba3..8a2b2a7e50fc 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -655,7 +655,7 @@ static void __init kho_reserve_scratch(void) scratch_size_update(); /* FIXME: deal with node hot-plug/remove */ - kho_scratch_cnt = num_online_nodes() + 2; + kho_scratch_cnt = nodes_weight(node_states[N_MEMORY]) + 2; size = kho_scratch_cnt * sizeof(*kho_scratch); kho_scratch = memblock_alloc(size, PAGE_SIZE); if (!kho_scratch) { @@ -691,7 +691,11 @@ static void __init kho_reserve_scratch(void) kho_scratch[i].size = size; i++; - for_each_online_node(nid) { + /* + * Loop over nodes that have both memory and are online. 
Skip + * memoryless nodes, as we can not allocate scratch areas there. + */ + for_each_node_state(nid, N_MEMORY) { size = scratch_size_node(nid); addr = memblock_alloc_range_nid(size, CMA_MIN_ALIGNMENT_BYTES, 0, MEMBLOCK_ALLOC_ACCESSIBLE, From 33caa19f4b318378bf54692b30724f442c981dad Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:11 +0100 Subject: [PATCH 089/107] android/binder: don't abuse current->group_leader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "don't abuse task_struct.group_leader", v2. This series removes the usage of ->group_leader when it is "obviously unnecessary". I am going to move ->group_leader from task_struct to signal_struct or at least add the new task_group_leader() helper. So I will send more tree-wide changes on top of this series. This patch (of 7): Cleanup and preparation to simplify the next changes. - Use current->tgid instead of current->group_leader->pid - Use the value returned by get_task_struct() to initialize proc->tsk Link: https://lkml.kernel.org/r/aXY_h8i78n6yD9JY@redhat.com Link: https://lkml.kernel.org/r/aXY_ryGDwdygl1Tv@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Alice Ryhl Cc: Boris Brezillon Cc: Christan König Cc: David S. 
Miller Cc: Eric Dumazet Cc: Felix Kuehling Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- drivers/android/binder.c | 7 +++---- drivers/android/binder_alloc.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 535fc881c8da..dea701daabb0 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6046,7 +6046,7 @@ static int binder_open(struct inode *nodp, struct file *filp) bool existing_pid = false; binder_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d:%d\n", __func__, - current->group_leader->pid, current->pid); + current->tgid, current->pid); proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) @@ -6055,8 +6055,8 @@ static int binder_open(struct inode *nodp, struct file *filp) dbitmap_init(&proc->dmap); spin_lock_init(&proc->inner_lock); spin_lock_init(&proc->outer_lock); - get_task_struct(current->group_leader); - proc->tsk = current->group_leader; + proc->tsk = get_task_struct(current->group_leader); + proc->pid = current->tgid; proc->cred = get_cred(filp->f_cred); INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->freeze_wait); @@ -6075,7 +6075,6 @@ static int binder_open(struct inode *nodp, struct file *filp) binder_alloc_init(&proc->alloc); binder_stats_created(BINDER_STAT_PROC); - proc->pid = current->group_leader->pid; INIT_LIST_HEAD(&proc->delivered_death); INIT_LIST_HEAD(&proc->delivered_freeze); INIT_LIST_HEAD(&proc->waiting_threads); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 979c96b74cad..145ed5f14cdb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1233,7 +1233,7 @@ static struct shrinker *binder_shrinker; VISIBLE_IF_KUNIT void __binder_alloc_init(struct binder_alloc *alloc, struct list_lru *freelist) { - alloc->pid = current->group_leader->pid; + alloc->pid = current->tgid; alloc->mm = current->mm; 
mmgrab(alloc->mm); mutex_init(&alloc->mutex); From a170919d1b670f531f31192bef4dff08be636a7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:20 +0100 Subject: [PATCH 090/107] android/binder: use same_thread_group(proc->tsk, current) in binder_mmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With or without this change the checked condition can be falsely true if proc->tsk execs, but this is fine: binder_alloc_mmap_handler() checks vma->vm_mm == alloc->mm. Link: https://lkml.kernel.org/r/aXY_uPYyUg4rwNOg@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Alice Ryhl Cc: Boris Brezillon Cc: Christan König Cc: David S. Miller Cc: Eric Dumazet Cc: Felix Kuehling Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index dea701daabb0..b3b73303f84d 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6015,7 +6015,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) { struct binder_proc *proc = filp->private_data; - if (proc->tsk != current->group_leader) + if (!same_thread_group(proc->tsk, current)) return -EINVAL; binder_debug(BINDER_DEBUG_OPEN_CLOSE, From 7d08e0916a59e006c262dcd2f7168d0336c80265 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:28 +0100 Subject: [PATCH 091/107] drm/amdgpu: don't abuse current->group_leader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup and preparation to simplify the next changes. 
- Use current->tgid instead of current->group_leader->pid - Use get_task_pid(current, PIDTYPE_TGID) instead of get_task_pid(current->group_leader, PIDTYPE_PID) Link: https://lkml.kernel.org/r/aXY_wKewzV5lCa5I@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Felix Kuehling Cc: Alice Ryhl Cc: Boris Brezillon Cc: Christan König Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index b1c24c8fa686..df22b54ba346 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1421,7 +1421,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, goto create_evict_fence_fail; } - info->pid = get_task_pid(current->group_leader, PIDTYPE_PID); + info->pid = get_task_pid(current, PIDTYPE_TGID); INIT_DELAYED_WORK(&info->restore_userptr_work, amdgpu_amdkfd_restore_userptr_worker); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index a67285118c37..a0f8ba382b9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2554,7 +2554,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) if (current->group_leader->mm != current->mm) return; - vm->task_info->tgid = current->group_leader->pid; + vm->task_info->tgid = current->tgid; get_task_comm(vm->task_info->process_name, current->group_leader); } From a87da7a9fa7b5c39b01d7fa30415c9211e775f2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:36 +0100 Subject: [PATCH 092/107] drm/amd: kill the outdated "Only the pthreads threading model is supported" checks MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Nowadays task->group_leader->mm != task->mm is only possible if a) task is not a group leader and b) task->group_leader->mm == NULL because task->group_leader has already exited using sys_exit(). I don't think that drm/amd tries to detect/nack this case. Link: https://lkml.kernel.org/r/aXY_yLVHd63UlWtm@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Christan König Acked-by: Felix Kuehling Cc: Alice Ryhl Cc: Boris Brezillon Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 3 --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 10 ---------- 2 files changed, 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index a0f8ba382b9e..e44f158a11f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2551,9 +2551,6 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) vm->task_info->task.pid = current->pid; get_task_comm(vm->task_info->task.comm, current); - if (current->group_leader->mm != current->mm) - return; - vm->task_info->tgid = current->tgid; get_task_comm(vm->task_info->process_name, current->group_leader); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index a085faac9fe1..f8ef18a3aa71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -833,12 +833,6 @@ struct kfd_process *kfd_create_process(struct task_struct *thread) if (!(thread->mm && mmget_not_zero(thread->mm))) return ERR_PTR(-EINVAL); - /* Only the pthreads threading model is supported. 
*/ - if (thread->group_leader->mm != thread->mm) { - mmput(thread->mm); - return ERR_PTR(-EINVAL); - } - /* If the process just called exec(3), it is possible that the * cleanup of the kfd_process (following the release of the mm * of the old process image) is still in the cleanup work queue. @@ -918,10 +912,6 @@ struct kfd_process *kfd_get_process(const struct task_struct *thread) if (!thread->mm) return ERR_PTR(-EINVAL); - /* Only the pthreads threading model is supported. */ - if (thread->group_leader->mm != thread->mm) - return ERR_PTR(-EINVAL); - process = find_process(thread, false); if (!process) return ERR_PTR(-EINVAL); From 05f8f36d0b836006a1f7a7d233789c8c80ea89df Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:44 +0100 Subject: [PATCH 093/107] drm/pan*: don't abuse current->group_leader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup and preparation to simplify the next changes. Use current->tgid instead of current->group_leader->pid. Link: https://lkml.kernel.org/r/aXY_0MrQBZWKbbmA@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Boris Brezillon Acked-by: Steven Price Cc: Alice Ryhl Cc: Christan König Cc: David S. 
Miller Cc: Eric Dumazet Cc: Felix Kuehling Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Signed-off-by: Andrew Morton --- drivers/gpu/drm/panfrost/panfrost_gem.c | 2 +- drivers/gpu/drm/panthor/panthor_gem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.c b/drivers/gpu/drm/panfrost/panfrost_gem.c index 8041b65c6609..1ff1f2c8b726 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem.c @@ -17,7 +17,7 @@ static void panfrost_gem_debugfs_bo_add(struct panfrost_device *pfdev, struct panfrost_gem_object *bo) { - bo->debugfs.creator.tgid = current->group_leader->pid; + bo->debugfs.creator.tgid = current->tgid; get_task_comm(bo->debugfs.creator.process_name, current->group_leader); mutex_lock(&pfdev->debugfs.gems_lock); diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c index fbde78db270a..29cc57efc4b9 100644 --- a/drivers/gpu/drm/panthor/panthor_gem.c +++ b/drivers/gpu/drm/panthor/panthor_gem.c @@ -27,7 +27,7 @@ static void panthor_gem_debugfs_bo_add(struct panthor_gem_object *bo) struct panthor_device *ptdev = container_of(bo->base.base.dev, struct panthor_device, base); - bo->debugfs.creator.tgid = current->group_leader->pid; + bo->debugfs.creator.tgid = current->tgid; get_task_comm(bo->debugfs.creator.process_name, current->group_leader); mutex_lock(&ptdev->gems.lock); From 6fd390e2bccfd82e6e2932acb21299938f8981bb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:07:52 +0100 Subject: [PATCH 094/107] RDMA/umem: don't abuse current->group_leader MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup and preparation to simplify the next changes. Use current->tgid instead of current->group_leader->pid. 
Link: https://lkml.kernel.org/r/aXY_2JIhCeGAYC0r@redhat.com Signed-off-by: Oleg Nesterov Acked-by: Leon Romanovsky Cc: Alice Ryhl Cc: Boris Brezillon Cc: Christan König Cc: David S. Miller Cc: Eric Dumazet Cc: Felix Kuehling Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- drivers/infiniband/core/umem_odp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 572a91a62a7b..32267258a19c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -149,7 +149,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device, umem->owning_mm = current->mm; umem_odp->page_shift = PAGE_SHIFT; - umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + umem_odp->tgid = get_task_pid(current, PIDTYPE_TGID); ib_init_umem_implicit_odp(umem_odp); return umem_odp; } @@ -258,7 +258,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device, umem_odp->page_shift = HPAGE_SHIFT; #endif - umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + umem_odp->tgid = get_task_pid(current, PIDTYPE_TGID); ret = ib_init_umem_odp(umem_odp, ops); if (ret) goto err_put_pid; From f3951e93d4fe9cc85128dc38915877ff6ef633db Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 25 Jan 2026 17:08:00 +0100 Subject: [PATCH 095/107] netclassid: use thread_group_leader(p) in update_classid_task() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup and preparation to simplify planned future changes. Link: https://lkml.kernel.org/r/aXY_4NSP094-Cf-2@redhat.com Signed-off-by: Oleg Nesterov Cc: Alice Ryhl Cc: Boris Brezillon Cc: Christan König Cc: David S. 
Miller Cc: Eric Dumazet Cc: Felix Kuehling Cc: Jakub Kicinski Cc: Leon Romanovsky Cc: Paolo Abeni Cc: Simon Horman Cc: Steven Price Signed-off-by: Andrew Morton --- net/core/netclassid_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index dff66d8fb325..db9a5354f9de 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -93,7 +93,7 @@ static void update_classid_task(struct task_struct *p, u32 classid) /* Only update the leader task, when many threads in this task, * so it can avoid the useless traversal. */ - if (p != p->group_leader) + if (!thread_group_leader(p)) return; do { From 2e171ab29f916455a49274a2042bac4a4b35570e Mon Sep 17 00:00:00 2001 From: Pnina Feder Date: Thu, 22 Jan 2026 12:24:57 +0200 Subject: [PATCH 096/107] panic: add panic_force_cpu= parameter to redirect panic to a specific CPU Some platforms require panic handling to execute on a specific CPU for crash dump to work reliably. This can be due to firmware limitations, interrupt routing constraints, or platform-specific requirements where only a single CPU is able to safely enter the crash kernel. Add the panic_force_cpu= kernel command-line parameter to redirect panic execution to a designated CPU. When the parameter is provided, the CPU that initially triggers panic forwards the panic context to the target CPU via IPI, which then proceeds with the normal panic and kexec flow. The IPI delivery is implemented as a weak function (panic_smp_redirect_cpu) so architectures with NMI support can override it for more reliable delivery. If the specified CPU is invalid, offline, or a panic is already in progress on another CPU, the redirection is skipped and panic continues on the current CPU. 
[pnina.feder@mobileye.com: fix unused variable warning] Link: https://lkml.kernel.org/r/20260126122618.2967950-1-pnina.feder@mobileye.com Link: https://lkml.kernel.org/r/20260122102457.1154599-1-pnina.feder@mobileye.com Signed-off-by: Pnina Feder Reviewed-by: Petr Mladek Cc: Baoquan He Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Mel Gorman Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 15 ++ include/linux/panic.h | 8 + include/linux/smp.h | 1 + kernel/panic.c | 164 +++++++++++++++++- 4 files changed, 186 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 73d846211144..97161861781c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4788,6 +4788,21 @@ Kernel parameters panic_on_warn=1 panic() instead of WARN(). Useful to cause kdump on a WARN(). + panic_force_cpu= + [KNL,SMP] Force panic handling to execute on a specific CPU. + Format: + Some platforms require panic handling to occur on a + specific CPU for the crash kernel to function correctly. + This can be due to firmware limitations, interrupt routing + constraints, or platform-specific requirements where only + a particular CPU can safely enter the crash kernel. + When set, panic() will redirect execution to the specified + CPU before proceeding with the normal panic and kexec flow. + If the target CPU is offline or unavailable, panic proceeds + on the current CPU. + This option should only be used for systems with the above + constraints as it might cause the panic operation to be less reliable. + panic_print= Bitmask for printing system info when panic happens. 
User can chose combination of the following bits: bit 0: print all tasks info diff --git a/include/linux/panic.h b/include/linux/panic.h index a00bc0937698..f1dd417e54b2 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -41,6 +41,14 @@ void abort(void); * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec(). */ extern atomic_t panic_cpu; + +/* + * panic_redirect_cpu is used when panic is redirected to a specific CPU via + * the panic_force_cpu= boot parameter. It holds the CPU number that originally + * triggered the panic before redirection. A value of PANIC_CPU_INVALID means + * no redirection has occurred. + */ +extern atomic_t panic_redirect_cpu; #define PANIC_CPU_INVALID -1 bool panic_try_start(void); diff --git a/include/linux/smp.h b/include/linux/smp.h index 91d0ecf3b8d3..1ebd88026119 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -62,6 +62,7 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd); void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); +int panic_smp_redirect_cpu(int target_cpu, void *msg); /* * Call a function on all processors diff --git a/kernel/panic.c b/kernel/panic.c index 0c20fcaae98a..c78600212b6c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -42,6 +42,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#define PANIC_MSG_BUFSZ 1024 #ifdef CONFIG_SMP /* @@ -74,6 +75,8 @@ EXPORT_SYMBOL_GPL(panic_timeout); unsigned long panic_print; +static int panic_force_cpu = -1; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -300,6 +303,150 @@ void __weak crash_smp_send_stop(void) } atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +atomic_t panic_redirect_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + +#if defined(CONFIG_SMP) && defined(CONFIG_CRASH_DUMP) +static char *panic_force_buf; + +static int __init panic_force_cpu_setup(char *str) +{ + int cpu; + + if 
(!str) + return -EINVAL; + + if (kstrtoint(str, 0, &cpu) || cpu < 0 || cpu >= nr_cpu_ids) { + pr_warn("panic_force_cpu: invalid value '%s'\n", str); + return -EINVAL; + } + + panic_force_cpu = cpu; + return 0; +} +early_param("panic_force_cpu", panic_force_cpu_setup); + +static int __init panic_force_cpu_late_init(void) +{ + if (panic_force_cpu < 0) + return 0; + + panic_force_buf = kmalloc(PANIC_MSG_BUFSZ, GFP_KERNEL); + + return 0; +} +late_initcall(panic_force_cpu_late_init); + +static void do_panic_on_target_cpu(void *info) +{ + panic("%s", (char *)info); +} + +/** + * panic_smp_redirect_cpu - Redirect panic to target CPU + * @target_cpu: CPU that should handle the panic + * @msg: formatted panic message + * + * Default implementation uses IPI. Architectures with NMI support + * can override this for more reliable delivery. + * + * Return: 0 on success, negative errno on failure + */ +int __weak panic_smp_redirect_cpu(int target_cpu, void *msg) +{ + static call_single_data_t panic_csd; + + panic_csd.func = do_panic_on_target_cpu; + panic_csd.info = msg; + + return smp_call_function_single_async(target_cpu, &panic_csd); +} + +/** + * panic_try_force_cpu - Redirect panic to a specific CPU for crash kernel + * @fmt: panic message format string + * @args: arguments for format string + * + * Some platforms require panic handling to occur on a specific CPU + * for the crash kernel to function correctly. This function redirects + * panic handling to the CPU specified via the panic_force_cpu= boot parameter. + * + * Returns false if panic should proceed on current CPU. + * Returns true if panic was redirected. 
+ */ +__printf(1, 0) +static bool panic_try_force_cpu(const char *fmt, va_list args) +{ + int this_cpu = raw_smp_processor_id(); + int old_cpu = PANIC_CPU_INVALID; + const char *msg; + + /* Feature not enabled via boot parameter */ + if (panic_force_cpu < 0) + return false; + + /* Already on target CPU - proceed normally */ + if (this_cpu == panic_force_cpu) + return false; + + /* Target CPU is offline, can't redirect */ + if (!cpu_online(panic_force_cpu)) { + pr_warn("panic: target CPU %d is offline, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* Another panic already in progress */ + if (panic_in_progress()) + return false; + + /* + * Only one CPU can do the redirect. Use atomic cmpxchg to ensure + * we don't race with another CPU also trying to redirect. + */ + if (!atomic_try_cmpxchg(&panic_redirect_cpu, &old_cpu, this_cpu)) + return false; + + /* + * Use dynamically allocated buffer if available, otherwise + * fall back to static message for early boot panics or allocation failure. 
+ */ + if (panic_force_buf) { + vsnprintf(panic_force_buf, PANIC_MSG_BUFSZ, fmt, args); + msg = panic_force_buf; + } else { + msg = "Redirected panic (buffer unavailable)"; + } + + console_verbose(); + bust_spinlocks(1); + + pr_emerg("panic: Redirecting from CPU %d to CPU %d for crash kernel.\n", + this_cpu, panic_force_cpu); + + /* Dump original CPU before redirecting */ + if (!test_taint(TAINT_DIE) && + oops_in_progress <= 1 && + IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { + dump_stack(); + } + + if (panic_smp_redirect_cpu(panic_force_cpu, (void *)msg) != 0) { + atomic_set(&panic_redirect_cpu, PANIC_CPU_INVALID); + pr_warn("panic: failed to redirect to CPU %d, continuing on CPU %d\n", + panic_force_cpu, this_cpu); + return false; + } + + /* IPI/NMI sent, this CPU should stop */ + return true; +} +#else +__printf(1, 0) +static inline bool panic_try_force_cpu(const char *fmt, va_list args) +{ + return false; +} +#endif /* CONFIG_SMP && CONFIG_CRASH_DUMP */ bool panic_try_start(void) { @@ -428,7 +575,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec) */ void vpanic(const char *fmt, va_list args) { - static char buf[1024]; + static char buf[PANIC_MSG_BUFSZ]; long i, i_next = 0, len; int state = 0; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; @@ -452,6 +599,15 @@ void vpanic(const char *fmt, va_list args) local_irq_disable(); preempt_disable_notrace(); + /* Redirect panic to target CPU if configured via panic_force_cpu=. */ + if (panic_try_force_cpu(fmt, args)) { + /* + * Mark ourselves offline so panic_other_cpus_shutdown() won't wait + * for us on architectures that check num_online_cpus(). + */ + set_cpu_online(smp_processor_id(), false); + panic_smp_self_stop(); + } /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. 
Some functions called from here want @@ -484,7 +640,11 @@ void vpanic(const char *fmt, va_list args) /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + if (atomic_read(&panic_redirect_cpu) != PANIC_CPU_INVALID && + panic_force_cpu == raw_smp_processor_id()) { + pr_emerg("panic: Redirected from CPU %d, skipping stack dump.\n", + atomic_read(&panic_redirect_cpu)); + } else if (test_taint(TAINT_DIE) || oops_in_progress > 1) { panic_this_cpu_backtrace_printed = true; } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); From 90079798f1d748e97c74e23736491543577b8aee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 2 Feb 2026 10:59:00 +0100 Subject: [PATCH 097/107] delayacct: fix uapi timespec64 definition The custom definition of 'struct timespec64' is incompatible with both the kernel's internal definition and the glibc type, at least on big-endian targets that have the tv_nsec field in a different place, and the definition clashes with any userspace that also defines a timespec64 structure. Running the header check with -Wpadding enabled produces this output that warns about the incorrect padding: usr/include/linux/taskstats.h:25:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] Remove the hack and instead use the regular __kernel_timespec type that is meant to be used in uapi definitions. 
Link: https://lkml.kernel.org/r/20260202095906.1344100-1-arnd@kernel.org Fixes: 29b63f6eff0e ("delayacct: add timestamp of delay max") Signed-off-by: Arnd Bergmann Cc: Fan Yu Cc: Jonathan Corbet Cc: xu xin Cc: Yang Yang Cc: Balbir Singh Cc: Jiang Kun Signed-off-by: Andrew Morton --- include/uapi/linux/taskstats.h | 27 +++++++++------------------ kernel/delayacct.c | 6 ++++-- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index 1b31e8e14d2f..3ae25f3ce067 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -18,16 +18,7 @@ #define _LINUX_TASKSTATS_H #include -#ifdef __KERNEL__ -#include -#else -#ifndef _LINUX_TIME64_H -struct timespec64 { - __s64 tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ -}; -#endif -#endif +#include /* Format for per-task data returned to userland when * - a task exits @@ -242,14 +233,14 @@ struct taskstats { __u64 irq_delay_min; /*v17: delay max timestamp record*/ - struct timespec64 cpu_delay_max_ts; - struct timespec64 blkio_delay_max_ts; - struct timespec64 swapin_delay_max_ts; - struct timespec64 freepages_delay_max_ts; - struct timespec64 thrashing_delay_max_ts; - struct timespec64 compact_delay_max_ts; - struct timespec64 wpcopy_delay_max_ts; - struct timespec64 irq_delay_max_ts; + struct __kernel_timespec cpu_delay_max_ts; + struct __kernel_timespec blkio_delay_max_ts; + struct __kernel_timespec swapin_delay_max_ts; + struct __kernel_timespec freepages_delay_max_ts; + struct __kernel_timespec thrashing_delay_max_ts; + struct __kernel_timespec compact_delay_max_ts; + struct __kernel_timespec wpcopy_delay_max_ts; + struct __kernel_timespec irq_delay_max_ts; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index d58ffc63bcba..2e55c493c98b 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -18,7 +18,8 @@ do { \ d->type##_delay_max = tsk->delays->type##_delay_max; \ d->type##_delay_min = 
tsk->delays->type##_delay_min; \ - d->type##_delay_max_ts = tsk->delays->type##_delay_max_ts; \ + d->type##_delay_max_ts.tv_sec = tsk->delays->type##_delay_max_ts.tv_sec; \ + d->type##_delay_max_ts.tv_nsec = tsk->delays->type##_delay_max_ts.tv_nsec; \ tmp = d->type##_delay_total + tsk->delays->type##_delay; \ d->type##_delay_total = (tmp < d->type##_delay_total) ? 0 : tmp; \ d->type##_count += tsk->delays->type##_count; \ @@ -175,7 +176,8 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->cpu_delay_max = tsk->sched_info.max_run_delay; d->cpu_delay_min = tsk->sched_info.min_run_delay; - d->cpu_delay_max_ts = tsk->sched_info.max_run_delay_ts; + d->cpu_delay_max_ts.tv_sec = tsk->sched_info.max_run_delay_ts.tv_sec; + d->cpu_delay_max_ts.tv_nsec = tsk->sched_info.max_run_delay_ts.tv_nsec; tmp = (s64)d->cpu_delay_total + t2; d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; tmp = (s64)d->cpu_run_virtual_total + t3; From 989b3c5af63ecb1cbaf1598fe3f79865538bc1ea Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 18 Dec 2025 10:57:48 -0500 Subject: [PATCH 098/107] list: add primitives for private list manipulations Patch series "list private v2 & luo flb", v9. This series introduces two connected infrastructure improvements: a new API for handling private linked lists, and the "File-Lifecycle-Bound" (FLB) mechanism for the Live Update Orchestrator. 1. Private List Primitives (patches 1-3) Recently, Linux introduced the ability to mark structure members as __private and access them via ACCESS_PRIVATE(). This enforces better encapsulation by ensuring internal details are only accessible by the owning subsystem. However, struct list_head is frequently used as an internal linkage mechanism within these private sections. The standard macros in do not support ACCESS_PRIVATE() natively. Consequently, subsystems using private lists are forced to implement ad-hoc workarounds or local iterator macros. 
This series adds , providing a set of primitives identical to those in but designed for private list heads. It also includes a KUnit test suite to verify that the macros correctly handle pointer offsets and qualifiers. 2. This series adds FLB (patches 4-5) support to Live Update that also internally uses private lists. FLB allows global kernel state (such as IOMMU domains or HugeTLB state) to be preserved once, shared across multiple file descriptors, and restored when needed. This is necessary for subsystems where multiple preserved file descriptors depend on a single, shared underlying resource. Preserving this state for each individual file would be redundant and incorrect. FLB uses reference counting tied to the lifecycle of preserved files. The state is preserved when the first file depending on it is preserved, and restored or cleaned up only when the last file is handled. This patch (of 5): Linux recently added an ability to add private members to structs (i.e. __private) and access them via ACCESS_PRIVATE(). This ensures that those members are only accessible by the subsystem which owns the struct type, and not to the object owner. However, struct list_head often needs to be placed into the private section to be manipulated privately by the subsystem. Add macros to support private list manipulations in . 
[akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20251218155752.3045808-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251218155752.3045808-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: David Gow Cc: David Matlack Cc: David Rientjes Cc: Jonathan Corbet Cc: Kees Cook Cc: Mike Rapoport Cc: Petr Mladek Cc: Pratyush Yadav Cc: Samiullah Khawaja Cc: Tamir Duberstein Signed-off-by: Andrew Morton --- Documentation/core-api/list.rst | 9 ++ include/linux/list_private.h | 256 ++++++++++++++++++++++++++++++++ 2 files changed, 265 insertions(+) create mode 100644 include/linux/list_private.h diff --git a/Documentation/core-api/list.rst b/Documentation/core-api/list.rst index 86873ce9adbf..241464ca0549 100644 --- a/Documentation/core-api/list.rst +++ b/Documentation/core-api/list.rst @@ -774,3 +774,12 @@ Full List API .. kernel-doc:: include/linux/list.h :internal: + +Private List API +================ + +.. kernel-doc:: include/linux/list_private.h + :doc: Private List Primitives + +.. kernel-doc:: include/linux/list_private.h + :internal: diff --git a/include/linux/list_private.h b/include/linux/list_private.h new file mode 100644 index 000000000000..19b01d16beda --- /dev/null +++ b/include/linux/list_private.h @@ -0,0 +1,256 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ +#ifndef _LINUX_LIST_PRIVATE_H +#define _LINUX_LIST_PRIVATE_H + +/** + * DOC: Private List Primitives + * + * Provides a set of list primitives identical in function to those in + * ````, but designed for cases where the embedded + * ``&struct list_head`` is a private member. + */ + +#include +#include + +#define __list_private_offset(type, member) \ + ((size_t)(&ACCESS_PRIVATE(((type *)0), member))) + +/** + * list_private_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. 
+ * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_entry(ptr, type, member) ({ \ + const struct list_head *__mptr = (ptr); \ + (type *)((char *)__mptr - __list_private_offset(type, member)); \ +}) + +/** + * list_private_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_first_entry(ptr, type, member) \ + list_private_entry((ptr)->next, type, member) + +/** + * list_private_last_entry - get the last element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the identifier passed to ACCESS_PRIVATE. + */ +#define list_private_last_entry(ptr, type, member) \ + list_private_entry((ptr)->prev, type, member) + +/** + * list_private_next_entry - get the next element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. + */ +#define list_private_next_entry(pos, member) \ + list_private_entry(ACCESS_PRIVATE(pos, member).next, typeof(*(pos)), member) + +/** + * list_private_next_entry_circular - get the next element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the last element (return the first element). + * Note, that list is expected to be not empty. + */ +#define list_private_next_entry_circular(pos, head, member) \ + (list_is_last(&ACCESS_PRIVATE(pos, member), head) ? \ + list_private_first_entry(head, typeof(*(pos)), member) : \ + list_private_next_entry(pos, member)) + +/** + * list_private_prev_entry - get the prev element in list + * @pos: the type * to cursor + * @member: the name of the list_head within the struct. 
+ */ +#define list_private_prev_entry(pos, member) \ + list_private_entry(ACCESS_PRIVATE(pos, member).prev, typeof(*(pos)), member) + +/** + * list_private_prev_entry_circular - get the prev element in list + * @pos: the type * to cursor. + * @head: the list head to take the element from. + * @member: the name of the list_head within the struct. + * + * Wraparound if pos is the first element (return the last element). + * Note, that list is expected to be not empty. + */ +#define list_private_prev_entry_circular(pos, head, member) \ + (list_is_first(&ACCESS_PRIVATE(pos, member), head) ? \ + list_private_last_entry(head, typeof(*(pos)), member) : \ + list_private_prev_entry(pos, member)) + +/** + * list_private_entry_is_head - test if the entry points to the head of the list + * @pos: the type * to cursor + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_entry_is_head(pos, head, member) \ + list_is_head(&ACCESS_PRIVATE(pos, member), (head)) + +/** + * list_private_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_for_each_entry(pos, head, member) \ + for (pos = list_private_first_entry(head, typeof(*pos), member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. 
+ */ +#define list_private_for_each_entry_reverse(pos, head, member) \ + for (pos = list_private_last_entry(head, typeof(*pos), member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define list_private_for_each_entry_continue(pos, head, member) \ + for (pos = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_continue_reverse - iterate backwards from the given point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Start to iterate over list of given type backwards, continuing after + * the current position. + */ +#define list_private_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_private_prev_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_from - iterate over list of given type from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing from current position. 
+ */ +#define list_private_for_each_entry_from(pos, head, member) \ + for (; !list_private_entry_is_head(pos, head, member); \ + pos = list_private_next_entry(pos, member)) + +/** + * list_private_for_each_entry_from_reverse - iterate backwards over list of given type + * from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, continuing from current position. + */ +#define list_private_for_each_entry_from_reverse(pos, head, member) \ + for (; !list_private_entry_is_head(pos, head, member); \ + pos = list_private_prev_entry(pos, member)) + +/** + * list_private_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + */ +#define list_private_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_private_first_entry(head, typeof(*pos), member), \ + n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_continue - continue list iteration safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. 
+ */ +#define list_private_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_private_next_entry(pos, member), \ + n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_from - iterate over list from current point safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_private_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_private_next_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_next_entry(n, member)) + +/** + * list_private_for_each_entry_safe_reverse - iterate backwards over list safe against removal + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_head within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. + */ +#define list_private_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_private_last_entry(head, typeof(*pos), member), \ + n = list_private_prev_entry(pos, member); \ + !list_private_entry_is_head(pos, head, member); \ + pos = n, n = list_private_prev_entry(n, member)) + +/** + * list_private_safe_reset_next - reset a stale list_for_each_entry_safe loop + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_head within the struct. + * + * list_safe_reset_next is not safe to use in general if the list may be + * modified concurrently (eg. 
the lock is dropped in the loop body). An + * exception to this is if the cursor element (pos) is pinned in the list, + * and list_safe_reset_next is called after re-taking the lock and before + * completing the current iteration of the loop body. + */ +#define list_private_safe_reset_next(pos, n, member) \ + n = list_private_next_entry(pos, member) + +#endif /* _LINUX_LIST_PRIVATE_H */ From 66bd8501ceb4782b10dfa009085d9b3f4efecad6 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 18 Dec 2025 10:57:49 -0500 Subject: [PATCH 099/107] list: add kunit test for private list primitives Add a KUnit test suite for the new private list primitives. The test defines a struct with a __private list_head and exercises every macro defined in . This ensures that the macros correctly handle the ACCESS_PRIVATE() abstraction and compile without warnings when acting on private members, verifying that qualifiers are stripped and offsets are calculated correctly. Link: https://lkml.kernel.org/r/20251218155752.3045808-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: David Gow Cc: Alexander Graf Cc: David Matlack Cc: David Rientjes Cc: Jonathan Corbet Cc: Kees Cook Cc: Mike Rapoport Cc: Petr Mladek Cc: Pratyush Yadav Cc: Samiullah Khawaja Cc: Tamir Duberstein Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 14 +++++++ lib/tests/Makefile | 1 + lib/tests/list-private-test.c | 76 +++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 lib/tests/list-private-test.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7eed3b197ca9..234b73f9baf7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2786,6 +2786,20 @@ config LIST_KUNIT_TEST If unsure, say N. 
+config LIST_PRIVATE_KUNIT_TEST + tristate "KUnit Test for Kernel Private Linked-list structures" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This builds the KUnit test for the private linked-list primitives + defined in include/linux/list_private.h. + + These primitives allow manipulation of list_head members that are + marked as private and require special accessors (ACCESS_PRIVATE) + to strip qualifiers or handle encapsulation. + + If unsure, say N. + config HASHTABLE_KUNIT_TEST tristate "KUnit Test for Kernel Hashtable structures" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/tests/Makefile b/lib/tests/Makefile index ab3e74d0da9e..f740b0a26750 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_TEST_IOV_ITER) += kunit_iov_iter.o obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o +obj-$(CONFIG_LIST_PRIVATE_KUNIT_TEST) += list-private-test.o obj-$(CONFIG_KFIFO_KUNIT_TEST) += kfifo_kunit.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o diff --git a/lib/tests/list-private-test.c b/lib/tests/list-private-test.c new file mode 100644 index 000000000000..3bd62939ae67 --- /dev/null +++ b/lib/tests/list-private-test.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit compilation/smoke test for Private list primitives. + * + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ +#include +#include + +/* + * This forces compiler to warn if you access it directly, because list + * primitives expect (struct list_head *), not (volatile struct list_head *). + */ +#undef __private +#define __private volatile + +/* Redefine ACCESS_PRIVATE for this test. 
*/ +#undef ACCESS_PRIVATE +#define ACCESS_PRIVATE(p, member) \ + (*((struct list_head *)((unsigned long)&((p)->member)))) + +struct list_test_struct { + int data; + struct list_head __private list; +}; + +static void list_private_compile_test(struct kunit *test) +{ + struct list_test_struct entry; + struct list_test_struct *pos, *n; + LIST_HEAD(head); + + INIT_LIST_HEAD(&ACCESS_PRIVATE(&entry, list)); + list_add(&ACCESS_PRIVATE(&entry, list), &head); + pos = &entry; + + pos = list_private_entry(&ACCESS_PRIVATE(&entry, list), struct list_test_struct, list); + pos = list_private_first_entry(&head, struct list_test_struct, list); + pos = list_private_last_entry(&head, struct list_test_struct, list); + pos = list_private_next_entry(pos, list); + pos = list_private_prev_entry(pos, list); + pos = list_private_next_entry_circular(pos, &head, list); + pos = list_private_prev_entry_circular(pos, &head, list); + + if (list_private_entry_is_head(pos, &head, list)) + return; + + list_private_for_each_entry(pos, &head, list) { } + list_private_for_each_entry_reverse(pos, &head, list) { } + list_private_for_each_entry_continue(pos, &head, list) { } + list_private_for_each_entry_continue_reverse(pos, &head, list) { } + list_private_for_each_entry_from(pos, &head, list) { } + list_private_for_each_entry_from_reverse(pos, &head, list) { } + + list_private_for_each_entry_safe(pos, n, &head, list) + list_private_safe_reset_next(pos, n, list); + list_private_for_each_entry_safe_continue(pos, n, &head, list) { } + list_private_for_each_entry_safe_from(pos, n, &head, list) { } + list_private_for_each_entry_safe_reverse(pos, n, &head, list) { } +} + +static struct kunit_case list_private_test_cases[] = { + KUNIT_CASE(list_private_compile_test), + {}, +}; + +static struct kunit_suite list_private_test_module = { + .name = "list-private-kunit-test", + .test_cases = list_private_test_cases, +}; + +kunit_test_suite(list_private_test_module); + +MODULE_DESCRIPTION("KUnit compilation test for 
private list primitives"); +MODULE_LICENSE("GPL"); From 6845645eef81da64b916743e3f8d696ec1fb0a13 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 18 Dec 2025 10:57:50 -0500 Subject: [PATCH 100/107] liveupdate: luo_file: Use private list Switch LUO to use the private list iterators. Link: https://lkml.kernel.org/r/20251218155752.3045808-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: David Gow Cc: David Matlack Cc: David Rientjes Cc: Jonathan Corbet Cc: Kees Cook Cc: Mike Rapoport Cc: Petr Mladek Cc: Pratyush Yadav Cc: Samiullah Khawaja Cc: Tamir Duberstein Signed-off-by: Andrew Morton --- kernel/liveupdate/luo_file.c | 7 ++++--- kernel/liveupdate/luo_internal.h | 7 ------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index a32a777f6df8..1a8a1bb73a58 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include #include @@ -273,7 +274,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) goto err_fput; err = -ENOENT; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (fh->ops->can_preserve(fh, file)) { err = 0; break; @@ -760,7 +761,7 @@ int luo_file_deserialize(struct luo_file_set *file_set, bool handler_found = false; struct luo_file *luo_file; - luo_list_for_each_private(fh, &luo_file_handler_list, list) { + list_private_for_each_entry(fh, &luo_file_handler_list, list) { if (!strcmp(fh->compatible, file_ser[i].compatible)) { handler_found = true; break; @@ -835,7 +836,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) return -EBUSY; /* Check for duplicate compatible strings */ - luo_list_for_each_private(fh_iter, &luo_file_handler_list, list) { + list_private_for_each_entry(fh_iter, &luo_file_handler_list, list) { if 
(!strcmp(fh_iter->compatible, fh->compatible)) { pr_err("File handler registration failed: Compatible string '%s' already registered.\n", fh->compatible); diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index c8973b543d1d..3f1e0c94637e 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -40,13 +40,6 @@ static inline int luo_ucmd_respond(struct luo_ucmd *ucmd, */ #define luo_restore_fail(__fmt, ...) panic(__fmt, ##__VA_ARGS__) -/* Mimics list_for_each_entry() but for private list head entries */ -#define luo_list_for_each_private(pos, head, member) \ - for (struct list_head *__iter = (head)->next; \ - __iter != (head) && \ - ({ pos = container_of(__iter, typeof(*(pos)), member); 1; }); \ - __iter = __iter->next) - /** * struct luo_file_set - A set of files that belong to the same sessions. * @files_list: An ordered list of files associated with this session, it is From cab056f2aae7250af50e503b81a80dfc567a1acd Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 18 Dec 2025 10:57:51 -0500 Subject: [PATCH 101/107] liveupdate: luo_flb: introduce File-Lifecycle-Bound global state Introduce a mechanism for managing global kernel state whose lifecycle is tied to the preservation of one or more files. This is necessary for subsystems where multiple preserved file descriptors depend on a single, shared underlying resource. An example is HugeTLB, where multiple file descriptors such as memfd and guest_memfd may rely on the state of a single HugeTLB subsystem. Preserving this state for each individual file would be redundant and incorrect. The state should be preserved only once when the first file is preserved, and restored/finished only once the last file is handled. This patch introduces File-Lifecycle-Bound (FLB) objects to solve this problem. 
An FLB is a global, reference-counted object with a defined set of operations: - A file handler (struct liveupdate_file_handler) declares a dependency on one or more FLBs via a new registration function, liveupdate_register_flb(). - When the first file depending on an FLB is preserved, the FLB's .preserve() callback is invoked to save the shared global state. The reference count is then incremented for each subsequent file. - Conversely, when the last file is unpreserved (before reboot) or finished (after reboot), the FLB's .unpreserve() or .finish() callback is invoked to clean up the global resource. The implementation includes: - A new set of ABI definitions (luo_flb_ser, luo_flb_head_ser) and a corresponding FDT node (luo-flb) to serialize the state of all active FLBs and pass them via Kexec Handover. - Core logic in luo_flb.c to manage FLB registration, reference counting, and the invocation of lifecycle callbacks. - An API (liveupdate_flb_get/_incoming/_outgoing) for other kernel subsystems to safely access the live object managed by an FLB, both before and after the live update. This framework provides the necessary infrastructure for more complex subsystems like IOMMU, VFIO, and KVM to integrate with the Live Update Orchestrator. 
Link: https://lkml.kernel.org/r/20251218155752.3045808-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: David Gow Cc: David Matlack Cc: David Rientjes Cc: Jonathan Corbet Cc: Kees Cook Cc: Mike Rapoport Cc: Petr Mladek Cc: Pratyush Yadav Cc: Samiullah Khawaja Cc: Tamir Duberstein Signed-off-by: Andrew Morton --- Documentation/core-api/liveupdate.rst | 11 + include/linux/kho/abi/luo.h | 76 +++ include/linux/liveupdate.h | 147 ++++++ kernel/liveupdate/Makefile | 1 + kernel/liveupdate/luo_core.c | 7 +- kernel/liveupdate/luo_file.c | 24 +- kernel/liveupdate/luo_flb.c | 654 ++++++++++++++++++++++++++ kernel/liveupdate/luo_internal.h | 7 + 8 files changed, 924 insertions(+), 3 deletions(-) create mode 100644 kernel/liveupdate/luo_flb.c diff --git a/Documentation/core-api/liveupdate.rst b/Documentation/core-api/liveupdate.rst index e2aba13494cf..5a292d0f3706 100644 --- a/Documentation/core-api/liveupdate.rst +++ b/Documentation/core-api/liveupdate.rst @@ -18,6 +18,11 @@ LUO Preserving File Descriptors .. kernel-doc:: kernel/liveupdate/luo_file.c :doc: LUO File Descriptors +LUO File Lifecycle Bound Global Data +==================================== +.. kernel-doc:: kernel/liveupdate/luo_flb.c + :doc: LUO File Lifecycle Bound Global Data + Live Update Orchestrator ABI ============================ .. kernel-doc:: include/linux/kho/abi/luo.h @@ -40,6 +45,9 @@ Public API .. kernel-doc:: kernel/liveupdate/luo_core.c :export: +.. kernel-doc:: kernel/liveupdate/luo_flb.c + :export: + .. kernel-doc:: kernel/liveupdate/luo_file.c :export: @@ -48,6 +56,9 @@ Internal API .. kernel-doc:: kernel/liveupdate/luo_core.c :internal: +.. kernel-doc:: kernel/liveupdate/luo_flb.c + :internal: + .. 
kernel-doc:: kernel/liveupdate/luo_session.c :internal: diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index beb86847b544..a44010aafb5e 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -37,6 +37,11 @@ * compatible = "luo-session-v1"; * luo-session-header = ; * }; + * + * luo-flb { + * compatible = "luo-flb-v1"; + * luo-flb-header = ; + * }; * }; * * Main LUO Node (/): @@ -56,6 +61,17 @@ * is the header for a contiguous block of memory containing an array of * `struct luo_session_ser`, one for each preserved session. * + * File-Lifecycle-Bound Node (luo-flb): + * This node describes all preserved global objects whose lifecycle is bound + * to that of the preserved files (e.g., shared IOMMU state). + * + * - compatible: "luo-flb-v1" + * Identifies the FLB ABI version. + * - luo-flb-header: u64 + * The physical address of a `struct luo_flb_header_ser`. This structure is + * the header for a contiguous block of memory containing an array of + * `struct luo_flb_ser`, one for each preserved global object. + * * Serialization Structures: * The FDT properties point to memory regions containing arrays of simple, * `__packed` structures. These structures contain the actual preserved state. @@ -74,6 +90,16 @@ * Metadata for a single preserved file. Contains the `compatible` string to * find the correct handler in the new kernel, a user-provided `token` for * identification, and an opaque `data` handle for the handler to use. + * + * - struct luo_flb_header_ser: + * Header for the FLB array. Contains the total page count of the + * preserved memory block and the number of `struct luo_flb_ser` entries + * that follow. + * + * - struct luo_flb_ser: + * Metadata for a single preserved global object. Contains its `name` + * (compatible string), an opaque `data` handle, and the `count` + * number of files depending on it. 
 */

#ifndef _LINUX_KHO_ABI_LUO_H
@@ -163,4 +189,54 @@ struct luo_session_ser {
 	struct luo_file_set_ser file_set_ser;
 } __packed;
 
+/* The max size is set so it can be reliably used during serialization */
+#define LIVEUPDATE_FLB_COMPAT_LENGTH	48
+
+#define LUO_FDT_FLB_NODE_NAME		"luo-flb"
+#define LUO_FDT_FLB_COMPATIBLE		"luo-flb-v1"
+#define LUO_FDT_FLB_HEADER		"luo-flb-header"
+
+/**
+ * struct luo_flb_header_ser - Header for the serialized FLB data block.
+ * @pgcnt: The total number of pages occupied by the entire preserved memory
+ *         region, including this header and the subsequent array of
+ *         &struct luo_flb_ser entries.
+ * @count: The number of &struct luo_flb_ser entries that follow this header
+ *         in the memory block.
+ *
+ * This structure is located at the physical address specified by the
+ * `LUO_FDT_FLB_HEADER` FDT property. It provides the new kernel with the
+ * necessary information to find and iterate over the array of preserved
+ * File-Lifecycle-Bound objects and to manage the underlying memory.
+ *
+ * If this structure is modified, LUO_FDT_FLB_COMPATIBLE must be updated.
+ */
+struct luo_flb_header_ser {
+	u64 pgcnt;
+	u64 count;
+} __packed;
+
+/**
+ * struct luo_flb_ser - Represents the serialized state of a single FLB object.
+ * @name: The unique compatibility string of the FLB object, used to find the
+ *        corresponding &struct liveupdate_flb handler in the new kernel.
+ * @data: The opaque u64 handle returned by the FLB's .preserve() operation
+ *        in the old kernel. This handle encapsulates the entire state needed
+ *        for restoration.
+ * @count: The reference count at the time of serialization; i.e., the number
+ *         of preserved files that depended on this FLB. This is used by the
+ *         new kernel to correctly manage the FLB's lifecycle.
+ *
+ * An array of these structures is created in a preserved memory region and
+ * passed to the new kernel. Each entry allows the LUO core to restore one
+ * global, shared object.
+ * + * If this structure is modified, LUO_FDT_FLB_COMPATIBLE must be updated. + */ +struct luo_flb_ser { + char name[LIVEUPDATE_FLB_COMPAT_LENGTH]; + u64 data; + u64 count; +} __packed; + #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index a7f6ee5b6771..fe82a6c3005f 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -11,10 +11,13 @@ #include #include #include +#include #include #include struct liveupdate_file_handler; +struct liveupdate_flb; +struct liveupdate_session; struct file; /** @@ -99,6 +102,118 @@ struct liveupdate_file_handler { * registered file handlers. */ struct list_head __private list; + /* A list of FLB dependencies. */ + struct list_head __private flb_list; +}; + +/** + * struct liveupdate_flb_op_args - Arguments for FLB operation callbacks. + * @flb: The global FLB instance for which this call is performed. + * @data: For .preserve(): [OUT] The callback sets this field. + * For .unpreserve(): [IN] The handle from .preserve(). + * For .retrieve(): [IN] The handle from .preserve(). + * @obj: For .preserve(): [OUT] Sets this to the live object. + * For .retrieve(): [OUT] Sets this to the live object. + * For .finish(): [IN] The live object from .retrieve(). + * + * This structure bundles all parameters for the FLB operation callbacks. + */ +struct liveupdate_flb_op_args { + struct liveupdate_flb *flb; + u64 data; + void *obj; +}; + +/** + * struct liveupdate_flb_ops - Callbacks for global File-Lifecycle-Bound data. + * @preserve: Called when the first file using this FLB is preserved. + * The callback must save its state and return a single, + * self-contained u64 handle by setting the 'argp->data' + * field and 'argp->obj'. + * @unpreserve: Called when the last file using this FLB is unpreserved + * (aborted before reboot). Receives the handle via + * 'argp->data' and live object via 'argp->obj'. 
+ * @retrieve: Called on-demand in the new kernel, the first time a + * component requests access to the shared object. It receives + * the preserved handle via 'argp->data' and must reconstruct + * the live object, returning it by setting the 'argp->obj' + * field. + * @finish: Called in the new kernel when the last file using this FLB + * is finished. Receives the live object via 'argp->obj' for + * cleanup. + * @owner: Module reference + * + * Operations that manage global shared data with file bound lifecycle, + * triggered by the first file that uses it and concluded by the last file that + * uses it, across all sessions. + */ +struct liveupdate_flb_ops { + int (*preserve)(struct liveupdate_flb_op_args *argp); + void (*unpreserve)(struct liveupdate_flb_op_args *argp); + int (*retrieve)(struct liveupdate_flb_op_args *argp); + void (*finish)(struct liveupdate_flb_op_args *argp); + struct module *owner; +}; + +/* + * struct luo_flb_private_state - Private FLB state structures. + * @count: The number of preserved files currently depending on this FLB. + * This is used to trigger the preserve/unpreserve/finish ops on the + * first/last file. + * @data: The opaque u64 handle returned by .preserve() or passed to + * .retrieve(). + * @obj: The live kernel object returned by .preserve() or .retrieve(). + * @lock: A mutex that protects all fields within this structure, providing + * the synchronization service for the FLB's ops. + * @finished: True once the FLB's finish() callback has run. + * @retrieved: True once the FLB's retrieve() callback has run. + */ +struct luo_flb_private_state { + long count; + u64 data; + void *obj; + struct mutex lock; + bool finished; + bool retrieved; +}; + +/* + * struct luo_flb_private - Keep separate incoming and outgoing states. + * @list: A global list of registered FLBs. + * @outgoing: The runtime state for the pre-reboot + * (preserve/unpreserve) lifecycle. 
+ * @incoming: The runtime state for the post-reboot (retrieve/finish) + * lifecycle. + * @users: With how many File-Handlers this FLB is registered. + * @initialized: true when private fields have been initialized. + */ +struct luo_flb_private { + struct list_head list; + struct luo_flb_private_state outgoing; + struct luo_flb_private_state incoming; + int users; + bool initialized; +}; + +/** + * struct liveupdate_flb - A global definition for a shared data object. + * @ops: Callback functions + * @compatible: The compatibility string (e.g., "iommu-core-v1" + * that uniquely identifies the FLB type this handler + * supports. This is matched against the compatible string + * associated with individual &struct liveupdate_flb + * instances. + * + * This struct is the "template" that a driver registers to define a shared, + * file-lifecycle-bound object. The actual runtime state (the live object, + * refcount, etc.) is managed privately by the LUO core. + */ +struct liveupdate_flb { + const struct liveupdate_flb_ops *ops; + const char compatible[LIVEUPDATE_FLB_COMPAT_LENGTH]; + + /* private: */ + struct luo_flb_private __private private; }; #ifdef CONFIG_LIVEUPDATE @@ -112,6 +227,14 @@ int liveupdate_reboot(void); int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); +int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); +int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb); + +int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); +int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); + #else /* CONFIG_LIVEUPDATE */ static inline bool liveupdate_enabled(void) @@ -134,5 +257,29 @@ static inline int liveupdate_unregister_file_handler(struct liveupdate_file_hand return -EOPNOTSUPP; } +static inline int liveupdate_register_flb(struct 
liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, + void **objp) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, + void **objp) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/Makefile b/kernel/liveupdate/Makefile index 7cad2eece32d..d2f779cbe279 100644 --- a/kernel/liveupdate/Makefile +++ b/kernel/liveupdate/Makefile @@ -3,6 +3,7 @@ luo-y := \ luo_core.o \ luo_file.o \ + luo_flb.o \ luo_session.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o diff --git a/kernel/liveupdate/luo_core.c b/kernel/liveupdate/luo_core.c index a26c093eb8eb..dda7bb57d421 100644 --- a/kernel/liveupdate/luo_core.c +++ b/kernel/liveupdate/luo_core.c @@ -127,7 +127,9 @@ static int __init luo_early_startup(void) if (err) return err; - return 0; + err = luo_flb_setup_incoming(luo_global.fdt_in); + + return err; } static int __init liveupdate_early_init(void) @@ -164,6 +166,7 @@ static int __init luo_fdt_setup(void) err |= fdt_property_string(fdt_out, "compatible", LUO_FDT_COMPATIBLE); err |= fdt_property(fdt_out, LUO_FDT_LIVEUPDATE_NUM, &ln, sizeof(ln)); err |= luo_session_setup_outgoing(fdt_out); + err |= luo_flb_setup_outgoing(fdt_out); err |= fdt_end_node(fdt_out); err |= fdt_finish(fdt_out); if (err) @@ -225,6 +228,8 @@ int liveupdate_reboot(void) if (err) return err; + luo_flb_serialize(); + err = kho_finalize(); if (err) { pr_err("kho_finalize failed %d\n", err); diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 1a8a1bb73a58..cade273c50c9 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -285,10 +285,14 @@ int luo_preserve_file(struct luo_file_set 
*file_set, u64 token, int fd)
 	if (err)
 		goto err_free_files_mem;
 
+	err = luo_flb_file_preserve(fh);
+	if (err)
+		goto err_free_files_mem;
+
 	luo_file = kzalloc(sizeof(*luo_file), GFP_KERNEL);
 	if (!luo_file) {
 		err = -ENOMEM;
-		goto err_free_files_mem;
+		goto err_flb_unpreserve;
 	}
 
 	luo_file->file = file;
@@ -312,6 +316,8 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd)
 
 err_kfree:
 	kfree(luo_file);
+err_flb_unpreserve:
+	luo_flb_file_unpreserve(fh);
 err_free_files_mem:
 	luo_free_files_mem(file_set);
 err_fput:
@@ -353,6 +359,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set)
 		args.serialized_data = luo_file->serialized_data;
 		args.private_data = luo_file->private_data;
 		luo_file->fh->ops->unpreserve(&args);
+		luo_flb_file_unpreserve(luo_file->fh);
 
 		list_del(&luo_file->list);
 		file_set->count--;
@@ -630,6 +637,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set,
 	args.retrieved = luo_file->retrieved;
 
 	luo_file->fh->ops->finish(&args);
+	luo_flb_file_finish(luo_file->fh);
 }
 
 /**
@@ -851,6 +859,7 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh)
 		goto err_resume;
 	}
 
+	INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, flb_list));
 	INIT_LIST_HEAD(&ACCESS_PRIVATE(fh, list));
 	list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list);
 	luo_session_resume();
@@ -871,23 +880,34 @@ err_resume:
  *
  * It ensures safe removal by checking that:
  * No live update session is currently in progress.
+ * No FLB registered with this file handler.
  *
  * If the unregistration fails, the internal test state is reverted.
  *
  * Return: 0 Success. -EOPNOTSUPP when live update is not enabled. -EBUSY A live
- * update is in progress, can't quiesce live update.
+ * update is in progress, can't quiesce live update, or an FLB is registered
+ * with this file handler.
*/ int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) { + int err = -EBUSY; + if (!liveupdate_enabled()) return -EOPNOTSUPP; if (!luo_session_quiesce()) return -EBUSY; + if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) + goto err_resume; + list_del(&ACCESS_PRIVATE(fh, list)); module_put(fh->ops->owner); luo_session_resume(); return 0; + +err_resume: + luo_session_resume(); + return err; } diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c new file mode 100644 index 000000000000..4c437de5c0b0 --- /dev/null +++ b/kernel/liveupdate/luo_flb.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +/** + * DOC: LUO File Lifecycle Bound Global Data + * + * File-Lifecycle-Bound (FLB) objects provide a mechanism for managing global + * state that is shared across multiple live-updatable files. The lifecycle of + * this shared state is tied to the preservation of the files that depend on it. + * + * An FLB represents a global resource, such as the IOMMU core state, that is + * required by multiple file descriptors (e.g., all VFIO fds). + * + * The preservation of the FLB's state is triggered when the *first* file + * depending on it is preserved. The cleanup of this state (unpreserve or + * finish) is triggered when the *last* file depending on it is unpreserved or + * finished. + * + * Handler Dependency: A file handler declares its dependency on one or more + * FLBs by registering them via liveupdate_register_flb(). + * + * Callback Model: Each FLB is defined by a set of operations + * (&struct liveupdate_flb_ops) that LUO invokes at key points: + * + * - .preserve(): Called for the first file. Saves global state. + * - .unpreserve(): Called for the last file (if aborted pre-reboot). + * - .retrieve(): Called on-demand in the new kernel to restore the state. + * - .finish(): Called for the last file in the new kernel for cleanup. 
+ * + * This reference-counted approach ensures that shared state is saved exactly + * once and restored exactly once, regardless of how many files depend on it, + * and that its lifecycle is correctly managed across the kexec transition. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "luo_internal.h" + +#define LUO_FLB_PGCNT 1ul +#define LUO_FLB_MAX (((LUO_FLB_PGCNT << PAGE_SHIFT) - \ + sizeof(struct luo_flb_header_ser)) / sizeof(struct luo_flb_ser)) + +struct luo_flb_header { + struct luo_flb_header_ser *header_ser; + struct luo_flb_ser *ser; + bool active; +}; + +struct luo_flb_global { + struct luo_flb_header incoming; + struct luo_flb_header outgoing; + struct list_head list; + long count; +}; + +static struct luo_flb_global luo_flb_global = { + .list = LIST_HEAD_INIT(luo_flb_global.list), +}; + +/* + * struct luo_flb_link - Links an FLB definition to a file handler's internal + * list of dependencies. + * @flb: A pointer to the registered &struct liveupdate_flb definition. + * @list: The list_head for linking. + */ +struct luo_flb_link { + struct liveupdate_flb *flb; + struct list_head list; +}; + +/* luo_flb_get_private - Access private field, and if needed initialize it. 
*/ +static struct luo_flb_private *luo_flb_get_private(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = &ACCESS_PRIVATE(flb, private); + + if (!private->initialized) { + mutex_init(&private->incoming.lock); + mutex_init(&private->outgoing.lock); + INIT_LIST_HEAD(&private->list); + private->users = 0; + private->initialized = true; + } + + return private; +} + +static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + int err; + + args.flb = flb; + err = flb->ops->preserve(&args); + if (err) + return err; + private->outgoing.data = args.data; + private->outgoing.obj = args.obj; + } + private->outgoing.count++; + } + + return 0; +} + +static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + scoped_guard(mutex, &private->outgoing.lock) { + private->outgoing.count--; + if (!private->outgoing.count) { + struct liveupdate_flb_op_args args = {0}; + + args.flb = flb; + args.data = private->outgoing.data; + args.obj = private->outgoing.obj; + + if (flb->ops->unpreserve) + flb->ops->unpreserve(&args); + + private->outgoing.data = 0; + private->outgoing.obj = NULL; + } + } +} + +static int luo_flb_retrieve_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct luo_flb_header *fh = &luo_flb_global.incoming; + struct liveupdate_flb_op_args args = {0}; + bool found = false; + int err; + + guard(mutex)(&private->incoming.lock); + + if (private->incoming.finished) + return -ENODATA; + + if (private->incoming.retrieved) + return 0; + + if (!fh->active) + return -ENODATA; + + for (int i = 0; i < fh->header_ser->count; i++) { + if (!strcmp(fh->ser[i].name, flb->compatible)) { + private->incoming.data = fh->ser[i].data; + 
private->incoming.count = fh->ser[i].count; + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + args.flb = flb; + args.data = private->incoming.data; + + err = flb->ops->retrieve(&args); + if (err) + return err; + + private->incoming.obj = args.obj; + private->incoming.retrieved = true; + + return 0; +} + +static void luo_flb_file_finish_one(struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + u64 count; + + scoped_guard(mutex, &private->incoming.lock) + count = --private->incoming.count; + + if (!count) { + struct liveupdate_flb_op_args args = {0}; + + if (!private->incoming.retrieved) { + int err = luo_flb_retrieve_one(flb); + + if (WARN_ON(err)) + return; + } + + scoped_guard(mutex, &private->incoming.lock) { + args.flb = flb; + args.obj = private->incoming.obj; + flb->ops->finish(&args); + + private->incoming.data = 0; + private->incoming.obj = NULL; + private->incoming.finished = true; + } + } +} + +/** + * luo_flb_file_preserve - Notifies FLBs that a file is about to be preserved. + * @fh: The file handler for the preserved file. + * + * This function iterates through all FLBs associated with the given file + * handler. It increments the reference count for each FLB. If the count becomes + * 1, it triggers the FLB's .preserve() callback to save the global state. + * + * This operation is atomic. If any FLB's .preserve() op fails, it will roll + * back by calling .unpreserve() on any FLBs that were successfully preserved + * during this call. + * + * Context: Called from luo_preserve_file() + * Return: 0 on success, or a negative errno on failure. 
+ */ +int luo_flb_file_preserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = 0; + + list_for_each_entry(iter, flb_list, list) { + err = luo_flb_file_preserve_one(iter->flb); + if (err) + goto exit_err; + } + + return 0; + +exit_err: + list_for_each_entry_continue_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); + + return err; +} + +/** + * luo_flb_file_unpreserve - Notifies FLBs that a dependent file was unpreserved. + * @fh: The file handler for the unpreserved file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the reference count + * for each FLB. If the count becomes 0, it triggers the FLB's .unpreserve() + * callback to clean up the global state. + * + * Context: Called when a preserved file is being cleaned up before reboot + * (e.g., from luo_file_unpreserve_files()). + */ +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_unpreserve_one(iter->flb); +} + +/** + * luo_flb_file_finish - Notifies FLBs that a dependent file has been finished. + * @fh: The file handler for the finished file. + * + * This function iterates through all FLBs associated with the given file + * handler, in reverse order of registration. It decrements the incoming + * reference count for each FLB. If the count becomes 0, it triggers the FLB's + * .finish() callback for final cleanup in the new kernel. + * + * Context: Called from luo_file_finish() for each file being finished. 
+ */ +void luo_flb_file_finish(struct liveupdate_file_handler *fh) +{ + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + + list_for_each_entry_reverse(iter, flb_list, list) + luo_flb_file_finish_one(iter->flb); +} + +/** + * liveupdate_register_flb - Associate an FLB with a file handler and register it globally. + * @fh: The file handler that will now depend on the FLB. + * @flb: The File-Lifecycle-Bound object to associate. + * + * Establishes a dependency, informing the LUO core that whenever a file of + * type @fh is preserved, the state of @flb must also be managed. + * + * On the first registration of a given @flb object, it is added to a global + * registry. This function checks for duplicate registrations, both for a + * specific handler and globally, and ensures the total number of unique + * FLBs does not exceed the system limit. + * + * Context: Typically called from a subsystem's module init function after + * both the handler and the FLB have been defined and initialized. + * Return: 0 on success. Returns a negative errno on failure: + * -EINVAL if arguments are NULL or not initialized. + * -ENOMEM on memory allocation failure. + * -EEXIST if this FLB is already registered with this handler. + * -ENOSPC if the maximum number of global FLBs has been reached. + * -EOPNOTSUPP if live update is disabled or not configured. 
+ */ +int liveupdate_register_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *link __free(kfree) = NULL; + struct liveupdate_flb *gflb; + struct luo_flb_link *iter; + int err; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + if (WARN_ON(!flb->ops->preserve || !flb->ops->unpreserve || + !flb->ops->retrieve || !flb->ops->finish)) { + return -EINVAL; + } + + /* + * File handler must already be registered, as it initializes the + * flb_list + */ + if (WARN_ON(list_empty(&ACCESS_PRIVATE(fh, list)))) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for registration: no other thread can + * be in this section, and no sessions can be creating/using FDs. + */ + if (!luo_session_quiesce()) + return -EBUSY; + + /* Check that this FLB is not already linked to this file handler */ + err = -EEXIST; + list_for_each_entry(iter, flb_list, list) { + if (iter->flb == flb) + goto err_resume; + } + + /* + * If this FLB is not linked to global list it's the first time the FLB + * is registered + */ + if (!private->users) { + if (WARN_ON(!list_empty(&private->list))) { + err = -EINVAL; + goto err_resume; + } + + if (luo_flb_global.count == LUO_FLB_MAX) { + err = -ENOSPC; + goto err_resume; + } + + /* Check that compatible string is unique in global list */ + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + if (!strcmp(gflb->compatible, flb->compatible)) + goto err_resume; + } + + if (!try_module_get(flb->ops->owner)) { + err = -EAGAIN; + goto err_resume; + } + + list_add_tail(&private->list, &luo_flb_global.list); + luo_flb_global.count++; + } + + /* Finally, link the FLB to the file handler */ + private->users++; + link->flb = flb; + 
list_add_tail(&no_free_ptr(link)->list, flb_list); + luo_session_resume(); + + return 0; + +err_resume: + luo_session_resume(); + return err; +} + +/** + * liveupdate_unregister_flb - Remove an FLB dependency from a file handler. + * @fh: The file handler that is currently depending on the FLB. + * @flb: The File-Lifecycle-Bound object to remove. + * + * Removes the association between the specified file handler and the FLB + * previously established by liveupdate_register_flb(). + * + * This function manages the global lifecycle of the FLB. It decrements the + * FLB's usage count. If this was the last file handler referencing this FLB, + * the FLB is removed from the global registry and the reference to its + * owner module (acquired during registration) is released. + * + * Context: This function ensures the session is quiesced (no active FDs + * being created) during the update. It is typically called from a + * subsystem's module exit function. + * Return: 0 on success. + * -EOPNOTSUPP if live update is disabled. + * -EBUSY if the live update session is active and cannot be quiesced. + * -ENOENT if the FLB was not found in the file handler's list. + */ +int liveupdate_unregister_flb(struct liveupdate_file_handler *fh, + struct liveupdate_flb *flb) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + struct list_head *flb_list = &ACCESS_PRIVATE(fh, flb_list); + struct luo_flb_link *iter; + int err = -ENOENT; + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + /* + * Ensure the system is quiescent (no active sessions). + * This acts as a global lock for unregistration. 
+ */
+	if (!luo_session_quiesce())
+		return -EBUSY;
+
+	/* Find and remove the link from the file handler's list */
+	list_for_each_entry(iter, flb_list, list) {
+		if (iter->flb == flb) {
+			list_del(&iter->list);
+			kfree(iter);
+			err = 0;
+			break;
+		}
+	}
+
+	if (err)
+		goto err_resume;
+
+	private->users--;
+	/*
+	 * If this is the last file-handler with which we are registered, remove
+	 * from the global list, and release module reference.
+	 */
+	if (!private->users) {
+		list_del_init(&private->list);
+		luo_flb_global.count--;
+		module_put(flb->ops->owner);
+	}
+
+	luo_session_resume();
+
+	return 0;
+
+err_resume:
+	luo_session_resume();
+	return err;
+}
+
+/**
+ * liveupdate_flb_get_incoming - Retrieve the incoming FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ *
+ * Returns a pointer to its shared live object for the incoming (post-reboot)
+ * path.
+ *
+ * If this is the first time the object is requested in the new kernel, this
+ * function will trigger the FLB's .retrieve() callback to reconstruct the
+ * object from its preserved state. Subsequent calls will return the same
+ * cached object.
+ *
+ * Return: 0 on success, or a negative errno on failure. -ENODATA means no
+ * incoming FLB data, -ENOENT means specific flb not found in the incoming
+ * data, and -EOPNOTSUPP when live update is disabled or not configured.
+ */
+int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp)
+{
+	struct luo_flb_private *private = luo_flb_get_private(flb);
+
+	if (!liveupdate_enabled())
+		return -EOPNOTSUPP;
+
+	if (!private->incoming.obj) {
+		int err = luo_flb_retrieve_one(flb);
+
+		if (err)
+			return err;
+	}
+
+	guard(mutex)(&private->incoming.lock);
+	*objp = private->incoming.obj;
+
+	return 0;
+}
+
+/**
+ * liveupdate_flb_get_outgoing - Retrieve the outgoing FLB object.
+ * @flb: The FLB definition.
+ * @objp: Output parameter; will be populated with the live shared object.
+ * + * Returns a pointer to its shared live object for the outgoing (pre-reboot) + * path. + * + * This function assumes the object has already been created by the FLB's + * .preserve() callback, which is triggered when the first dependent file + * is preserved. + * + * Return: 0 on success, or a negative errno on failure. + */ +int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp) +{ + struct luo_flb_private *private = luo_flb_get_private(flb); + + if (!liveupdate_enabled()) + return -EOPNOTSUPP; + + guard(mutex)(&private->outgoing.lock); + *objp = private->outgoing.obj; + + return 0; +} + +int __init luo_flb_setup_outgoing(void *fdt_out) +{ + struct luo_flb_header_ser *header_ser; + u64 header_ser_pa; + int err; + + header_ser = kho_alloc_preserve(LUO_FLB_PGCNT << PAGE_SHIFT); + if (IS_ERR(header_ser)) + return PTR_ERR(header_ser); + + header_ser_pa = virt_to_phys(header_ser); + + err = fdt_begin_node(fdt_out, LUO_FDT_FLB_NODE_NAME); + err |= fdt_property_string(fdt_out, "compatible", + LUO_FDT_FLB_COMPATIBLE); + err |= fdt_property(fdt_out, LUO_FDT_FLB_HEADER, &header_ser_pa, + sizeof(header_ser_pa)); + err |= fdt_end_node(fdt_out); + + if (err) + goto err_unpreserve; + + header_ser->pgcnt = LUO_FLB_PGCNT; + luo_flb_global.outgoing.header_ser = header_ser; + luo_flb_global.outgoing.ser = (void *)(header_ser + 1); + luo_flb_global.outgoing.active = true; + + return 0; + +err_unpreserve: + kho_unpreserve_free(header_ser); + + return err; +} + +int __init luo_flb_setup_incoming(void *fdt_in) +{ + struct luo_flb_header_ser *header_ser; + int err, header_size, offset; + const void *ptr; + u64 header_ser_pa; + + offset = fdt_subnode_offset(fdt_in, 0, LUO_FDT_FLB_NODE_NAME); + if (offset < 0) { + pr_err("Unable to get FLB node [%s]\n", LUO_FDT_FLB_NODE_NAME); + + return -ENOENT; + } + + err = fdt_node_check_compatible(fdt_in, offset, + LUO_FDT_FLB_COMPATIBLE); + if (err) { + pr_err("FLB node is incompatible with '%s' [%d]\n", + 
LUO_FDT_FLB_COMPATIBLE, err); + + return -EINVAL; + } + + header_size = 0; + ptr = fdt_getprop(fdt_in, offset, LUO_FDT_FLB_HEADER, &header_size); + if (!ptr || header_size != sizeof(u64)) { + pr_err("Unable to get FLB header property '%s' [%d]\n", + LUO_FDT_FLB_HEADER, header_size); + + return -EINVAL; + } + + header_ser_pa = get_unaligned((u64 *)ptr); + header_ser = phys_to_virt(header_ser_pa); + + luo_flb_global.incoming.header_ser = header_ser; + luo_flb_global.incoming.ser = (void *)(header_ser + 1); + luo_flb_global.incoming.active = true; + + return 0; +} + +/** + * luo_flb_serialize - Serializes all active FLB objects for KHO. + * + * This function is called from the reboot path. It iterates through all + * registered File-Lifecycle-Bound (FLB) objects. For each FLB that has been + * preserved (i.e., its reference count is greater than zero), it writes its + * metadata into the memory region designated for Kexec Handover. + * + * The serialized data includes the FLB's compatibility string, its opaque + * data handle, and the final reference count. This allows the new kernel to + * find the appropriate handler and reconstruct the FLB's state. + * + * Context: Called from liveupdate_reboot() just before kho_finalize(). 
+ */ +void luo_flb_serialize(void) +{ + struct luo_flb_header *fh = &luo_flb_global.outgoing; + struct liveupdate_flb *gflb; + int i = 0; + + list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { + struct luo_flb_private *private = luo_flb_get_private(gflb); + + if (private->outgoing.count > 0) { + strscpy(fh->ser[i].name, gflb->compatible, + sizeof(fh->ser[i].name)); + fh->ser[i].data = private->outgoing.data; + fh->ser[i].count = private->outgoing.count; + i++; + } + } + + fh->header_ser->count = i; +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 3f1e0c94637e..99db13d99530 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -100,4 +100,11 @@ int luo_file_deserialize(struct luo_file_set *file_set, void luo_file_set_init(struct luo_file_set *file_set); void luo_file_set_destroy(struct luo_file_set *file_set); +int luo_flb_file_preserve(struct liveupdate_file_handler *fh); +void luo_flb_file_unpreserve(struct liveupdate_file_handler *fh); +void luo_flb_file_finish(struct liveupdate_file_handler *fh); +int __init luo_flb_setup_outgoing(void *fdt); +int __init luo_flb_setup_incoming(void *fdt); +void luo_flb_serialize(void); + #endif /* _LINUX_LUO_INTERNAL_H */ From f653ff7af96951faa69c68665d44bed80702544f Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Thu, 18 Dec 2025 10:57:52 -0500 Subject: [PATCH 102/107] tests/liveupdate: add in-kernel liveupdate test Introduce an in-kernel test module to validate the core logic of the Live Update Orchestrator's File-Lifecycle-Bound feature. This provides a low-level, controlled environment to test FLB registration and callback invocation without requiring userspace interaction or actual kexec reboots. The test is enabled by the CONFIG_LIVEUPDATE_TEST Kconfig option. 
Link: https://lkml.kernel.org/r/20251218155752.3045808-6-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: David Gow Cc: David Matlack Cc: David Rientjes Cc: Jonathan Corbet Cc: Kees Cook Cc: Mike Rapoport Cc: Petr Mladek Cc: Pratyush Yadav Cc: Samiullah Khawaja Cc: Tamir Duberstein Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/kho/abi/luo.h | 5 + kernel/liveupdate/luo_file.c | 8 +- kernel/liveupdate/luo_internal.h | 8 ++ lib/Kconfig.debug | 23 +++++ lib/tests/Makefile | 1 + lib/tests/liveupdate.c | 158 +++++++++++++++++++++++++++++++ 7 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 lib/tests/liveupdate.c diff --git a/MAINTAINERS b/MAINTAINERS index 92b377cd131b..a2a4cfd19fad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14652,6 +14652,7 @@ F: include/linux/liveupdate.h F: include/linux/liveupdate/ F: include/uapi/linux/liveupdate.h F: kernel/liveupdate/ +F: lib/tests/liveupdate.c F: mm/memfd_luo.c F: tools/testing/selftests/liveupdate/ diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index a44010aafb5e..46750a0ddf88 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -239,4 +239,9 @@ struct luo_flb_ser { u64 count; } __packed; +/* Kernel Live Update Test ABI */ +#ifdef CONFIG_LIVEUPDATE_TEST +#define LIVEUPDATE_TEST_FLB_COMPATIBLE(i) "liveupdate-test-flb-v" #i +#endif + #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index cade273c50c9..35d2a8b1a0df 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -864,6 +864,8 @@ int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) list_add_tail(&ACCESS_PRIVATE(fh, list), &luo_file_handler_list); luo_session_resume(); + liveupdate_test_register(fh); + return 0; err_resume: @@ -895,8 +897,10 @@ int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) if (!liveupdate_enabled()) return 
-EOPNOTSUPP; + liveupdate_test_unregister(fh); + if (!luo_session_quiesce()) - return -EBUSY; + goto err_register; if (!list_empty(&ACCESS_PRIVATE(fh, flb_list))) goto err_resume; @@ -909,5 +913,7 @@ int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) err_resume: luo_session_resume(); +err_register: + liveupdate_test_register(fh); return err; } diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 99db13d99530..8083d8739b09 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -107,4 +107,12 @@ int __init luo_flb_setup_outgoing(void *fdt); int __init luo_flb_setup_incoming(void *fdt); void luo_flb_serialize(void); +#ifdef CONFIG_LIVEUPDATE_TEST +void liveupdate_test_register(struct liveupdate_file_handler *fh); +void liveupdate_test_unregister(struct liveupdate_file_handler *fh); +#else +static inline void liveupdate_test_register(struct liveupdate_file_handler *fh) { } +static inline void liveupdate_test_unregister(struct liveupdate_file_handler *fh) { } +#endif + #endif /* _LINUX_LUO_INTERNAL_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 234b73f9baf7..ef201f1cc498 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2825,6 +2825,29 @@ config LINEAR_RANGES_TEST If unsure, say N. +config LIVEUPDATE_TEST + bool "Live Update Kernel Test" + default n + depends on LIVEUPDATE + help + Enable a built-in kernel test module for the Live Update + Orchestrator. + + This module validates the File-Lifecycle-Bound subsystem by + registering a set of mock FLB objects with any real file handlers + that support live update (such as the memfd handler). + + When live update operations are performed, this test module will + output messages to the kernel log (dmesg), confirming that its + registration and various callback functions (preserve, retrieve, + finish, etc.) are being invoked correctly. 
+ + This is a debugging and regression testing tool for developers + working on the Live Update subsystem. It should not be enabled in + production kernels. + + If unsure, say N + config CMDLINE_KUNIT_TEST tristate "KUnit test for cmdline API" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/tests/Makefile b/lib/tests/Makefile index f740b0a26750..436b7b7a65f0 100644 --- a/lib/tests/Makefile +++ b/lib/tests/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_LIST_PRIVATE_KUNIT_TEST) += list-private-test.o obj-$(CONFIG_KFIFO_KUNIT_TEST) += kfifo_kunit.o obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o +obj-$(CONFIG_LIVEUPDATE_TEST) += liveupdate.o CFLAGS_longest_symbol_kunit.o += $(call cc-disable-warning, missing-prototypes) obj-$(CONFIG_LONGEST_SYM_KUNIT_TEST) += longest_symbol_kunit.o diff --git a/lib/tests/liveupdate.c b/lib/tests/liveupdate.c new file mode 100644 index 000000000000..496d6ef91a30 --- /dev/null +++ b/lib/tests/liveupdate.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2025, Google LLC. 
+ * Pasha Tatashin + */ + +#define pr_fmt(fmt) KBUILD_MODNAME " test: " fmt + +#include +#include +#include +#include +#include +#include "../../kernel/liveupdate/luo_internal.h" + +static const struct liveupdate_flb_ops test_flb_ops; +#define DEFINE_TEST_FLB(i) { \ + .ops = &test_flb_ops, \ + .compatible = LIVEUPDATE_TEST_FLB_COMPATIBLE(i), \ +} + +/* Number of Test FLBs to register with every file handler */ +#define TEST_NFLBS 3 +static struct liveupdate_flb test_flbs[TEST_NFLBS] = { + DEFINE_TEST_FLB(0), + DEFINE_TEST_FLB(1), + DEFINE_TEST_FLB(2), +}; + +#define TEST_FLB_MAGIC_BASE 0xFEEDF00DCAFEBEE0ULL + +static int test_flb_preserve(struct liveupdate_flb_op_args *argp) +{ + ptrdiff_t index = argp->flb - test_flbs; + + pr_info("%s: preserve was triggered\n", argp->flb->compatible); + argp->data = TEST_FLB_MAGIC_BASE + index; + + return 0; +} + +static void test_flb_unpreserve(struct liveupdate_flb_op_args *argp) +{ + pr_info("%s: unpreserve was triggered\n", argp->flb->compatible); +} + +static int test_flb_retrieve(struct liveupdate_flb_op_args *argp) +{ + ptrdiff_t index = argp->flb - test_flbs; + u64 expected_data = TEST_FLB_MAGIC_BASE + index; + + if (argp->data == expected_data) { + pr_info("%s: found flb data from the previous boot\n", + argp->flb->compatible); + argp->obj = (void *)argp->data; + } else { + pr_err("%s: ERROR - incorrect data handle: %llx, expected %llx\n", + argp->flb->compatible, argp->data, expected_data); + return -EINVAL; + } + + return 0; +} + +static void test_flb_finish(struct liveupdate_flb_op_args *argp) +{ + ptrdiff_t index = argp->flb - test_flbs; + void *expected_obj = (void *)(TEST_FLB_MAGIC_BASE + index); + + if (argp->obj == expected_obj) { + pr_info("%s: finish was triggered\n", argp->flb->compatible); + } else { + pr_err("%s: ERROR - finish called with invalid object\n", + argp->flb->compatible); + } +} + +static const struct liveupdate_flb_ops test_flb_ops = { + .preserve = test_flb_preserve, + .unpreserve = 
test_flb_unpreserve, + .retrieve = test_flb_retrieve, + .finish = test_flb_finish, + .owner = THIS_MODULE, +}; + +static void liveupdate_test_init(void) +{ + static DEFINE_MUTEX(init_lock); + static bool initialized; + int i; + + guard(mutex)(&init_lock); + + if (initialized) + return; + + for (i = 0; i < TEST_NFLBS; i++) { + struct liveupdate_flb *flb = &test_flbs[i]; + void *obj; + int err; + + err = liveupdate_flb_get_incoming(flb, &obj); + if (err && err != -ENODATA && err != -ENOENT) { + pr_err("liveupdate_flb_get_incoming for %s failed: %pe\n", + flb->compatible, ERR_PTR(err)); + } + } + initialized = true; +} + +void liveupdate_test_register(struct liveupdate_file_handler *fh) +{ + int err, i; + + liveupdate_test_init(); + + for (i = 0; i < TEST_NFLBS; i++) { + struct liveupdate_flb *flb = &test_flbs[i]; + + err = liveupdate_register_flb(fh, flb); + if (err) { + pr_err("Failed to register %s %pe\n", + flb->compatible, ERR_PTR(err)); + } + } + + err = liveupdate_register_flb(fh, &test_flbs[0]); + if (!err || err != -EEXIST) { + pr_err("Failed: %s should be already registered, but got err: %pe\n", + test_flbs[0].compatible, ERR_PTR(err)); + } + + pr_info("Registered %d FLBs with file handler: [%s]\n", + TEST_NFLBS, fh->compatible); +} + +void liveupdate_test_unregister(struct liveupdate_file_handler *fh) +{ + int err, i; + + for (i = 0; i < TEST_NFLBS; i++) { + struct liveupdate_flb *flb = &test_flbs[i]; + + err = liveupdate_unregister_flb(fh, flb); + if (err) { + pr_err("Failed to unregister %s %pe\n", + flb->compatible, ERR_PTR(err)); + } + } + + pr_info("Unregistered %d FLBs from file handler: [%s]\n", + TEST_NFLBS, fh->compatible); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pasha Tatashin "); +MODULE_DESCRIPTION("In-kernel test for LUO mechanism"); From 0758293d5dc88a45b910f46a0c7558bf6b09e01d Mon Sep 17 00:00:00 2001 From: "Tycho Andersen (AMD)" Date: Fri, 23 Jan 2026 12:05:06 -0700 Subject: [PATCH 103/107] kho: fix doc for kho_restore_pages() This 
function returns NULL if kho_restore_page() returns NULL, which happens in a couple of corner cases. It never returns an error code. Link: https://lkml.kernel.org/r/20260123190506.1058669-1-tycho@kernel.org Signed-off-by: Tycho Andersen (AMD) Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/liveupdate/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/liveupdate/kexec_handover.c b/kernel/liveupdate/kexec_handover.c index 8a2b2a7e50fc..fb3a7b67676e 100644 --- a/kernel/liveupdate/kexec_handover.c +++ b/kernel/liveupdate/kexec_handover.c @@ -299,7 +299,7 @@ EXPORT_SYMBOL_GPL(kho_restore_folio); * Restore a contiguous list of order 0 pages that was preserved with * kho_preserve_pages(). * - * Return: 0 on success, error code on failure + * Return: the first page on success, NULL on failure. */ struct page *kho_restore_pages(phys_addr_t phys, unsigned long nr_pages) { From 9dc052234da736f7749f19ab6936342ec7dbe3ac Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 16 Jan 2026 09:17:30 +0000 Subject: [PATCH 104/107] kcsan, compiler_types: avoid duplicate type issues in BPF Type Format Enabling KCSAN is causing a large number of duplicate types in BTF for core kernel structs like task_struct [1]. This is due to the definition in include/linux/compiler_types.h `#ifdef __SANITIZE_THREAD__ ... `#define __data_racy volatile .. `#else ... `#define __data_racy ... `#endif Because some objects in the kernel are compiled without KCSAN flags (KCSAN_SANITIZE) we sometimes get the empty __data_racy annotation for objects; as a result we get multiple conflicting representations of the associated structs in DWARF, and these lead to multiple instances of core kernel types in BTF since they cannot be deduplicated due to the additional modifier in some instances. 
Moving the __data_racy definition under CONFIG_KCSAN avoids this problem, since the volatile modifier will be present for both KCSAN and KCSAN_SANITIZE objects in a CONFIG_KCSAN=y kernel. Link: https://lkml.kernel.org/r/20260116091730.324322-1-alan.maguire@oracle.com Fixes: 31f605a308e6 ("kcsan, compiler_types: Introduce __data_racy type qualifier") Signed-off-by: Alan Maguire Reported-by: Nilay Shroff Tested-by: Nilay Shroff Suggested-by: Marco Elver Reviewed-by: Marco Elver Acked-by: Yonghong Song Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Bart van Assche Cc: Daniel Borkman Cc: Eduard Zingerman Cc: Hao Luo Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Jason A. Donenfeld Cc: Jiri Olsa Cc: John Fastabend Cc: Kees Cook Cc: KP Singh Cc: Martin KaFai Lau Cc: Miguel Ojeda Cc: Naman Jain Cc: Nathan Chancellor Cc: "Paul E . McKenney" Cc: Peter Zijlstra Cc: Stanislav Fomichev Cc: Uros Bizjak Cc: Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d3318a3c2577..86111a189a87 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -303,6 +303,22 @@ struct ftrace_likely_data { # define __no_kasan_or_inline __always_inline #endif +#ifdef CONFIG_KCSAN +/* + * Type qualifier to mark variables where all data-racy accesses should be + * ignored by KCSAN. Note, the implementation simply marks these variables as + * volatile, since KCSAN will treat such accesses as "marked". + * + * Defined here because defining __data_racy as volatile for KCSAN objects only + * causes problems in BPF Type Format (BTF) generation since struct members + * of core kernel data structs will be volatile in some objects and not in + * others. Instead define it globally for KCSAN kernels. 
+ */ +# define __data_racy volatile +#else +# define __data_racy +#endif + #ifdef __SANITIZE_THREAD__ /* * Clang still emits instrumentation for __tsan_func_{entry,exit}() and builtin @@ -314,16 +330,9 @@ struct ftrace_likely_data { * disable all instrumentation. See Kconfig.kcsan where this is mandatory. */ # define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation -/* - * Type qualifier to mark variables where all data-racy accesses should be - * ignored by KCSAN. Note, the implementation simply marks these variables as - * volatile, since KCSAN will treat such accesses as "marked". - */ -# define __data_racy volatile # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused #else # define __no_kcsan -# define __data_racy #endif #ifdef __SANITIZE_MEMORY__ From cafe4074a7221dca2fa954dd1ab0cf99b6318e23 Mon Sep 17 00:00:00 2001 From: Shengming Hu Date: Mon, 19 Jan 2026 21:59:05 +0800 Subject: [PATCH 105/107] watchdog/softlockup: fix sample ring index wrap in need_counting_irqs() cpustat_tail indexes cpustat_util[], which is a NUM_SAMPLE_PERIODS-sized ring buffer. need_counting_irqs() currently wraps the index using NUM_HARDIRQ_REPORT, which only happens to match NUM_SAMPLE_PERIODS. Use NUM_SAMPLE_PERIODS for the wrap to keep the ring math correct even if the NUM_HARDIRQ_REPORT or NUM_SAMPLE_PERIODS changes. 
Link: https://lkml.kernel.org/r/tencent_7068189CB6D6689EB353F3D17BF5A5311A07@qq.com Fixes: e9a9292e2368 ("watchdog/softlockup: Report the most frequent interrupts") Signed-off-by: Shengming Hu Reviewed-by: Petr Mladek Cc: Ingo Molnar Cc: Mark Brown Cc: Thomas Gleixner Cc: Zhang Run Cc: Signed-off-by: Andrew Morton --- kernel/watchdog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b4d5fbdb933a..7d675781bc91 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -550,7 +550,7 @@ static bool need_counting_irqs(void) u8 util; int tail = __this_cpu_read(cpustat_tail); - tail = (tail + NUM_HARDIRQ_REPORT - 1) % NUM_HARDIRQ_REPORT; + tail = (tail + NUM_SAMPLE_PERIODS - 1) % NUM_SAMPLE_PERIODS; util = __this_cpu_read(cpustat_util[tail][STATS_HARDIRQ]); return util > HARDIRQ_PERCENT_THRESH; } From 76149d53502cf17ef3ae454ff384551236fba867 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Wed, 28 Jan 2026 16:30:07 +0800 Subject: [PATCH 106/107] procfs: fix missing RCU protection when reading real_parent in do_task_stat() When reading /proc/[pid]/stat, do_task_stat() accesses task->real_parent without proper RCU protection, which leads to: cpu 0 cpu 1 ----- ----- do_task_stat var = task->real_parent release_task call_rcu(delayed_put_task_struct) task_tgid_nr_ns(var) rcu_read_lock <--- Too late to protect task->real_parent! task_pid_ptr <--- UAF! rcu_read_unlock This patch uses task_ppid_nr_ns() instead of task_tgid_nr_ns() to add proper RCU protection for accessing task->real_parent. 
Link: https://lkml.kernel.org/r/20260128083007.3173016-1-alexjlzheng@tencent.com Fixes: 06fffb1267c9 ("do_task_stat: don't take rcu_read_lock()") Signed-off-by: Jinliang Zheng Acked-by: Oleg Nesterov Cc: David Hildenbrand Cc: Ingo Molnar Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: ruippan Cc: Usama Arif Signed-off-by: Andrew Morton --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 39e9246f6e4a..f447e734612a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -529,7 +529,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } sid = task_session_nr_ns(task, ns); - ppid = task_tgid_nr_ns(task->real_parent, ns); + ppid = task_ppid_nr_ns(task, ns); pgid = task_pgrp_nr_ns(task, ns); unlock_task_sighand(task, &flags); From 0dddf20b4fd4afd59767acc144ad4da60259f21f Mon Sep 17 00:00:00 2001 From: Qiliang Yuan Date: Wed, 28 Jan 2026 21:26:14 -0500 Subject: [PATCH 107/107] watchdog/hardlockup: simplify perf event probe and remove per-cpu dependency Simplify the hardlockup detector's probe path and remove its implicit dependency on pinned per-cpu execution. Refactor hardlockup_detector_event_create() to be stateless. Return the created perf_event pointer to the caller instead of directly modifying the per-cpu 'watchdog_ev' variable. This allows the probe path to safely manage a temporary event without the risk of leaving stale pointers should task migration occur. 
Link: https://lkml.kernel.org/r/20260129022629.2201331-1-realwujing@gmail.com Signed-off-by: Shouxin Sun Signed-off-by: Junnan Zhang Signed-off-by: Qiliang Yuan Signed-off-by: Qiliang Yuan Reviewed-by: Douglas Anderson Cc: Jinchao Wang Cc: Ingo Molnar Cc: Li Huafei Cc: Song Liu Cc: Thorsten Blum Cc: Wang Jinchao Cc: Yicong Yang Signed-off-by: Andrew Morton --- kernel/watchdog_perf.c | 50 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index d3ca70e3c256..cf05775a96d3 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -118,18 +118,11 @@ static void watchdog_overflow_callback(struct perf_event *event, watchdog_hardlockup_check(smp_processor_id(), regs); } -static int hardlockup_detector_event_create(void) +static struct perf_event *hardlockup_detector_event_create(unsigned int cpu) { - unsigned int cpu; struct perf_event_attr *wd_attr; struct perf_event *evt; - /* - * Preemption is not disabled because memory will be allocated. - * Ensure CPU-locality by calling this in per-CPU kthread. 
- */ - WARN_ON(!is_percpu_thread()); - cpu = raw_smp_processor_id(); wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); @@ -143,14 +136,7 @@ static int hardlockup_detector_event_create(void) watchdog_overflow_callback, NULL); } - if (IS_ERR(evt)) { - pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, - PTR_ERR(evt)); - return PTR_ERR(evt); - } - WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); - this_cpu_write(watchdog_ev, evt); - return 0; + return evt; } /** @@ -159,17 +145,26 @@ static int hardlockup_detector_event_create(void) */ void watchdog_hardlockup_enable(unsigned int cpu) { + struct perf_event *evt; + WARN_ON_ONCE(cpu != smp_processor_id()); - if (hardlockup_detector_event_create()) + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { + pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); return; + } /* use original value for check */ if (!atomic_fetch_inc(&watchdog_cpus)) pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); + WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak"); + this_cpu_write(watchdog_ev, evt); + watchdog_init_timestamp(); - perf_event_enable(this_cpu_read(watchdog_ev)); + perf_event_enable(evt); } /** @@ -263,19 +258,30 @@ bool __weak __init arch_perf_nmi_is_available(void) */ int __init watchdog_hardlockup_probe(void) { + struct perf_event *evt; + unsigned int cpu; int ret; if (!arch_perf_nmi_is_available()) return -ENODEV; - ret = hardlockup_detector_event_create(); + if (!hw_nmi_get_sample_period(watchdog_thresh)) + return -EINVAL; - if (ret) { + /* + * Test hardware PMU availability by creating a temporary perf event. + * The event is released immediately. 
+ */ + cpu = raw_smp_processor_id(); + evt = hardlockup_detector_event_create(cpu); + if (IS_ERR(evt)) { pr_info("Perf NMI watchdog permanently disabled\n"); + ret = PTR_ERR(evt); } else { - perf_event_release_kernel(this_cpu_read(watchdog_ev)); - this_cpu_write(watchdog_ev, NULL); + perf_event_release_kernel(evt); + ret = 0; } + return ret; }