From 8127c4fdf169465b631b62f7e45a042ced32dc77 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 16 Jul 2025 16:15:56 +0530 Subject: [PATCH 01/32] pseries/lparcfg: Add resource group monitoring Systems can now be partitioned into resource groups. By default all systems will be part of default resource group. Once a resource group is created, and resources allocated to the resource group, those resources will be removed from the default resource group. If a LPAR moved to a resource group, then it can only use resources in the resource group. So maximum processors that can be allocated to a LPAR can be equal or smaller than the resources in the resource group. lparcfg can now exposes the resource group id to which this LPAR belongs to. It also exposes the number of processors in the current resource group. The default resource group id happens to be 0. These would be documented in the upcoming PAPR update. Example of an LPAR in a default resource group root@ltcp11-lp3 $ grep resource_group /proc/powerpc/lparcfg resource_group_number=0 resource_group_active_processors=50 root@ltcp11-lp3 $ Example of an LPAR in a non-default resource group root@ltcp11-lp5 $ grep resource_group /proc/powerpc/lparcfg resource_group_number=1 resource_group_active_processors=30 root@ltcp11-lp5 $ Signed-off-by: Srikar Dronamraju Tested-by: Venkat Rao Bagalkote Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250716104600.59102-1-srikar@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index cc22924f159f..6554537984fb 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -78,6 +78,8 @@ struct hvcall_ppp_data { u8 capped; u8 weight; u8 unallocated_weight; + u8 resource_group_index; + u16 active_procs_in_resource_group; u16 
active_procs_in_pool; u16 active_system_procs; u16 phys_platform_procs; @@ -86,7 +88,7 @@ struct hvcall_ppp_data { }; /* - * H_GET_PPP hcall returns info in 4 parms. + * H_GET_PPP hcall returns info in 5 parms. * entitled_capacity,unallocated_capacity, * aggregation, resource_capability). * @@ -94,11 +96,11 @@ struct hvcall_ppp_data { * R5 = Unallocated Processor Capacity Percentage. * R6 (AABBCCDDEEFFGGHH). * XXXX - reserved (0) - * XXXX - reserved (0) + * XXXX - Active Cores in Resource Group * XXXX - Group Number * XXXX - Pool Number. * R7 (IIJJKKLLMMNNOOPP). - * XX - reserved. (0) + * XX - Resource group Number * XX - bit 0-6 reserved (0). bit 7 is Capped indicator. * XX - variable processor Capacity Weight * XX - Unallocated Variable Processor Capacity Weight. @@ -120,9 +122,11 @@ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) ppp_data->entitlement = retbuf[0]; ppp_data->unallocated_entitlement = retbuf[1]; + ppp_data->active_procs_in_resource_group = (retbuf[2] >> 4 * 8) & 0xffff; ppp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; ppp_data->pool_num = retbuf[2] & 0xffff; + ppp_data->resource_group_index = (retbuf[3] >> 7 * 8) & 0xff; ppp_data->capped = (retbuf[3] >> 6 * 8) & 0x01; ppp_data->weight = (retbuf[3] >> 5 * 8) & 0xff; ppp_data->unallocated_weight = (retbuf[3] >> 4 * 8) & 0xff; @@ -236,6 +240,13 @@ static void parse_ppp_data(struct seq_file *m) seq_printf(m, "unallocated_capacity=%lld\n", ppp_data.unallocated_entitlement); + if (ppp_data.active_procs_in_resource_group) { + seq_printf(m, "resource_group_number=%d\n", + ppp_data.resource_group_index); + seq_printf(m, "resource_group_active_processors=%d\n", + ppp_data.active_procs_in_resource_group); + } + /* The last bits of information returned from h_get_ppp are only * valid if the ibm,partition-performance-parameters-level * property is >= 1. 
From b4a96ab50f368afc2360ff539a20254ca2c9a889 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 7 Nov 2025 13:33:34 +0530 Subject: [PATCH 02/32] powerpc/kdump: Add support for crashkernel CMA reservation Commit 35c18f2933c5 ("Add a new optional ",cma" suffix to the crashkernel= command line option") and commit ab475510e042 ("kdump: implement reserve_crashkernel_cma") added CMA support for kdump crashkernel reservation. Extend crashkernel CMA reservation support to powerpc. The following changes are made to enable CMA reservation on powerpc: - Parse and obtain the CMA reservation size along with other crashkernel parameters - Call reserve_crashkernel_cma() to allocate the CMA region for kdump - Include the CMA-reserved ranges in the usable memory ranges for the kdump kernel to use. - Exclude the CMA-reserved ranges from the crash kernel memory to prevent them from being exported through /proc/vmcore. With the introduction of the CMA crashkernel regions, crash_exclude_mem_range() needs to be called multiple times to exclude both crashk_res and crashk_cma_ranges from the crash memory ranges. To avoid repetitive logic for validating mem_ranges size and handling reallocation when required, this functionality is moved to a new wrapper function crash_exclude_mem_range_guarded(). To ensure proper CMA reservation, reserve_crashkernel_cma() is called after pageblock_order is initialized. Update kernel-parameters.txt to document CMA support for crashkernel on powerpc architecture. 
Signed-off-by: Sourabh Jain Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251107080334.708028-1-sourabhjain@linux.ibm.com --- .../admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/kexec.h | 2 + arch/powerpc/kernel/setup-common.c | 4 +- arch/powerpc/kexec/core.c | 10 ++++- arch/powerpc/kexec/ranges.c | 43 ++++++++++++++----- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6c42061ca20e..1c10190d583d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1013,7 +1013,7 @@ It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. crashkernel=size[KMG],cma - [KNL, X86] Reserve additional crash kernel memory from + [KNL, X86, ppc] Reserve additional crash kernel memory from CMA. This reservation is usable by the first system's userspace memory and kernel movable allocations (memory balloon, zswap). 
Pages allocated from this memory range diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 4bbf9f699aaa..bd4a6c42a5f3 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -115,9 +115,11 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, struct crash_mem #ifdef CONFIG_CRASH_RESERVE int __init overlaps_crashkernel(unsigned long start, unsigned long size); extern void arch_reserve_crashkernel(void); +extern void kdump_cma_reserve(void); #else static inline void arch_reserve_crashkernel(void) {} static inline int overlaps_crashkernel(unsigned long start, unsigned long size) { return 0; } +static inline void kdump_cma_reserve(void) { } #endif #if defined(CONFIG_CRASH_DUMP) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 68d47c53876c..c8c42b419742 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -995,11 +996,12 @@ void __init setup_arch(char **cmdline_p) initmem_init(); /* - * Reserve large chunks of memory for use by CMA for fadump, KVM and + * Reserve large chunks of memory for use by CMA for kdump, fadump, KVM and * hugetlb. These must be called after initmem_init(), so that * pageblock_order is initialised. 
*/ fadump_cma_init(); + kdump_cma_reserve(); kvm_cma_reserve(); gigantic_hugetlb_cma_reserve(); diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index d1a2d755381c..e59bdfcc6463 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -59,6 +59,8 @@ void machine_kexec(struct kimage *image) #ifdef CONFIG_CRASH_RESERVE +static unsigned long long crashk_cma_size; + static unsigned long long __init get_crash_base(unsigned long long crash_base) { @@ -110,7 +112,7 @@ void __init arch_reserve_crashkernel(void) /* use common parsing */ ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size, - &crash_base, NULL, NULL, NULL); + &crash_base, NULL, &crashk_cma_size, NULL); if (ret) return; @@ -130,6 +132,12 @@ void __init arch_reserve_crashkernel(void) reserve_crashkernel_generic(crash_size, crash_base, 0, false); } +void __init kdump_cma_reserve(void) +{ + if (crashk_cma_size) + reserve_crashkernel_cma(crashk_cma_size); +} + int __init overlaps_crashkernel(unsigned long start, unsigned long size) { return (start + size) > crashk_res.start && start <= crashk_res.end; diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 3702b0bdab14..3bd27c38726b 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -515,7 +515,7 @@ out: */ int get_usable_memory_ranges(struct crash_mem **mem_ranges) { - int ret; + int ret, i; /* * Early boot failure observed on guests when low memory (first memory @@ -528,6 +528,13 @@ int get_usable_memory_ranges(struct crash_mem **mem_ranges) if (ret) goto out; + for (i = 0; i < crashk_cma_cnt; i++) { + ret = add_mem_range(mem_ranges, crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end - crashk_cma_ranges[i].start + 1); + if (ret) + goto out; + } + ret = add_rtas_mem_range(mem_ranges); if (ret) goto out; @@ -546,6 +553,22 @@ out: #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_CRASH_DUMP +static int crash_exclude_mem_range_guarded(struct crash_mem **mem_ranges, + 
unsigned long long mstart, + unsigned long long mend) +{ + struct crash_mem *tmem = *mem_ranges; + + /* Reallocate memory ranges if there is no space to split ranges */ + if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { + tmem = realloc_mem_ranges(mem_ranges); + if (!tmem) + return -ENOMEM; + } + + return crash_exclude_mem_range(tmem, mstart, mend); +} + /** * get_crash_memory_ranges - Get crash memory ranges. This list includes * first/crashing kernel's memory regions that @@ -557,7 +580,6 @@ out: int get_crash_memory_ranges(struct crash_mem **mem_ranges) { phys_addr_t base, end; - struct crash_mem *tmem; u64 i; int ret; @@ -582,19 +604,18 @@ int get_crash_memory_ranges(struct crash_mem **mem_ranges) sort_memory_ranges(*mem_ranges, true); } - /* Reallocate memory ranges if there is no space to split ranges */ - tmem = *mem_ranges; - if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { - tmem = realloc_mem_ranges(mem_ranges); - if (!tmem) - goto out; - } - /* Exclude crashkernel region */ - ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); + ret = crash_exclude_mem_range_guarded(mem_ranges, crashk_res.start, crashk_res.end); if (ret) goto out; + for (i = 0; i < crashk_cma_cnt; ++i) { + ret = crash_exclude_mem_range_guarded(mem_ranges, crashk_cma_ranges[i].start, + crashk_cma_ranges[i].end); + if (ret) + goto out; + } + /* * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL * regions are exported to save their context at the time of From 7afe2383eff05f76f4ce2cfda658b7889c89f101 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Wed, 5 Nov 2025 09:09:41 +0530 Subject: [PATCH 03/32] powerpc/kdump: Fix size calculation for hot-removed memory ranges The elfcorehdr segment in the kdump image stores information about the memory regions (called crash memory ranges) that the kdump kernel must capture. 
When a memory hot-remove event occurs, the kernel regenerates the elfcorehdr for the currently loaded kdump image to remove the hot-removed memory from the crash memory ranges. Call chain: remove_mem_range() update_crash_elfcorehdr() arch_crash_handle_hotplug_event() crash_handle_hotplug_event() While removing the hot-removed memory from the crash memory ranges in remove_mem_range(), if the removed memory lies within an existing crash range, that range is split into two. During this split, the size of the second range was being calculated incorrectly. This leads to dump capture failure with makedumpfile with the error below: $ makedumpfile -l -d 31 /proc/vmcore /tmp/vmcore readpage_elf: Attempt to read non-existent page at 0xbbdab0000. readmem: type_addr: 0, addr:c000000bbdab7f00, size:16 validate_mem_section: Can't read mem_section array. readpage_elf: Attempt to read non-existent page at 0xbbdab0000. readmem: type_addr: 0, addr:c000000bbdab7f00, size:8 get_mm_sparsemem: Can't get the address of mem_section. The updated crash memory range in the PT_LOAD entry is holding incorrect data (check out FileSiz and MemSiz): readelf -a /proc/vmcore Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000b013d0000 0xc000000b80000000 0x0000000b80000000 0xffffffffc0000000 0xffffffffc0000000 RWE 0x0 Update the size calculation for the new crash memory range to fix this issue. Note: This problem will not occur if the kdump kernel is loaded or reloaded after a memory hot-remove operation.
Fixes: 849599b702ef ("powerpc/crash: add crash memory hotplug support") Reported-by: Shirisha G Signed-off-by: Sourabh Jain Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251105033941.1752287-1-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/ranges.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 3bd27c38726b..867135560e5c 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -718,8 +718,8 @@ int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size) * two half. */ else { + size = mem_rngs->ranges[i].end - end + 1; mem_rngs->ranges[i].end = base - 1; - size = mem_rngs->ranges[i].end - end; ret = add_mem_range(mem_ranges, end + 1, size); } } From 98fa236044ca4f8841107382fb03832101fa7328 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 27 Aug 2025 16:18:53 +0200 Subject: [PATCH 04/32] powerpc/8xx: Remove specific code from fast_exception_return The label 2: in fast_exception_return is a leftover from commit b96bae3ae2cb ("powerpc/32: Replace ASM exception exit by C exception exit from ppc64"). Once removed, we see that fast_exception_return is a standalone function that is called only from pieces of assembly dedicated to book3s/32 or booke, never by common code or 8xx code. So remove the clear of MSR[RI] enclosed in #ifdef CONFIG_PPC_8xx. 
Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/39de3e0f0122b571474b1ba352a2dc3ad8cb71dd.1756304318.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/entry_32.S | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index f4a8c9877249..762df811433c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -216,7 +216,7 @@ fast_exception_return: beq 3f /* if not, we've got problems */ #endif -2: lwz r10,_CCR(r11) + lwz r10,_CCR(r11) REST_GPRS(1, 6, r11) mtcr r10 lwz r10,_LINK(r11) @@ -225,9 +225,6 @@ fast_exception_return: li r10, 0 stw r10, 8(r11) REST_GPR(10, r11) -#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS) - mtspr SPRN_NRI, r0 -#endif mtspr SPRN_SRR1,r9 mtspr SPRN_SRR0,r12 REST_GPR(9, r11) From 2997876c4a1a5864baa13d7393c2b68cf5b51183 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 11 Sep 2025 14:30:12 +0200 Subject: [PATCH 05/32] powerpc/32: Restore clearing of MSR[RI] at interrupt/syscall exit Commit 13799748b957 ("powerpc/64: use interrupt restart table to speed up return from interrupt") removed the unconditional clearing of MSR[RI] when returning from interrupt into kernel. But powerpc/32 doesn't implement interrupt restart table hence still needs MSR[RI] to be cleared. It could be added back in interrupt_exit_kernel_prepare() but it is easier and better to add it back in entry_32.S for the following reasons: - Writing to MSR must be followed by a synchronising instruction - The smaller the non-recoverable section is the better it is So add a macro called clr_ri and use it in the three places that play up with SRR0/SRR1. Use it just before another mtspr for synchronisation to avoid having to add an isync. Now that's done in entry_32.S, exit_must_hard_disable() can return false for non book3s/64, taking into account that BOOKE doesn't have MSR_RI.
Also add back blacklisting syscall_exit_finish for kprobe. This was initially added by commit 7cdf44013885 ("powerpc/entry32: Blacklist syscall exit points for kprobe.") then lost with commit 6f76a01173cc ("powerpc/syscall: implement system call entry/exit logic in C for PPC32"). Fixes: 6f76a01173cc ("powerpc/syscall: implement system call entry/exit logic in C for PPC32") Fixes: 13799748b957 ("powerpc/64: use interrupt restart table to speed up return from interrupt") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/66d0ab070563ad460ed481328ab0887c27f21a2c.1757593807.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/entry_32.S | 18 +++++++++++++++++- arch/powerpc/kernel/interrupt.c | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 762df811433c..61ffd2989e7b 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -101,6 +101,17 @@ SYM_FUNC_END(__kuep_unlock) .endm #endif +.macro clr_ri trash +#ifndef CONFIG_BOOKE +#ifdef CONFIG_PPC_8xx + mtspr SPRN_NRI, \trash +#else + li \trash, MSR_KERNEL & ~MSR_RI + mtmsr \trash +#endif +#endif +.endm + .globl transfer_to_syscall transfer_to_syscall: stw r3, ORIG_GPR3(r1) @@ -149,6 +160,7 @@ ret_from_syscall: cmpwi r3,0 REST_GPR(3, r1) syscall_exit_finish: + clr_ri r4 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 @@ -168,6 +180,7 @@ syscall_exit_finish: REST_GPR(0, r1) REST_GPRS(3, 12, r1) b 1b +_ASM_NOKPROBE_SYMBOL(syscall_exit_finish) #ifdef CONFIG_44x .L44x_icache_flush: @@ -224,10 +237,11 @@ fast_exception_return: /* Clear the exception marker on the stack to avoid confusing stacktrace */ li r10, 0 stw r10, 8(r11) - REST_GPR(10, r11) + clr_ri r10 mtspr SPRN_SRR1,r9 mtspr SPRN_SRR0,r12 REST_GPR(9, r11) + REST_GPR(10, r11) REST_GPR(12, r11) REST_GPR(11, r11) rfi @@ -256,6 +270,7 @@ interrupt_return: .Lfast_user_interrupt_return: lwz r11,_NIP(r1) lwz r12,_MSR(r1) + 
clr_ri r4 mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r12 @@ -299,6 +314,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) cmpwi cr1,r3,0 lwz r11,_NIP(r1) lwz r12,_MSR(r1) + clr_ri r4 mtspr SPRN_SRR0,r11 mtspr SPRN_SRR1,r12 diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index e0c681d0b076..aea6f7e8e9c6 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -38,7 +38,7 @@ static inline bool exit_must_hard_disable(void) #else static inline bool exit_must_hard_disable(void) { - return true; + return false; } #endif From 10e1c77c3636d815db802ceef588522c2d2d947c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 12 Sep 2025 10:37:34 +0200 Subject: [PATCH 06/32] powerpc/32: Fix unpaired stwcx. on interrupt exit Commit b96bae3ae2cb ("powerpc/32: Replace ASM exception exit by C exception exit from ppc64") erroneously copied to powerpc/32 the logic from powerpc/64 based on feature CPU_FTR_STCX_CHECKS_ADDRESS which is always 0 on powerpc/32. Re-instate the logic implemented by commit b64f87c16f3c ("[POWERPC] Avoid unpaired stwcx. on some processors") which is based on the CPU_FTR_NEED_PAIRED_STWCX feature. Fixes: b96bae3ae2cb ("powerpc/32: Replace ASM exception exit by C exception exit from ppc64") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/6040b5dbcf5cdaa1cd919fcf0790f12974ea6e5a.1757666244.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/entry_32.S | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 61ffd2989e7b..16f8ee6cb2cd 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -275,10 +275,9 @@ interrupt_return: mtspr SPRN_SRR1,r12 BEGIN_FTR_SECTION + lwarx r0,0,r1 +END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx.
r0,0,r1 /* to clear the reservation */ -FTR_SECTION_ELSE - lwarx r0,0,r1 -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) lwz r3,_CCR(r1) lwz r4,_LINK(r1) @@ -319,10 +318,9 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) mtspr SPRN_SRR1,r12 BEGIN_FTR_SECTION + lwarx r0,0,r1 +END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ -FTR_SECTION_ELSE - lwarx r0,0,r1 -ALT_FTR_SECTION_END_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) lwz r3,_LINK(r1) lwz r4,_CTR(r1) From 1e4b207ffe54cf33a4b7a2912c4110f89c73bf3f Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 19 Aug 2025 17:10:35 +0800 Subject: [PATCH 07/32] macintosh/mac_hid: fix race condition in mac_hid_toggle_emumouse The following warning appears when running syzkaller, and this issue also exists in the mainline code. ------------[ cut here ]------------ list_add double add: new=ffffffffa57eee28, prev=ffffffffa57eee28, next=ffffffffa5e63100. WARNING: CPU: 0 PID: 1491 at lib/list_debug.c:35 __list_add_valid_or_report+0xf7/0x130 Modules linked in: CPU: 0 PID: 1491 Comm: syz.1.28 Not tainted 6.6.0+ #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__list_add_valid_or_report+0xf7/0x130 RSP: 0018:ff1100010dfb7b78 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffffa57eee18 RCX: ffffffff97fc9817 RDX: 0000000000040000 RSI: ffa0000002383000 RDI: 0000000000000001 RBP: ffffffffa57eee28 R08: 0000000000000001 R09: ffe21c0021bf6f2c R10: 0000000000000001 R11: 6464615f7473696c R12: ffffffffa5e63100 R13: ffffffffa57eee28 R14: ffffffffa57eee28 R15: ff1100010dfb7d48 FS: 00007fb14398b640(0000) GS:ff11000119600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000010d096005 CR4: 0000000000773ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 80000000 Call Trace: 
input_register_handler+0xb3/0x210 mac_hid_start_emulation+0x1c5/0x290 mac_hid_toggle_emumouse+0x20a/0x240 proc_sys_call_handler+0x4c2/0x6e0 new_sync_write+0x1b1/0x2d0 vfs_write+0x709/0x950 ksys_write+0x12a/0x250 do_syscall_64+0x5a/0x110 entry_SYSCALL_64_after_hwframe+0x78/0xe2 The WARNING occurs when two processes concurrently write to the mac-hid emulation sysctl, causing a race condition in mac_hid_toggle_emumouse(). Both processes read old_val=0, then both try to register the input handler, leading to a double list_add of the same handler. CPU0 CPU1 ------------------------- ------------------------- vfs_write() //write 1 vfs_write() //write 1 proc_sys_write() proc_sys_write() mac_hid_toggle_emumouse() mac_hid_toggle_emumouse() old_val = *valp // old_val=0 old_val = *valp // old_val=0 mutex_lock_killable() proc_dointvec() // *valp=1 mac_hid_start_emulation() input_register_handler() mutex_unlock() mutex_lock_killable() proc_dointvec() mac_hid_start_emulation() input_register_handler() //Trigger Warning mutex_unlock() Fix this by moving the old_val read inside the mutex lock region. 
Fixes: 99b089c3c38a ("Input: Mac button emulation - implement as an input filter") Signed-off-by: Long Li Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250819091035.2263329-1-leo.lilong@huaweicloud.com --- drivers/macintosh/mac_hid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 369d72f59b3c..06fd910b3fd1 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -187,13 +187,14 @@ static int mac_hid_toggle_emumouse(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; - int old_val = *valp; + int old_val; int rc; rc = mutex_lock_killable(&mac_hid_emumouse_mutex); if (rc) return rc; + old_val = *valp; rc = proc_dointvec(table, write, buffer, lenp, ppos); if (rc == 0 && write && *valp != old_val) { From d2be62d5858312f3e6c36dbfc43faa1f287d5249 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 18 Oct 2025 18:52:40 +0100 Subject: [PATCH 08/32] powerpc/vmlinux.lds: Drop .interp description Commit da30705c4621 ("arch/powerpc: Remove .interp section in vmlinux") intended to drop the .interp section from vmlinux but even with this change, relocatable kernels linked with ld.lld contain an empty .interp section, which ends up causing crashes in GDB [1]. $ make -skj"$(nproc)" ARCH=powerpc LLVM=1 clean pseries_le_defconfig vmlinux $ llvm-readelf -S vmlinux | grep interp [44] .interp PROGBITS c0000000021ddb34 21edb34 000000 00 A 0 0 1 There appears to be a subtle difference between GNU ld and ld.lld when it comes to discarding sections that specify load addresses [2]. Since '--no-dynamic-linker' prevents emission of the .interp section, there is no need to describe it in the output sections of the vmlinux linker script. Drop the .interp section description from vmlinux.lds.S to avoid this issue altogether. 
Link: https://sourceware.org/bugzilla/show_bug.cgi?id=33481 [1] Link: https://github.com/ClangBuiltLinux/linux/issues/2137 [2] Reported-by: Vishal Chourasia Closes: https://lore.kernel.org/20251013040148.560439-1-vishalc@linux.ibm.com/ Signed-off-by: Nathan Chancellor Tested-by: Vishal Chourasia Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251018-ppc-fix-lld-interp-v1-1-a083de6dccc9@kernel.org --- arch/powerpc/kernel/vmlinux.lds.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index de6ee7d35cff..15850296c0a9 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -330,7 +330,6 @@ SECTIONS } .hash : AT(ADDR(.hash) - LOAD_OFFSET) { *(.hash) } .gnu.hash : AT(ADDR(.gnu.hash) - LOAD_OFFSET) { *(.gnu.hash) } - .interp : AT(ADDR(.interp) - LOAD_OFFSET) { *(.interp) } .rela.dyn : AT(ADDR(.rela.dyn) - LOAD_OFFSET) { __rela_dyn_start = .; From cbc8bd1b7d905b3f746542cc726837a954a46bd8 Mon Sep 17 00:00:00 2001 From: Antonio Alvarez Feijoo Date: Tue, 16 Sep 2025 08:18:40 +0200 Subject: [PATCH 09/32] powerpc/boot: Add missing compression methods to usage lzma and lzo are also supported. Signed-off-by: Antonio Alvarez Feijoo Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250916061840.5492-1-antonio.feijoo@suse.com --- arch/powerpc/boot/wrapper | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index a75baefd1cff..1efd1206fcab 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -21,7 +21,7 @@ # (default ./arch/powerpc/boot) # -W dir specify working directory for temporary files (default .) 
# -z use gzip (legacy) -# -Z zsuffix compression to use (gz, xz or none) +# -Z zsuffix compression to use (gz, xz, lzma, lzo or none) # Stop execution if any command fails set -e @@ -69,7 +69,7 @@ usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 echo ' [-d devtree] [-s tree.dts] [-e esm_blob]' >&2 echo ' [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2 - echo ' [-Z (gz|xz|none)] [--no-compression] [vmlinux]' >&2 + echo ' [-Z (gz|xz|lzma|lzo|none)] [--no-compression] [vmlinux]' >&2 exit 1 } From 825ce89a3ef17f84cf2c0eacfa6b8dc9fd11d13f Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Mon, 21 Apr 2025 22:31:13 -0400 Subject: [PATCH 10/32] powerpc/addnote: Fix overflow on 32-bit builds The PUT_64[LB]E() macros need to cast the value to unsigned long long like the GET_64[LB]E() macros. Caused lots of warnings when compiled on 32-bit, and clobbered addresses (36-bit P4080). Signed-off-by: Ben Collins Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/2025042122-mustard-wrasse-694572@boujee-and-buff --- arch/powerpc/boot/addnote.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/addnote.c b/arch/powerpc/boot/addnote.c index 53b3b2621457..78704927453a 100644 --- a/arch/powerpc/boot/addnote.c +++ b/arch/powerpc/boot/addnote.c @@ -68,8 +68,8 @@ static int e_class = ELFCLASS32; #define PUT_16BE(off, v)(buf[off] = ((v) >> 8) & 0xff, \ buf[(off) + 1] = (v) & 0xff) #define PUT_32BE(off, v)(PUT_16BE((off), (v) >> 16L), PUT_16BE((off) + 2, (v))) -#define PUT_64BE(off, v)((PUT_32BE((off), (v) >> 32L), \ - PUT_32BE((off) + 4, (v)))) +#define PUT_64BE(off, v)((PUT_32BE((off), (unsigned long long)(v) >> 32L), \ + PUT_32BE((off) + 4, (unsigned long long)(v)))) #define GET_16LE(off) ((buf[off]) + (buf[(off)+1] << 8)) #define GET_32LE(off) (GET_16LE(off) + (GET_16LE((off)+2U) << 16U)) @@ -78,7 +78,8 @@ static int e_class = ELFCLASS32; #define PUT_16LE(off, v) (buf[off] 
= (v) & 0xff, \ buf[(off) + 1] = ((v) >> 8) & 0xff) #define PUT_32LE(off, v) (PUT_16LE((off), (v)), PUT_16LE((off) + 2, (v) >> 16L)) -#define PUT_64LE(off, v) (PUT_32LE((off), (v)), PUT_32LE((off) + 4, (v) >> 32L)) +#define PUT_64LE(off, v) (PUT_32LE((off), (unsigned long long)(v)), \ + PUT_32LE((off) + 4, (unsigned long long)(v) >> 32L)) #define GET_16(off) (e_data == ELFDATA2MSB ? GET_16BE(off) : GET_16LE(off)) #define GET_32(off) (e_data == ELFDATA2MSB ? GET_32BE(off) : GET_32LE(off)) From 0070b2cbfe7389159669c3a5bb23d2ef89043055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Fri, 18 Apr 2025 21:28:51 +0200 Subject: [PATCH 11/32] powerpc: 512x: Rename wdt@ node to watchdog@ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The watchdog.yaml schema prescribes a node name of "timer" or "watchdog" rather than the abbreviation "wdt". Signed-off-by: J. Neuschäfer Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250418-watchdog-v1-1-987ff2046272@posteo.net --- arch/powerpc/boot/dts/mpc5121.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/dts/mpc5121.dtsi b/arch/powerpc/boot/dts/mpc5121.dtsi index d3fc8062fbcd..a278fb7b9e71 100644 --- a/arch/powerpc/boot/dts/mpc5121.dtsi +++ b/arch/powerpc/boot/dts/mpc5121.dtsi @@ -112,7 +112,7 @@ }; /* Watchdog timer */ - wdt@900 { + watchdog@900 { compatible = "fsl,mpc5121-wdt"; reg = <0x900 0x100>; }; From cc156be1e7566add1f490854133e27f091ae58b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Fri, 18 Apr 2025 21:28:52 +0200 Subject: [PATCH 12/32] powerpc: 83xx: Rename wdt@ nodes to watchdog@ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The watchdog.yaml schema prescribes a node name of "timer" or "watchdog" rather than the abbreviation "wdt". Signed-off-by: J. 
Neuschäfer Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250418-watchdog-v1-2-987ff2046272@posteo.net --- arch/powerpc/boot/dts/asp834x-redboot.dts | 2 +- arch/powerpc/boot/dts/mpc8313erdb.dts | 2 +- arch/powerpc/boot/dts/mpc8315erdb.dts | 2 +- arch/powerpc/boot/dts/mpc832x_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8349emitx.dts | 2 +- arch/powerpc/boot/dts/mpc8349emitxgp.dts | 2 +- arch/powerpc/boot/dts/mpc836x_rdk.dts | 2 +- arch/powerpc/boot/dts/mpc8377_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8377_wlan.dts | 2 +- arch/powerpc/boot/dts/mpc8378_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8379_rdb.dts | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/boot/dts/asp834x-redboot.dts b/arch/powerpc/boot/dts/asp834x-redboot.dts index 52a84561c4f0..33ddb17d1876 100644 --- a/arch/powerpc/boot/dts/asp834x-redboot.dts +++ b/arch/powerpc/boot/dts/asp834x-redboot.dts @@ -72,7 +72,7 @@ reg = <0xff000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8313erdb.dts b/arch/powerpc/boot/dts/mpc8313erdb.dts index a8315795b2c9..09508b4c8c73 100644 --- a/arch/powerpc/boot/dts/mpc8313erdb.dts +++ b/arch/powerpc/boot/dts/mpc8313erdb.dts @@ -99,7 +99,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8315erdb.dts b/arch/powerpc/boot/dts/mpc8315erdb.dts index a89cb3139ca8..a8f68d6e50b0 100644 --- a/arch/powerpc/boot/dts/mpc8315erdb.dts +++ b/arch/powerpc/boot/dts/mpc8315erdb.dts @@ -100,7 +100,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc832x_rdb.dts 
b/arch/powerpc/boot/dts/mpc832x_rdb.dts index ecebc27a2898..ba7caaf98fd5 100644 --- a/arch/powerpc/boot/dts/mpc832x_rdb.dts +++ b/arch/powerpc/boot/dts/mpc832x_rdb.dts @@ -52,7 +52,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8349emitx.dts b/arch/powerpc/boot/dts/mpc8349emitx.dts index d4ebbb93de0b..13f17232ba83 100644 --- a/arch/powerpc/boot/dts/mpc8349emitx.dts +++ b/arch/powerpc/boot/dts/mpc8349emitx.dts @@ -53,7 +53,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; // from bootloader - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8349emitxgp.dts b/arch/powerpc/boot/dts/mpc8349emitxgp.dts index bcf68a0a7b55..eae0afd5abbc 100644 --- a/arch/powerpc/boot/dts/mpc8349emitxgp.dts +++ b/arch/powerpc/boot/dts/mpc8349emitxgp.dts @@ -51,7 +51,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; // from bootloader - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc836x_rdk.dts b/arch/powerpc/boot/dts/mpc836x_rdk.dts index a0cc1953484d..4ff38e1a2185 100644 --- a/arch/powerpc/boot/dts/mpc836x_rdk.dts +++ b/arch/powerpc/boot/dts/mpc836x_rdk.dts @@ -62,7 +62,7 @@ /* filled by u-boot */ bus-frequency = <0>; - wdt@200 { + watchdog@200 { compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; }; diff --git a/arch/powerpc/boot/dts/mpc8377_rdb.dts b/arch/powerpc/boot/dts/mpc8377_rdb.dts index 7df452efa957..f137ccb8cfde 100644 --- a/arch/powerpc/boot/dts/mpc8377_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8377_rdb.dts @@ -99,7 +99,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8377_wlan.dts 
b/arch/powerpc/boot/dts/mpc8377_wlan.dts index d8e7d40aeae4..ce254dd74dd0 100644 --- a/arch/powerpc/boot/dts/mpc8377_wlan.dts +++ b/arch/powerpc/boot/dts/mpc8377_wlan.dts @@ -89,7 +89,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8378_rdb.dts b/arch/powerpc/boot/dts/mpc8378_rdb.dts index bdcfe83a561e..19e5473d4161 100644 --- a/arch/powerpc/boot/dts/mpc8378_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8378_rdb.dts @@ -99,7 +99,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; diff --git a/arch/powerpc/boot/dts/mpc8379_rdb.dts b/arch/powerpc/boot/dts/mpc8379_rdb.dts index a5f702304a35..61519acca228 100644 --- a/arch/powerpc/boot/dts/mpc8379_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8379_rdb.dts @@ -97,7 +97,7 @@ reg = <0xe0000000 0x00000200>; bus-frequency = <0>; - wdt@200 { + watchdog@200 { device_type = "watchdog"; compatible = "mpc83xx_wdt"; reg = <0x200 0x100>; From 39fe29e7f2fd38b0fee9bf987d180dee976dd2c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Fri, 18 Apr 2025 21:28:53 +0200 Subject: [PATCH 13/32] powerpc: 86xx: Rename wdt@ nodes to watchdog@ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The watchdog.yaml schema prescribes a node name of "timer" or "watchdog" rather than the abbreviation "wdt". Signed-off-by: J. 
Neuschäfer Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250418-watchdog-v1-3-987ff2046272@posteo.net --- arch/powerpc/boot/dts/fsl/gef_ppc9a.dts | 4 ++-- arch/powerpc/boot/dts/fsl/gef_sbc310.dts | 4 ++-- arch/powerpc/boot/dts/fsl/gef_sbc610.dts | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts index fc92bb032c51..48a81430a8a3 100644 --- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts +++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts @@ -82,7 +82,7 @@ reg = <0x4 0x0 0x40>; }; - wdt@4,2000 { + watchdog@4,2000 { compatible = "gef,ppc9a-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x2000 0x8>; @@ -90,7 +90,7 @@ interrupt-parent = <&gef_pic>; }; /* Second watchdog available, driver currently supports one. - wdt@4,2010 { + watchdog@4,2010 { compatible = "gef,ppc9a-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x2010 0x8>; diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts index 47ae85c34635..8eb254b1738d 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts @@ -79,7 +79,7 @@ reg = <0x4 0x0 0x40>; }; - wdt@4,2000 { + watchdog@4,2000 { compatible = "gef,sbc310-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x2000 0x8>; @@ -87,7 +87,7 @@ interrupt-parent = <&gef_pic>; }; /* - wdt@4,2010 { + watchdog@4,2010 { compatible = "gef,sbc310-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x2010 0x8>; diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts index 5322be44b62e..02edbb262b8f 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts @@ -82,14 +82,14 @@ reg = <0x4 0x0 0x40>; }; - wdt@4,2000 { + watchdog@4,2000 { compatible = "gef,fpga-wdt"; reg = <0x4 0x2000 0x8>; interrupts = <0x1a 0x4>; interrupt-parent 
= <&gef_pic>; }; /* Second watchdog available, driver currently supports one. - wdt@4,2010 { + watchdog@4,2010 { compatible = "gef,fpga-wdt"; reg = <0x4 0x2010 0x8>; interrupts = <0x1b 0x4>; From af6850ac9ef3c98e6e8f2929e24ed6fd154fa39e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Fri, 18 Apr 2025 21:28:54 +0200 Subject: [PATCH 14/32] powerpc: p2020: Rename wdt@ nodes to watchdog@ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The watchdog.yaml schema prescribes a node name of "timer" or "watchdog" rather than the abbreviation "wdt". Signed-off-by: J. Neuschäfer Reviewed-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250418-watchdog-v1-4-987ff2046272@posteo.net --- arch/powerpc/boot/dts/fsl/ge_imp3a.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/ge_imp3a.dts b/arch/powerpc/boot/dts/fsl/ge_imp3a.dts index da3de8e2b7d2..9e5c01cfac2f 100644 --- a/arch/powerpc/boot/dts/fsl/ge_imp3a.dts +++ b/arch/powerpc/boot/dts/fsl/ge_imp3a.dts @@ -94,7 +94,7 @@ gpio-controller; }; - wdt@4,800 { + watchdog@4,800 { compatible = "ge,imp3a-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x800 0x8>; @@ -103,7 +103,7 @@ }; /* Second watchdog available, driver currently supports one. - wdt@4,808 { + watchdog@4,808 { compatible = "gef,imp3a-fpga-wdt", "gef,fpga-wdt-1.00", "gef,fpga-wdt"; reg = <0x4 0x808 0x8>; From 38c64dfe0af12778953846df5f259e913275cfe5 Mon Sep 17 00:00:00 2001 From: Ben Collins Date: Mon, 21 Apr 2025 22:36:46 -0400 Subject: [PATCH 15/32] kexec: Include kernel-end even without crashkernel Certain versions of kexec don't even work without kernel-end being added to the device-tree. Add it even if crash-kernel is disabled. 
Signed-off-by: Ben Collins Reviewed-by: Sourabh Jain Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/2025042122-inescapable-mandrill-8a5ff2@boujee-and-buff --- arch/powerpc/kexec/core.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c index e59bdfcc6463..104c05520bf0 100644 --- a/arch/powerpc/kexec/core.c +++ b/arch/powerpc/kexec/core.c @@ -22,6 +22,8 @@ #include #include +#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) + #ifdef CONFIG_CRASH_DUMP void machine_crash_shutdown(struct pt_regs *regs) { @@ -144,17 +146,10 @@ int __init overlaps_crashkernel(unsigned long start, unsigned long size) } /* Values we need to export to the second kernel via the device tree. */ -static phys_addr_t kernel_end; static phys_addr_t crashk_base; static phys_addr_t crashk_size; static unsigned long long mem_limit; -static struct property kernel_end_prop = { - .name = "linux,kernel-end", - .length = sizeof(phys_addr_t), - .value = &kernel_end, -}; - static struct property crashk_base_prop = { .name = "linux,crashkernel-base", .length = sizeof(phys_addr_t), @@ -173,8 +168,6 @@ static struct property memory_limit_prop = { .value = &mem_limit, }; -#define cpu_to_be_ulong __PASTE(cpu_to_be, BITS_PER_LONG) - static void __init export_crashk_values(struct device_node *node) { /* There might be existing crash kernel properties, but we can't @@ -198,6 +191,15 @@ static void __init export_crashk_values(struct device_node *node) mem_limit = cpu_to_be_ulong(memory_limit); of_update_property(node, &memory_limit_prop); } +#endif /* CONFIG_CRASH_RESERVE */ + +static phys_addr_t kernel_end; + +static struct property kernel_end_prop = { + .name = "linux,kernel-end", + .length = sizeof(phys_addr_t), + .value = &kernel_end, +}; static int __init kexec_setup(void) { @@ -208,16 +210,17 @@ static int __init kexec_setup(void) return -ENOENT; /* remove any stale properties so ours 
can be found */ - of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); + of_remove_property(node, of_find_property(node, kernel_end_prop.name, + NULL)); /* information needed by userspace when using default_machine_kexec */ kernel_end = cpu_to_be_ulong(__pa(_end)); of_add_property(node, &kernel_end_prop); +#ifdef CONFIG_CRASH_RESERVE export_crashk_values(node); - +#endif of_node_put(node); return 0; } late_initcall(kexec_setup); -#endif /* CONFIG_CRASH_RESERVE */ From f90d28443b1f4dbbbdcfea1be5295f6903acc94c Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 23 Jul 2025 03:29:36 +0530 Subject: [PATCH 16/32] arch:powerpc:tools This file was missing shebang line, so added it This file was missing the shebang line, so added it. Signed-off-by: Bhaskar Chowdhury Reviewed-by: Stephen Rothwell Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250722220043.14862-1-unixbhaskar@gmail.com --- arch/powerpc/tools/head_check.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index 689907cda996..a9cd06958921 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -1,3 +1,4 @@ +#!/bin/sh # Copyright © 2016 IBM Corporation # This program is free software; you can redistribute it and/or From 2617bd81ae54128e63e764c48935e572e3dee501 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 15 Jan 2024 17:43:30 +0800 Subject: [PATCH 17/32] powerpc/83xx: Add a null pointer check to mcu_gpiochip_add kasprintf() returns a pointer to dynamically allocated memory which can be NULL upon failure. Ensure the allocation was successful by checking the pointer validity. 
Signed-off-by: Kunwu Chan Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20240115094330.33014-1-chentao@kylinos.cn --- arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index cb7b9498f291..80d944f29288 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -123,6 +123,8 @@ static int mcu_gpiochip_add(struct mcu *mcu) gc->owner = THIS_MODULE; gc->label = kasprintf(GFP_KERNEL, "%pfw", dev_fwnode(dev)); + if (!gc->label) + return -ENOMEM; gc->can_sleep = 1; gc->ngpio = MCU_NUM_GPIO; gc->base = -1; From fb2ff9fa72e20a75cab0ffc9dc8735de68ed4d0d Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Wed, 12 Nov 2025 13:18:59 +0530 Subject: [PATCH 18/32] powerpc/smp: Expose die_id and die_cpumask >From Power10 processors onwards, each chip has 2 hemispheres. For LPARs running on PowerVM Hypervisor, hypervisor determines the allocation of CPU groups to each LPAR, resulting in two LPARs with the same number of CPUs potentially having different numbers of CPUs from each hemisphere. Additionally, it is not feasible to ascertain the hemisphere based solely on the CPU number. Users wishing to assign their workload to all CPUs, or a subset of CPUs within a specific hemisphere, encounter difficulties in identifying the cpumask. To address this, it is proposed to expose hemisphere information as a die in sysfs. This aligns with other architectures and facilitates the identification of CPUs within the same hemisphere. Tools such as lstopo can also access this information. Please note: The hypervisor reveals the locality of the CPUs to hemispheres only in dedicated mode. Consequently, in systems where hemisphere information is unavailable, such as shared LPARs, the die_cpus information in sysfs will mirror package_cpus, with die_id set to -1. 
Without this change. $ grep . /sys/devices/system/cpu/cpu16/topology/{die*,package*} 2>/dev/null /sys/devices/system/cpu/cpu16/topology/package_cpus:000000,000000ff,ffff0000 /sys/devices/system/cpu/cpu16/topology/package_cpus_list:16-39 With this change. $ grep . /sys/devices/system/cpu/cpu16/topology/{die*,package*} 2>/dev/null /sys/devices/system/cpu/cpu16/topology/die_cpus:000000,00000000,00ff0000 /sys/devices/system/cpu/cpu16/topology/die_cpus_list:16-23 /sys/devices/system/cpu/cpu16/topology/die_id:2 /sys/devices/system/cpu/cpu16/topology/package_cpus:000000,000000ff,ffff0000 /sys/devices/system/cpu/cpu16/topology/package_cpus_list:16-39 snipped lstopo-no-graphics o/p Group0 L#0 (total=8747584KB) Package L#0 (total=3564096KB CPUModel="POWER10 (architected), altivec supported" CPURevision="2.0 (pvr 0080 0200)") NUMANode L#0 (P#0 local=3564096KB total=3564096KB) Die L#0 (P#0) Core L#0 (P#0) Package L#1 (total=5183488KB CPUModel="POWER10 (architected), altivec supported" CPURevision="2.0 (pvr 0080 0200)") NUMANode L#1 (P#1 local=5183488KB total=5183488KB) Die L#2 (P#2) Core L#2 (P#16) L3Cache L#4 (size=4096KB linesize=128 ways=16) L2Cache L#4 (size=1024KB linesize=128 ways=8) L1dCache L#4 (size=32KB linesize=128 ways=8) L1iCache L#4 (size=48KB linesize=128 ways=6) PU L#16 (P#16) PU L#17 (P#18) PU L#18 (P#20) PU L#19 (P#22) L3Cache L#5 (size=4096KB linesize=128 ways=16) L2Cache L#5 (size=1024KB linesize=128 ways=8) L1dCache L#5 (size=32KB linesize=128 ways=8) L1iCache L#5 (size=48KB linesize=128 ways=6) PU L#20 (P#17) PU L#21 (P#19) PU L#22 (P#21) PU L#23 (P#23) Die L#3 (P#3) Core L#3 (P#24) L3Cache L#6 (size=4096KB linesize=128 ways=16) L2Cache L#6 (size=1024KB linesize=128 ways=8) L1dCache L#6 (size=32KB linesize=128 ways=8) L1iCache L#6 (size=48KB linesize=128 ways=6) PU L#24 (P#24) PU L#25 (P#26) PU L#26 (P#28) PU L#27 (P#30) L3Cache L#7 (size=4096KB linesize=128 ways=16) L2Cache L#7 (size=1024KB linesize=128 ways=8) L1dCache L#7 (size=32KB linesize=128 
ways=8) L1iCache L#7 (size=48KB linesize=128 ways=6) PU L#28 (P#25) PU L#29 (P#27) PU L#30 (P#29) PU L#31 (P#31) Core L#4 (P#32) L3Cache L#8 (size=4096KB linesize=128 ways=16) L2Cache L#8 (size=1024KB linesize=128 ways=8) L1dCache L#8 (size=32KB linesize=128 ways=8) L1iCache L#8 (size=48KB linesize=128 ways=6) PU L#32 (P#32) PU L#33 (P#34) PU L#34 (P#36) PU L#35 (P#38) L3Cache L#9 (size=4096KB linesize=128 ways=16) L2Cache L#9 (size=1024KB linesize=128 ways=8) L1dCache L#9 (size=32KB linesize=128 ways=8) L1iCache L#9 (size=48KB linesize=128 ways=6) PU L#36 (P#33) PU L#37 (P#35) PU L#38 (P#37) PU L#39 (P#39) Group0 L#1 (total=7736896KB) Package L#2 (total=5170880KB CPUModel="POWER10 (architected), altivec supported" CPURevision="2.0 (pvr 0080 0200)") NUMANode L#2 (P#2 local=5170880KB total=5170880KB) Die L#4 (P#4) Reviewed-by: Shrikanth Hegde Signed-off-by: Srikar Dronamraju Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251112074859.814087-1-srikar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 11 +++++++---- arch/powerpc/kernel/smp.c | 23 +++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index f19ca44512d1..66ed5fe1b718 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -132,15 +132,18 @@ static inline int cpu_to_coregroup_id(int cpu) #include struct cpumask *cpu_coregroup_mask(int cpu); +const struct cpumask *cpu_die_mask(int cpu); +int cpu_die_id(int cpu); #ifdef CONFIG_PPC64 #include #define topology_physical_package_id(cpu) (cpu_to_chip_id(cpu)) - -#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) -#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) -#define topology_core_id(cpu) (cpu_to_core_id(cpu)) +#define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) +#define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) +#define 
topology_core_id(cpu) (cpu_to_core_id(cpu)) +#define topology_die_id(cpu) (cpu_die_id(cpu)) +#define topology_die_cpumask(cpu) (cpu_die_mask(cpu)) #endif #endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 68edb66c2964..292fee8809bc 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1085,6 +1085,29 @@ static int __init init_big_cores(void) return 0; } +/* + * die_mask and die_id are only available on systems which support + * multiple coregroups within a same package. On all other systems, die_mask + * would be same as package mask and die_id would be set to -1. + */ +const struct cpumask *cpu_die_mask(int cpu) +{ + if (has_coregroup_support()) + return per_cpu(cpu_coregroup_map, cpu); + else + return cpu_node_mask(cpu); +} +EXPORT_SYMBOL_GPL(cpu_die_mask); + +int cpu_die_id(int cpu) +{ + if (has_coregroup_support()) + return cpu_to_coregroup_id(cpu); + else + return -1; +} +EXPORT_SYMBOL_GPL(cpu_die_id); + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu, num_threads; From 78fc63ffa7813e33681839bb33826c24195f0eb7 Mon Sep 17 00:00:00 2001 From: Dave Vasilevsky Date: Sun, 16 Nov 2025 01:40:46 -0500 Subject: [PATCH 19/32] powerpc, mm: Fix mprotect on book3s 32-bit On 32-bit book3s with hash-MMUs, tlb_flush() was a no-op. This was unnoticed because all uses until recently were for unmaps, and thus handled by __tlb_remove_tlb_entry(). After commit 4a18419f71cd ("mm/mprotect: use mmu_gather") in kernel 5.19, tlb_gather_mmu() started being used for mprotect as well. This caused mprotect to simply not work on these machines: int *ptr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); *ptr = 1; // force HPTE to be created mprotect(ptr, 4096, PROT_READ); *ptr = 2; // should segfault, but succeeds Fixed by making tlb_flush() actually flush TLB pages. This finally agrees with the behaviour of book3s64's tlb_flush().
Fixes: 4a18419f71cd ("mm/mprotect: use mmu_gather") Cc: stable@vger.kernel.org Reviewed-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Dave Vasilevsky Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20251116-vasi-mprotect-g3-v3-1-59a9bd33ba00@vasilevsky.ca --- arch/powerpc/include/asm/book3s/32/tlbflush.h | 5 ++++- arch/powerpc/mm/book3s32/tlb.c | 9 +++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/32/tlbflush.h b/arch/powerpc/include/asm/book3s/32/tlbflush.h index e43534da5207..4be2200a3c7e 100644 --- a/arch/powerpc/include/asm/book3s/32/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/32/tlbflush.h @@ -11,6 +11,7 @@ void hash__flush_tlb_mm(struct mm_struct *mm); void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); void hash__flush_range(struct mm_struct *mm, unsigned long start, unsigned long end); +void hash__flush_gather(struct mmu_gather *tlb); #ifdef CONFIG_SMP void _tlbie(unsigned long address); @@ -29,7 +30,9 @@ void _tlbia(void); static inline void tlb_flush(struct mmu_gather *tlb) { /* 603 needs to flush the whole TLB here since it doesn't use a hash table. 
*/ - if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) + if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) + hash__flush_gather(tlb); + else _tlbia(); } diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 9ad6b56bfec9..e54a7b011232 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -105,3 +105,12 @@ void hash__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); } EXPORT_SYMBOL(hash__flush_tlb_page); + +void hash__flush_gather(struct mmu_gather *tlb) +{ + if (tlb->fullmm || tlb->need_flush_all) + hash__flush_tlb_mm(tlb->mm); + else + hash__flush_range(tlb->mm, tlb->start, tlb->end); +} +EXPORT_SYMBOL(hash__flush_gather); From 00312419f0863964625d6dcda8183f96849412c6 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 30 Oct 2025 20:27:26 +0530 Subject: [PATCH 20/32] powerpc/64s/slb: Fix SLB multihit issue during SLB preload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On systems using the hash MMU, there is a software SLB preload cache that mirrors the entries loaded into the hardware SLB buffer. This preload cache is subject to periodic eviction — typically after every 256 context switches — to remove old entries. To optimize performance, the kernel skips switch_mmu_context() in switch_mm_irqs_off() when the prev and next mm_struct are the same. However, on hash MMU systems, this can lead to inconsistencies between the hardware SLB and the software preload cache. If an SLB entry for a process is evicted from the software cache on one CPU, and the same process later runs on another CPU without executing switch_mmu_context(), the hardware SLB may retain stale entries. If the kernel then attempts to reload that entry, it can trigger an SLB multi-hit error.
The following timeline shows how stale SLB entries are created and can cause a multi-hit error when a process moves between CPUs without an MMU context switch. CPU 0 CPU 1 ----- ----- Process P exec swapper/1 load_elf_binary begin_new_exec activate_mm switch_mm_irqs_off switch_mmu_context switch_slb /* * This invalidates all * the entries in the HW * and setup the new HW * SLB entries as per the * preload cache. */ context_switch sched_migrate_task migrates process P to cpu-1 Process swapper/0 context switch (to process P) (uses mm_struct of Process P) switch_mm_irqs_off() switch_slb load_slb++ /* * load_slb becomes 0 here * and we evict an entry from * the preload cache with * preload_age(). We still * keep HW SLB and preload * cache in sync, that is * because all HW SLB entries * anyways gets evicted in * switch_slb during SLBIA. * We then only add those * entries back in HW SLB, * which are currently * present in preload_cache * (after eviction). */ load_elf_binary continues... setup_new_exec() slb_setup_new_exec() sched_switch event sched_migrate_task migrates process P to cpu-0 context_switch from swapper/0 to Process P switch_mm_irqs_off() /* * Since both prev and next mm struct are same we don't call * switch_mmu_context(). This will cause the HW SLB and SW preload * cache to go out of sync in preload_new_slb_context. Because there * was an SLB entry which was evicted from both HW and preload cache * on cpu-1. Now later in preload_new_slb_context(), when we will try * to add the same preload entry again, we will add this to the SW * preload cache and then will add it to the HW SLB. Since on cpu-0 * this entry was never invalidated, hence adding this entry to the HW * SLB will cause a SLB multi-hit error. */ load_elf_binary continues... START_THREAD start_thread preload_new_slb_context /* * This tries to add a new EA to preload cache which was earlier * evicted from both cpu-1 HW SLB and preload cache.
This caused the * HW SLB of cpu-0 to go out of sync with the SW preload cache. The * reason for this was, that when we context switched back on CPU-0, * we should have ideally called switch_mmu_context() which will * bring the HW SLB entries on CPU-0 in sync with SW preload cache * entries by setting up the mmu context properly. But we didn't do * that since the prev mm_struct running on cpu-0 was same as the * next mm_struct (which is true for swapper / kernel threads). So * now when we try to add this new entry into the HW SLB of cpu-0, * we hit a SLB multi-hit error. */ WARNING: CPU: 0 PID: 1810970 at arch/powerpc/mm/book3s64/slb.c:62 assert_slb_presence+0x2c/0x50(48 results) 02:47:29 [20157/42149] Modules linked in: CPU: 0 UID: 0 PID: 1810970 Comm: dd Not tainted 6.16.0-rc3-dirty #12 VOLUNTARY Hardware name: IBM pSeries (emulated by qemu) POWER8 (architected) 0x4d0200 0xf000004 of:SLOF,HEAD hv:linux,kvm pSeries NIP: c00000000015426c LR: c0000000001543b4 CTR: 0000000000000000 REGS: c0000000497c77e0 TRAP: 0700 Not tainted (6.16.0-rc3-dirty) MSR: 8000000002823033 CR: 28888482 XER: 00000000 CFAR: c0000000001543b0 IRQMASK: 3 <...> NIP [c00000000015426c] assert_slb_presence+0x2c/0x50 LR [c0000000001543b4] slb_insert_entry+0x124/0x390 Call Trace: 0x7fffceb5ffff (unreliable) preload_new_slb_context+0x100/0x1a0 start_thread+0x26c/0x420 load_elf_binary+0x1b04/0x1c40 bprm_execve+0x358/0x680 do_execveat_common+0x1f8/0x240 sys_execve+0x58/0x70 system_call_exception+0x114/0x300 system_call_common+0x160/0x2c4 >From the above analysis, during early exec the hardware SLB is cleared, and entries from the software preload cache are reloaded into hardware by switch_slb. However, preload_new_slb_context and slb_setup_new_exec also attempt to load some of the same entries, which can trigger a multi-hit. In most cases, these additional preloads simply hit existing entries and add nothing new. Removing these functions avoids redundant preloads and eliminates the multi-hit issue. 
This patch removes these two functions. We tested process switching performance using the context_switch benchmark on POWER9/hash, and observed no regression. Without this patch: 129041 ops/sec With this patch: 129341 ops/sec We also measured SLB faults during boot, and the counts are essentially the same with and without this patch. SLB faults without this patch: 19727 SLB faults with this patch: 19786 Fixes: 5434ae74629a ("powerpc/64s/hash: Add a SLB preload cache") cc: stable@vger.kernel.org Suggested-by: Nicholas Piggin Signed-off-by: Donet Tom Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/0ac694ae683494fe8cadbd911a1a5018d5d3c541.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 1 - arch/powerpc/kernel/process.c | 5 -- arch/powerpc/mm/book3s64/internal.h | 2 - arch/powerpc/mm/book3s64/mmu_context.c | 2 - arch/powerpc/mm/book3s64/slb.c | 88 ------------------- 5 files changed, 98 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 346351423207..af12e2ba8eb8 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -524,7 +524,6 @@ void slb_save_contents(struct slb_entry *slb_ptr); void slb_dump_contents(struct slb_entry *slb_ptr); extern void slb_vmalloc_update(void); -void preload_new_slb_context(unsigned long start, unsigned long sp); #ifdef CONFIG_PPC_64S_HASH_MMU void slb_set_size(u16 size); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index eb23966ac0a9..a45fe147868b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1897,8 +1897,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) return 0; } -void preload_new_slb_context(unsigned long start, unsigned long sp); - /* * Set up a thread for executing a new program */ @@ -1906,9 +1904,6 @@ void 
start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp) { #ifdef CONFIG_PPC64 unsigned long load_addr = regs->gpr[2]; /* saved by ELF_PLAT_INIT */ - - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !radix_enabled()) - preload_new_slb_context(start, sp); #endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index a57a25f06a21..c26a6f0c90fc 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -24,8 +24,6 @@ static inline bool stress_hpt(void) void hpt_do_stress(unsigned long ea, unsigned long hpte_group); -void slb_setup_new_exec(void); - void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); #endif /* ARCH_POWERPC_MM_BOOK3S64_INTERNAL_H */ diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 4e1e45420bd4..fb9dcf9ca599 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -150,8 +150,6 @@ static int hash__init_new_context(struct mm_struct *mm) void hash__setup_new_exec(void) { slice_setup_new_exec(); - - slb_setup_new_exec(); } #else static inline int hash__init_new_context(struct mm_struct *mm) diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 6b783552403c..7e053c561a09 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -328,94 +328,6 @@ static void preload_age(struct thread_info *ti) ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; } -void slb_setup_new_exec(void) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long exec = 0x10000000; - - WARN_ON(irqs_disabled()); - - /* - * preload cache can only be used to determine whether a SLB - * entry exists if it does not start to overflow. 
- */ - if (ti->slb_preload_nr + 2 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* - * We have no good place to clear the slb preload cache on exec, - * flush_thread is about the earliest arch hook but that happens - * after we switch to the mm and have already preloaded the SLBEs. - * - * For the most part that's probably okay to use entries from the - * previous exec, they will age out if unused. It may turn out to - * be an advantage to clear the cache before switching to it, - * however. - */ - - /* - * preload some userspace segments into the SLB. - * Almost all 32 and 64bit PowerPC executables are linked at - * 0x10000000 so it makes sense to preload this segment. - */ - if (!is_kernel_addr(exec)) { - if (preload_add(ti, exec)) - slb_allocate_user(mm, exec); - } - - /* Libraries and mmaps. */ - if (!is_kernel_addr(mm->mmap_base)) { - if (preload_add(ti, mm->mmap_base)) - slb_allocate_user(mm, mm->mmap_base); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - -void preload_new_slb_context(unsigned long start, unsigned long sp) -{ - struct thread_info *ti = current_thread_info(); - struct mm_struct *mm = current->mm; - unsigned long heap = mm->start_brk; - - WARN_ON(irqs_disabled()); - - /* see above */ - if (ti->slb_preload_nr + 3 > SLB_PRELOAD_NR) - return; - - hard_irq_disable(); - - /* Userspace entry address. */ - if (!is_kernel_addr(start)) { - if (preload_add(ti, start)) - slb_allocate_user(mm, start); - } - - /* Top of stack, grows down. */ - if (!is_kernel_addr(sp)) { - if (preload_add(ti, sp)) - slb_allocate_user(mm, sp); - } - - /* Bottom of heap, grows up. 
*/ - if (heap && !is_kernel_addr(heap)) { - if (preload_add(ti, heap)) - slb_allocate_user(mm, heap); - } - - /* see switch_slb */ - asm volatile("isync" : : : "memory"); - - local_irq_enable(); -} - static void slb_cache_slbie_kernel(unsigned int index) { unsigned long slbie_data = get_paca()->slb_cache[index]; From 17b45ccf09882e0c808ad2cf62acdc90ad968746 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:27 +0530 Subject: [PATCH 21/32] powerpc/64s/hash: Restrict stress_hpt_struct memblock region to within RMA limit When HV=0 & IR/DR=0, the Hash MMU is said to be in Virtual Real Addressing Mode during early boot. During this, we should ensure that memory region allocations for stress_hpt_struct happen from within the RMA region, as otherwise the boot might get stuck while doing memset of this region. The history behind why we have the RMA region limitation is better explained in these 2 patches [1] & [2]. This patch ensures that memset to stress_hpt_struct during early boot does not cross ppc64_rma_size boundary.
[1]: https://lore.kernel.org/all/20190710052018.14628-1-sjitindarsingh@gmail.com/ [2]: https://lore.kernel.org/all/87wp54usvj.fsf@linux.vnet.ibm.com/ Fixes: 6b34a099faa12 ("powerpc/64s/hash: add stress_hpt kernel boot option to increase hash faults") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/ada1173933ea7617a994d6ee3e54ced8797339fc.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 3aee3af614af..c99be1286d51 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1302,11 +1302,14 @@ static void __init htab_initialize(void) unsigned long table; unsigned long pteg_count; unsigned long prot; - phys_addr_t base = 0, size = 0, end; + phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE; u64 i; DBG(" -> htab_initialize()\n"); + if (firmware_has_feature(FW_FEATURE_LPAR)) + limit = ppc64_rma_size; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; @@ -1322,7 +1325,7 @@ static void __init htab_initialize(void) // Too early to use nr_cpu_ids, so use NR_CPUS tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS, __alignof__(struct stress_hpt_struct), - 0, MEMBLOCK_ALLOC_ANYWHERE); + MEMBLOCK_LOW_LIMIT, limit); memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS); stress_hpt_struct = __va(tmp); @@ -1356,11 +1359,10 @@ static void __init htab_initialize(void) mmu_hash_ops.hpte_clear_all(); #endif } else { - unsigned long limit = MEMBLOCK_ALLOC_ANYWHERE; table = memblock_phys_alloc_range(htab_size_bytes, htab_size_bytes, - 0, limit); + MEMBLOCK_LOW_LIMIT, limit); if (!table) panic("ERROR: Failed to allocate %pa bytes below %pa\n", &htab_size_bytes, &limit); From 
eae40a6da63faa9fb63ff61f8fa2b3b57da78a84 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:28 +0530 Subject: [PATCH 22/32] powerpc/64s/ptdump: Fix kernel_hash_pagetable dump for ISA v3.00 HPTE format HPTE format was changed since Power9 (ISA 3.0) onwards. While dumping kernel hash page tables, nothing gets printed on powernv P9+. This patch utilizes the helpers added in the patch tagged as fixes, to convert new format to old format and dump the hptes. This fix is only needed for native_find() (powernv), since pseries continues to work fine with the old format. Fixes: 6b243fcfb5f1e ("powerpc/64: Simplify adaptation to new ISA v3.00 HPTE format") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/4c2bb9e5b3cfbc0dd80b61b67cdd3ccfc632684c.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/ptdump/hashpagetable.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index a6baa6166d94..671d0dc00c6d 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -216,6 +216,8 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 vpn = hpt_vpn(ea, vsid, ssize); hash = hpt_hash(vpn, shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + want_v = hpte_old_to_new_v(want_v); /* to check in the secondary hash table, we invert the hash */ if (!primary) @@ -229,6 +231,10 @@ static int native_find(unsigned long ea, int psize, bool primary, u64 *v, u64 /* HPTE matches */ *v = be64_to_cpu(hptep->v); *r = be64_to_cpu(hptep->r); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + *v = hpte_new_to_old_v(*v, *r); + *r = hpte_new_to_old_r(*r); + } return 0; } ++hpte_group; From 178dd2ee2b72817a67a8814c35a65fd901b325ba Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:29 +0530 
Subject: [PATCH 23/32] powerpc/64s/hash: Fix phys_addr_t printf format in htab_initialize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We get below errors when we try to enable debug logs in book3s64/hash_utils.c This patch fixes these errors related to phys_addr_t printf format. arch/powerpc/mm/book3s64/hash_utils.c: In function ‘htab_initialize’: arch/powerpc/mm/book3s64/hash_utils.c:1401:21: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 2 has type ‘phys_addr_t’ {aka ‘long long unsigned int’} [-Werror=format=] 1401 | DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", arch/powerpc/mm/book3s64/hash_utils.c:1401:21: error: format ‘%lx’ expects argument of type ‘long unsigned int’, but argument 3 has type ‘phys_addr_t’ {aka ‘long long unsigned int’} [-Werror=format=] cc1: all warnings being treated as errors make[6]: *** [../scripts/Makefile.build:287: arch/powerpc/mm/book3s64/hash_utils.o] Error 1 Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/4873e9692fc4411099c9741005d218d5e734c345.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index c99be1286d51..0509c0a436d2 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1394,8 +1394,8 @@ static void __init htab_initialize(void) size = end - base; base = (unsigned long)__va(base); - DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", - base, size, prot); + pr_debug("creating mapping for region: 0x%pa..0x%pa (prot: %lx)\n", + &base, &size, prot); if ((base + size) >= H_VMALLOC_START) { pr_warn("Outside the supported range\n"); From fec40fe7e6dc08c97370420301377ee031199a6d Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 
20:27:30 +0530 Subject: [PATCH 24/32] powerpc/64s/hash: Improve hash mmu printk messages Let's use pr_info() instead of printk() in order to utilize the pr_fmt set to "hash-mmu:". This improves the debug messages that are printed during kernel bootup. Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/a2af842f85c099cdbd19bf468606960c5226a079.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 0509c0a436d2..2fa98d26876a 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -952,7 +952,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned long node, block_size = be64_to_cpu(addr_prop[1]); if (block_size != (16 * GB)) return 0; - printk(KERN_INFO "Huge page(16GB) memory: " + pr_info("Huge page(16GB) memory: " "addr = 0x%lX size = 0x%lX pages = %d\n", phys_addr, block_size, expected_pages); if (phys_addr + block_size * expected_pages <= memblock_end_of_DRAM()) { @@ -1135,7 +1135,7 @@ static void __init htab_init_page_sizes(void) mmu_vmemmap_psize = mmu_virtual_psize; #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - printk(KERN_DEBUG "Page orders: linear mapping = %d, " + pr_info("Page orders: linear mapping = %d, " "virtual = %d, io = %d" #ifdef CONFIG_SPARSEMEM_VMEMMAP ", vmemmap = %d" @@ -1313,7 +1313,7 @@ static void __init htab_initialize(void) if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { mmu_kernel_ssize = MMU_SEGSIZE_1T; mmu_highuser_ssize = MMU_SEGSIZE_1T; - printk(KERN_INFO "Using 1TB segments\n"); + pr_info("Using 1TB segments\n"); } if (stress_slb_enabled) @@ -1869,7 +1869,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, * in vmalloc space, so switch vmalloc * to 4k pages */ - printk(KERN_ALERT "Reducing vmalloc segment " + pr_alert("Reducing vmalloc segment " "to 4kB
pages because of " "non-cacheable mapping\n"); psize = mmu_vmalloc_psize = MMU_PAGE_4K; From b80691e25ec632d020b90eb9de3af0f956dff0a0 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:31 +0530 Subject: [PATCH 25/32] powerpc/64s/hash: Hash hpt_order should be only available with Hash MMU This disables creating the hpt_order debugfs entry in radix mode. Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/99237176a51c73e85f4a7edd60a2460017882d69.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 2fa98d26876a..e63befc96708 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -2434,6 +2434,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { + if (radix_enabled()) + return 0; debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; From b296fda58d1d095c95c8207b09856b2ceafa1397 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:32 +0530 Subject: [PATCH 26/32] powerpc/64s/hash: Update directMap page counters for Hash Update the directMap page counters for Hash. Hash by default always uses mmu_linear_psize only, for its directMap. However, once the kernel has booted and the dmesg log has wrapped around, there is no way of knowing the kernel linear pagesize with Hash mmu. Features like debug_page_alloc can make mmu_linear_psize be PAGE_SIZE instead of PMD / PUD mappings. It would be easier if we have this info printed in proc meminfo similar to Radix for debugging purposes.
Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/208e6f946d2ba9c1e2b8b4f665728abe5c891e7c.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/hash_utils.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index e63befc96708..31162dbad05c 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -449,6 +450,7 @@ static __init void hash_kfence_map_pool(void) { unsigned long kfence_pool_start, kfence_pool_end; unsigned long prot = pgprot_val(PAGE_KERNEL); + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; if (!kfence_pool) return; @@ -459,6 +461,7 @@ static __init void hash_kfence_map_pool(void) BUG_ON(htab_bolt_mapping(kfence_pool_start, kfence_pool_end, kfence_pool, prot, mmu_linear_psize, mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, KFENCE_POOL_SIZE >> pshift); memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); } @@ -1234,6 +1237,7 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, int nid, pgprot_t prot) { int rc; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; if (end >= H_VMALLOC_START) { pr_warn("Outside the supported range\n"); @@ -1251,17 +1255,22 @@ int hash__create_section_mapping(unsigned long start, unsigned long end, mmu_kernel_ssize); BUG_ON(rc2 && (rc2 != -ENOENT)); } + update_page_count(mmu_linear_psize, (end - start) >> pshift); return rc; } int hash__remove_section_mapping(unsigned long start, unsigned long end) { + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; + int rc = htab_remove_mapping(start, end, mmu_linear_psize, mmu_kernel_ssize); if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); + if (!rc) + 
update_page_count(mmu_linear_psize, -((end - start) >> pshift)); return rc; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -1304,6 +1313,7 @@ static void __init htab_initialize(void) unsigned long prot; phys_addr_t base = 0, size = 0, end, limit = MEMBLOCK_ALLOC_ANYWHERE; u64 i; + unsigned int pshift = mmu_psize_defs[mmu_linear_psize].shift; DBG(" -> htab_initialize()\n"); @@ -1404,6 +1414,8 @@ static void __init htab_initialize(void) BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), prot, mmu_linear_psize, mmu_kernel_ssize)); + + update_page_count(mmu_linear_psize, size >> pshift); } hash_kfence_map_pool(); memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); @@ -1425,6 +1437,8 @@ static void __init htab_initialize(void) BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, __pa(tce_alloc_start), prot, mmu_linear_psize, mmu_kernel_ssize)); + update_page_count(mmu_linear_psize, + (tce_alloc_end - tce_alloc_start) >> pshift); } From 6394f0e8abe7ca3132faa1321c97c53d0994aecc Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:33 +0530 Subject: [PATCH 27/32] powerpc/64s/pgtable: Enable directMap counters in meminfo for Hash This patch enables the directMap counters to be printed in proc/meminfo for Hash mmu. 
With this patch on a system with 8G of DRAM we can see the entire RAM mapped with 16M pagesize: cat /proc/meminfo |grep -i direct DirectMap4k: 0 kB DirectMap64k: 0 kB DirectMap16M: 8388608 kB DirectMap16G: 0 kB Tested with devdax too: root@buildroot:/# ndctl create-namespace -r region0 -m devdax -s 2G { "dev":"namespace0.0", "mode":"devdax", "map":"dev", "size":"2032.00 MiB (2130.71 MB)", "uuid":"aa383ded-cd99-43a0-979f-5225467cfb40", "daxregion":{ "id":0, "size":"2032.00 MiB (2130.71 MB)", "align":16777216, "devices":[ { "chardev":"dax0.0", "size":"2032.00 MiB (2130.71 MB)", "target_node":0, "align":"16.00 MiB (16.78 MB)", "mode":"devdax" } ] }, "align":16777216 } root@buildroot:/# cat /proc/meminfo |grep -i direct DirectMap4k: 0 kB DirectMap64k: 0 kB DirectMap16M: 10485760 kB DirectMap16G: 0 kB root@buildroot:/# ndctl destroy-namespace -f all destroyed 1 namespace root@buildroot:/# cat /proc/meminfo |grep -i direct DirectMap4k: 0 kB DirectMap64k: 0 kB DirectMap16M: 8388608 kB DirectMap16G: 0 kB Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/ffe47912e17075649b6ce5b2ee5d7f3eb5352e0b.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/pgtable.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index c9431ae7f78a..e3485db7de02 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -510,20 +510,21 @@ atomic_long_t direct_pages_count[MMU_PAGE_COUNT]; void arch_report_meminfo(struct seq_file *m) { - /* - * Hash maps the memory with one size mmu_linear_psize. 
- * So don't bother to print these on hash - */ - if (!radix_enabled()) - return; seq_printf(m, "DirectMap4k: %8lu kB\n", atomic_long_read(&direct_pages_count[MMU_PAGE_4K]) << 2); - seq_printf(m, "DirectMap64k: %8lu kB\n", + seq_printf(m, "DirectMap64k: %8lu kB\n", atomic_long_read(&direct_pages_count[MMU_PAGE_64K]) << 6); - seq_printf(m, "DirectMap2M: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); - seq_printf(m, "DirectMap1G: %8lu kB\n", - atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); + if (radix_enabled()) { + seq_printf(m, "DirectMap2M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_2M]) << 11); + seq_printf(m, "DirectMap1G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_1G]) << 20); + } else { + seq_printf(m, "DirectMap16M: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16M]) << 14); + seq_printf(m, "DirectMap16G: %8lu kB\n", + atomic_long_read(&direct_pages_count[MMU_PAGE_16G]) << 24); + } } #endif /* CONFIG_PROC_FS */ From 3d44be297e7e01357b95dd13d2b335e6550ccfcd Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:34 +0530 Subject: [PATCH 28/32] powerpc/ptdump: Dump PXX level info for kernel_page_tables This patch adds PGD/PUD/PMD/PTE level information while dumping kernel page tables. Before this patch it was hard to identify which entries belongs to which page table level e.g. 
~ # dmesg |grep -i radix [0.000000] radix-mmu: Mapped 0x0000000000000000-0x0000000005400000 with 2.00 MiB pages (exec) [0.000000] radix-mmu: Mapped 0x0000000005400000-0x0000000040000000 with 2.00 MiB pages [0.000000] radix-mmu: Mapped 0x0000000040000000-0x0000000100000000 with 1.00 GiB pages [0.000000] radix-mmu: Initializing Radix MMU Before: ---[ Start of kernel VM ]--- 0xc000000000000000-0xc000000003ffffff XXX 64M r X pte valid present dirty accessed 0xc000000004000000-0xc00000003fffffff XXX 960M r w pte valid present dirty accessed 0xc000000040000000-0xc0000000ffffffff XXX 3G r w pte valid present dirty accessed ... ---[ vmemmap start ]--- 0xc00c000000000000-0xc00c0000003fffff XXX 4M r w pte valid present dirty accessed After: ---[ Start of kernel VM ]--- 0xc000000000000000-0xc000000003ffffff XXX 64M PMD r X pte valid present dirty accessed 0xc000000004000000-0xc00000003fffffff XXX 960M PMD r w pte valid present dirty accessed 0xc000000040000000-0xc0000000ffffffff XXX 3G PUD r w pte valid present dirty accessed ... 
---[ vmemmap start ]--- 0xc00c000000000000-0xc00c0000003fffff XXX 4M PMD r w pte valid present dirty accessed Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/95defb675ee5607ef3923a1e6aeac39311b8fad4.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/ptdump/8xx.c | 5 +++++ arch/powerpc/mm/ptdump/book3s64.c | 5 +++++ arch/powerpc/mm/ptdump/ptdump.c | 1 + arch/powerpc/mm/ptdump/ptdump.h | 1 + arch/powerpc/mm/ptdump/shared.c | 5 +++++ 5 files changed, 17 insertions(+) diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 4ca9cf7a90c9..ff845f251724 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -71,18 +71,23 @@ static const struct flag_info flag_array[] = { struct ptdump_pg_level pg_level[5] = { { /* pgd */ + .name = "PGD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 6b2da9241d4c..e8a21c6dc32e 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -104,18 +104,23 @@ static const struct flag_info flag_array[] = { struct ptdump_pg_level pg_level[5] = { { /* pgd */ + .name = "PGD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, diff --git 
a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index b2358d794855..0d499aebee72 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -178,6 +178,7 @@ static void dump_addr(struct pg_state *st, unsigned long addr) pt_dump_seq_printf(st->seq, REG "-" REG " ", st->start_address, addr - 1); pt_dump_seq_printf(st->seq, " " REG " ", st->start_pa); pt_dump_size(st->seq, addr - st->start_address); + pt_dump_seq_printf(st->seq, "%s ", pg_level[st->level].name); } static void note_prot_wx(struct pg_state *st, unsigned long addr) diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h index 4232aa4b57ea..12aa9eca8b0c 100644 --- a/arch/powerpc/mm/ptdump/ptdump.h +++ b/arch/powerpc/mm/ptdump/ptdump.h @@ -13,6 +13,7 @@ struct flag_info { struct ptdump_pg_level { const struct flag_info *flag; + char name[4]; size_t num; u64 mask; }; diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index 58998960eb9a..edc69da19b85 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -69,18 +69,23 @@ static const struct flag_info flag_array[] = { struct ptdump_pg_level pg_level[5] = { { /* pgd */ + .name = "PGD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* p4d */ + .name = "P4D", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ + .name = "PUD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pmd */ + .name = "PMD", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pte */ + .name = "PTE", .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, From 2a492d6b38c2943c9d2f9008f31a8bb3afc3a40b Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:35 +0530 Subject: [PATCH 29/32] powerpc/64s/slb: Make preload_add return type as void We dropped preload_new_slb_context() & slb_setup_new_exec() in a previous patch. That means we don't really need preload_add() return type anymore. 
So let's make its return type void. Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/88b07223e93b0fc56c1119f6889b7d9c83e20109.1761834163.git.ritesh.list@gmail.com --- arch/powerpc/mm/book3s64/slb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 7e053c561a09..042b762fc0d2 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -294,7 +294,7 @@ static bool preload_hit(struct thread_info *ti, unsigned long esid) return false; } -static bool preload_add(struct thread_info *ti, unsigned long ea) +static void preload_add(struct thread_info *ti, unsigned long ea) { unsigned char idx; unsigned long esid; @@ -308,7 +308,7 @@ static bool preload_add(struct thread_info *ti, unsigned long ea) esid = ea >> SID_SHIFT; if (preload_hit(ti, esid)) - return false; + return; idx = (ti->slb_preload_tail + ti->slb_preload_nr) % SLB_PRELOAD_NR; ti->slb_preload_esid[idx] = esid; @@ -316,8 +316,6 @@ static bool preload_add(struct thread_info *ti, unsigned long ea) ti->slb_preload_tail = (ti->slb_preload_tail + 1) % SLB_PRELOAD_NR; else ti->slb_preload_nr++; - - return true; } static void preload_age(struct thread_info *ti) From 5b3a426affbd30a4293d284ab0d37164a4064531 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 30 Oct 2025 20:27:36 +0530 Subject: [PATCH 30/32] powerpc/64s/slb: Add no_slb_preload early cmdline param The no_slb_preload cmdline option can be useful for quickly disabling and/or testing the performance impact of userspace slb preloads. Recently there was an slb multi-hit issue due to the slb preload cache which was very difficult to triage. This cmdline option allows quickly disabling preloads to verify whether the issue exists in the preload cache or somewhere else. This can also be a useful option to see the effect of slb preloads for any application workload e.g.
number of slb faults with or w/o slb preloads. with slb_preload: slb_faults (minimal initrd boot): 15 slb_faults (full systemd boot): 300 with no_slb_preload: slb_faults (minimal initrd boot): 33 slb_faults (full systemd boot): 138180 Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/de484b55c45d831bc2db63945f455153c89a9a65.1761834163.git.ritesh.list@gmail.com --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ arch/powerpc/mm/book3s64/hash_utils.c | 3 +++ arch/powerpc/mm/book3s64/internal.h | 7 +++++++ arch/powerpc/mm/book3s64/slb.c | 15 +++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1c10190d583d..d962d275dac7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7192,6 +7192,9 @@ them frequently to increase the rate of SLB faults on kernel addresses. + no_slb_preload [PPC,EARLY] + Disables slb preloading for userspace. 
+ sunrpc.min_resvport= sunrpc.max_resvport= [NFS,SUNRPC] diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 31162dbad05c..9dc5889d6ecb 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1329,6 +1329,9 @@ static void __init htab_initialize(void) if (stress_slb_enabled) static_branch_enable(&stress_slb_key); + if (no_slb_preload) + static_branch_enable(&no_slb_preload_key); + if (stress_hpt_enabled) { unsigned long tmp; static_branch_enable(&stress_hpt_key); diff --git a/arch/powerpc/mm/book3s64/internal.h b/arch/powerpc/mm/book3s64/internal.h index c26a6f0c90fc..cad08d83369c 100644 --- a/arch/powerpc/mm/book3s64/internal.h +++ b/arch/powerpc/mm/book3s64/internal.h @@ -22,6 +22,13 @@ static inline bool stress_hpt(void) return static_branch_unlikely(&stress_hpt_key); } +extern bool no_slb_preload; +DECLARE_STATIC_KEY_FALSE(no_slb_preload_key); +static inline bool slb_preload_disabled(void) +{ + return static_branch_unlikely(&no_slb_preload_key); +} + void hpt_do_stress(unsigned long ea, unsigned long hpte_group); void exit_lazy_flush_tlb(struct mm_struct *mm, bool always_flush); diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index 042b762fc0d2..15f73abd1506 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -42,6 +42,15 @@ early_param("stress_slb", parse_stress_slb); __ro_after_init DEFINE_STATIC_KEY_FALSE(stress_slb_key); +bool no_slb_preload __initdata; +static int __init parse_no_slb_preload(char *p) +{ + no_slb_preload = true; + return 0; +} +early_param("no_slb_preload", parse_no_slb_preload); +__ro_after_init DEFINE_STATIC_KEY_FALSE(no_slb_preload_key); + static void assert_slb_presence(bool present, unsigned long ea) { #ifdef CONFIG_DEBUG_VM @@ -299,6 +308,9 @@ static void preload_add(struct thread_info *ti, unsigned long ea) unsigned char idx; unsigned long esid; + if (slb_preload_disabled()) + return; + 
if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { /* EAs are stored >> 28 so 256MB segments don't need clearing */ if (ea & ESID_MASK_1T) @@ -412,6 +424,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) copy_mm_to_paca(mm); + if (slb_preload_disabled()) + return; + /* * We gradually age out SLBs after a number of context switches to * reduce reload overhead of unused entries (like we do with FP/VEC From 8d398324967a6e380e92a82e28581ac1e1c2982f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 27 Oct 2025 15:05:40 +0100 Subject: [PATCH 31/32] powerpc/powermac: backlight: Include <linux/of.h> Include <linux/of.h> to avoid dependency on backlight header to include it. Declares of_find_node_by_name(), of_property_match_string() and of_node_put(). Signed-off-by: Thomas Zimmermann Fixes: 243ce64b2b37 ("backlight: Do not include <linux/fb.h> in header file") Reported-by: Naresh Kamboju Reviewed-by: Daniel Thompson (RISCstar) Closes: https://lore.kernel.org/linuxppc-dev/CA+G9fYs8fn5URQx2+s2oNxdUgZkSrdLC0P1tNBW_n-6BaBkK2Q@mail.gmail.com/ Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powermac/backlight.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powermac/backlight.c b/arch/powerpc/platforms/powermac/backlight.c index 79741370c40c..1796327955c6 100644 --- a/arch/powerpc/platforms/powermac/backlight.c +++ b/arch/powerpc/platforms/powermac/backlight.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include From 9b36c7fc5aa5f2c6e6eeb9f312fdfe61b4291c9f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 27 Oct 2025 15:05:41 +0100 Subject: [PATCH 32/32] macintosh/via-pmu-backlight: Include <linux/fb.h> and <linux/of.h> Include <linux/fb.h> and <linux/of.h> to avoid dependency on backlight header to include them. Declares of_machine_is_compatible() and defines FB_BACKLIGHT_MAX.
Signed-off-by: Thomas Zimmermann Fixes: 243ce64b2b37 ("backlight: Do not include <linux/fb.h> in header file") Reported-by: Naresh Kamboju Reviewed-by: Daniel Thompson (RISCstar) Closes: https://lore.kernel.org/linuxppc-dev/CA+G9fYs8fn5URQx2+s2oNxdUgZkSrdLC0P1tNBW_n-6BaBkK2Q@mail.gmail.com/ Signed-off-by: Michael Ellerman --- drivers/macintosh/via-pmu-backlight.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/macintosh/via-pmu-backlight.c b/drivers/macintosh/via-pmu-backlight.c index 26bd9ed5e664..d91825bb0a5c 100644 --- a/drivers/macintosh/via-pmu-backlight.c +++ b/drivers/macintosh/via-pmu-backlight.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include