From 4ca191cec17a997d0e3b2cd312f3a884288acc27 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 4 Feb 2026 09:01:00 -0600 Subject: [PATCH 1/9] x86/boot/sev: Move SEV decompressor variables into the .data section As part of the work to remove the dependency on calling into the decompressor code (startup_64()) for a UEFI boot, a call to rmpadjust() was removed from sev_enable() in favor of checking the value of the snp_vmpl variable. When booting through a non-UEFI path and calling startup_64(), the call to sev_enable() is performed before the BSS section is zeroed. With the removal of the rmpadjust() call and the corresponding check of the return code, the snp_vmpl variable is checked. Since the kernel is running at VMPL0, the snp_vmpl variable will not have been set and should be the default value of 0. However, since the call occurs before the BSS is zeroed, the snp_vmpl variable may not actually be zero, which will cause the guest boot to fail. Since the decompressor relocates itself, the BSS would need to be cleared both before and after the relocation, but this would, in effect, cause all of the changes to BSS variables before relocation to be lost after relocation. Instead, move the snp_vmpl variable into the .data section so that it is initialized and the value made safe during relocation. As a pre-caution against future changes, move other SEV-related decompressor variables into the .data section, too. Fixes: 68a501d7fd82 ("x86/boot: Drop redundant RMPADJUST in SEV SVSM presence check") Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Ard Biesheuvel Reviewed-by: Changyuan Lyu Tested-by: Kevin Hui Tested-by: Changyuan Lyu Cc: stable@vger.kernel.org Link: https://patch.msgid.link/5648b7de5b0a5d0dfef3785f9582b718678c6448.1770217260.git.thomas.lendacky@amd.com --- arch/x86/boot/compressed/sev.c | 8 ++++---- arch/x86/boot/startup/sev-shared.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index c8c1464b3a56..46b54720d91d 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -28,17 +28,17 @@ #include "sev.h" static struct ghcb boot_ghcb_page __aligned(PAGE_SIZE); -struct ghcb *boot_ghcb; +struct ghcb *boot_ghcb __section(".data"); #undef __init #define __init #define __BOOT_COMPRESSED -u8 snp_vmpl; -u16 ghcb_version; +u8 snp_vmpl __section(".data"); +u16 ghcb_version __section(".data"); -u64 boot_svsm_caa_pa; +u64 boot_svsm_caa_pa __section(".data"); /* Include code for early handlers */ #include "../../boot/startup/sev-shared.c" diff --git a/arch/x86/boot/startup/sev-shared.c b/arch/x86/boot/startup/sev-shared.c index a0fa8bb2b945..d9ac3a929d33 100644 --- a/arch/x86/boot/startup/sev-shared.c +++ b/arch/x86/boot/startup/sev-shared.c @@ -31,7 +31,7 @@ static u32 cpuid_std_range_max __ro_after_init; static u32 cpuid_hyp_range_max __ro_after_init; static u32 cpuid_ext_range_max __ro_after_init; -bool sev_snp_needs_sfw; +bool sev_snp_needs_sfw __section(".data"); void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) From 9073428bb204d921ae15326bb7d4558d9d269aab Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Tue, 3 Feb 2026 16:24:03 -0600 Subject: [PATCH 2/9] x86/sev: Allow IBPB-on-Entry feature for SNP guests The SEV-SNP IBPB-on-Entry feature does not require a guest-side implementation. 
It was added in Zen5 h/w, after the first SNP Zen implementation, and thus was not accounted for when the initial set of SNP features were added to the kernel. In its abundant precaution, commit 8c29f0165405 ("x86/sev: Add SEV-SNP guest feature negotiation support") included SEV_STATUS' IBPB-on-Entry bit as a reserved bit, thereby masking guests from using the feature. Allow guests to make use of IBPB-on-Entry when supported by the hypervisor, as the bit is now architecturally defined and safe to expose. Fixes: 8c29f0165405 ("x86/sev: Add SEV-SNP guest feature negotiation support") Signed-off-by: Kim Phillips Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikunj A Dadhania Reviewed-by: Tom Lendacky Cc: stable@kernel.org Link: https://patch.msgid.link/20260203222405.4065706-2-kim.phillips@amd.com --- arch/x86/boot/compressed/sev.c | 1 + arch/x86/coco/sev/core.c | 1 + arch/x86/include/asm/msr-index.h | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 46b54720d91d..e468476e9e4a 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -188,6 +188,7 @@ bool sev_es_check_ghcb_fault(unsigned long address) MSR_AMD64_SNP_RESERVED_BIT13 | \ MSR_AMD64_SNP_RESERVED_BIT15 | \ MSR_AMD64_SNP_SECURE_AVIC | \ + MSR_AMD64_SNP_RESERVED_BITS19_22 | \ MSR_AMD64_SNP_RESERVED_MASK) #ifdef CONFIG_AMD_SECURE_AVIC diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index 907981b94c40..7ed3da998489 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -89,6 +89,7 @@ static const char * const sev_status_feat_names[] = { [MSR_AMD64_SNP_VMSA_REG_PROT_BIT] = "VMSARegProt", [MSR_AMD64_SNP_SMT_PROT_BIT] = "SMTProt", [MSR_AMD64_SNP_SECURE_AVIC_BIT] = "SecureAVIC", + [MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT] = "IBPBOnEntry", }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index da5275d8eda6..6673601246b3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -740,7 +740,10 @@ #define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT) #define MSR_AMD64_SNP_SECURE_AVIC_BIT 18 #define MSR_AMD64_SNP_SECURE_AVIC BIT_ULL(MSR_AMD64_SNP_SECURE_AVIC_BIT) -#define MSR_AMD64_SNP_RESV_BIT 19 +#define MSR_AMD64_SNP_RESERVED_BITS19_22 GENMASK_ULL(22, 19) +#define MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT 23 +#define MSR_AMD64_SNP_IBPB_ON_ENTRY BIT_ULL(MSR_AMD64_SNP_IBPB_ON_ENTRY_BIT) +#define MSR_AMD64_SNP_RESV_BIT 24 #define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT) #define MSR_AMD64_SAVIC_CONTROL 0xc0010138 #define MSR_AMD64_SAVIC_EN_BIT 0 From 3d1973a0c76a78a4728cff13648a188ed486cf44 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Wed, 25 Feb 2026 20:30:23 +0100 Subject: [PATCH 3/9] x86/boot: Handle relative CONFIG_EFI_SBAT_FILE file paths CONFIG_EFI_SBAT_FILE can be a relative path. When compiling using a different output directory (O=) the build currently fails because it can't find the filename set in CONFIG_EFI_SBAT_FILE: arch/x86/boot/compressed/sbat.S: Assembler messages: arch/x86/boot/compressed/sbat.S:6: Error: file not found: kernel.sbat Add $(srctree) as include dir for sbat.o. [ bp: Massage commit message. 
] Fixes: 61b57d35396a ("x86/efi: Implement support for embedding SBAT data for x86") Signed-off-by: Jan Stancek Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Vitaly Kuznetsov Cc: Link: https://patch.msgid.link/f4eda155b0cef91d4d316b4e92f5771cb0aa7187.1772047658.git.jstancek@redhat.com --- arch/x86/boot/compressed/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 68f9d7a1683b..b8b2b7bea1d3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -113,6 +113,7 @@ vmlinux-objs-$(CONFIG_EFI_SBAT) += $(obj)/sbat.o ifdef CONFIG_EFI_SBAT $(obj)/sbat.o: $(CONFIG_EFI_SBAT_FILE) +AFLAGS_sbat.o += -I $(srctree) endif $(obj)/vmlinux: $(vmlinux-objs-y) $(vmlinux-libs-y) FORCE From 48084cc153a5b0fbf0aa98d47670d3be0b9f64d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Mar 2026 11:55:40 +0100 Subject: [PATCH 4/9] x86/numa: Store extra copy of numa_nodes_parsed The topology setup code needs to know the total number of physical nodes enumerated in SRAT; however NUMA_EMU can cause the existing numa_nodes_parsed bitmap to be fictitious. Therefore, keep a copy of the bitmap specifically to retain the physical node count. Suggested-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110059.889884023@infradead.org --- arch/x86/include/asm/numa.h | 6 ++++++ arch/x86/mm/numa.c | 8 ++++++++ arch/x86/mm/srat.c | 2 ++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 53ba39ce010c..a9063f332fa6 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -22,6 +22,7 @@ extern int numa_off; */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern nodemask_t numa_phys_nodes_parsed __initdata; static inline void set_apicid_to_node(int apicid, s16 node) { @@ -48,6 +49,7 @@ extern void __init init_cpu_to_node(void); extern void numa_add_cpu(unsigned int cpu); extern void numa_remove_cpu(unsigned int cpu); extern void init_gi_nodes(void); +extern int num_phys_nodes(void); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } @@ -55,6 +57,10 @@ static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(unsigned int cpu) { } static inline void numa_remove_cpu(unsigned int cpu) { } static inline void init_gi_nodes(void) { } +static inline int num_phys_nodes(void) +{ + return 1; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEBUG_PER_CPU_MAPS diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 7a97327140df..99d0a9332c14 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -48,6 +48,8 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +nodemask_t numa_phys_nodes_parsed __initdata; + int numa_cpu_node(int cpu) { u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu); @@ -57,6 +59,11 @@ int numa_cpu_node(int cpu) return NUMA_NO_NODE; } +int __init num_phys_nodes(void) +{ + return bitmap_weight(numa_phys_nodes_parsed.bits, MAX_NUMNODES); +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); @@ -210,6 +217,7 @@ static int __init dummy_numa_init(void) 0LLU, PFN_PHYS(max_pfn) - 1); node_set(0, numa_nodes_parsed); + node_set(0, numa_phys_nodes_parsed); numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 6f8e0f21c710..44ca66651756 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -57,6 +57,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); + node_set(node, numa_phys_nodes_parsed); pr_debug("SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", pxm, apic_id, node); } @@ -97,6 +98,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) set_apicid_to_node(apic_id, node); node_set(node, numa_nodes_parsed); + node_set(node, numa_phys_nodes_parsed); pr_debug("SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", pxm, apic_id, node); } From ae6730ff42b3a13d94b405edeb5e40108b6d21b6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Mar 2026 11:55:41 +0100 Subject: [PATCH 5/9] x86/topo: Add topology_num_nodes_per_package() Use the MADT and SRAT table data to compute __num_nodes_per_package. Specifically, SRAT has already been parsed in x86_numa_init(), which is called before acpi_boot_init() which parses MADT. So both are available in topology_init_possible_cpus(). This number is useful to divinate the various Intel CoD/SNC and AMD NPS modes, since the platforms are failing to provide this otherwise. Doing it this way is independent of the number of online CPUs and other such shenanigans. 
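For illustration only (not part of the patch), a minimal user-space sketch of the arithmetic described above: the SRAT-derived physical node count divided (rounding up) by the MADT-derived package count. The sample numbers are assumptions (e.g. a dual-socket part in SNC-3 would report six nodes and two packages), and DIV_ROUND_UP is spelled out here because this is plain C rather than kernel code:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int phys_nodes = 6;	/* assumed SRAT count: dual socket, SNC-3 */
	unsigned int packages = 2;	/* assumed MADT count: two sockets */

	/* prints 3, i.e. the nodes-per-package value for this configuration */
	printf("nodes per package: %u\n", DIV_ROUND_UP(phys_nodes, packages));
	return 0;
}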
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Tony Luck Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.004091624@infradead.org --- arch/x86/include/asm/topology.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/topology.c | 13 +++++++++++-- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 1fadf0cf520c..0ba9bdb99871 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -155,6 +155,7 @@ extern unsigned int __max_logical_packages; extern unsigned int __max_threads_per_core; extern unsigned int __num_threads_per_package; extern unsigned int __num_cores_per_package; +extern unsigned int __num_nodes_per_package; const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); @@ -179,6 +180,11 @@ static inline unsigned int topology_num_threads_per_package(void) return __num_threads_per_package; } +static inline unsigned int topology_num_nodes_per_package(void) +{ + return __num_nodes_per_package; +} + #ifdef CONFIG_X86_LOCAL_APIC int topology_get_logical_id(u32 apicid, enum x86_topology_domains at_level); #else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1c3261cae40c..a8ff4376c286 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -95,6 +95,9 @@ EXPORT_SYMBOL(__max_dies_per_package); unsigned int __max_logical_packages __ro_after_init = 1; EXPORT_SYMBOL(__max_logical_packages); +unsigned int __num_nodes_per_package __ro_after_init = 1; +EXPORT_SYMBOL(__num_nodes_per_package); + unsigned int __num_cores_per_package __ro_after_init = 1; EXPORT_SYMBOL(__num_cores_per_package); diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 23190a786d31..eafcb1fc185a 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "cpu.h" @@ -492,11 +493,19 @@ void __init topology_init_possible_cpus(void) set_nr_cpu_ids(allowed); cnta = domain_weight(TOPO_PKG_DOMAIN); - cntb = domain_weight(TOPO_DIE_DOMAIN); __max_logical_packages = cnta; + + pr_info("Max. logical packages: %3u\n", __max_logical_packages); + + cntb = num_phys_nodes(); + __num_nodes_per_package = DIV_ROUND_UP(cntb, cnta); + + pr_info("Max. logical nodes: %3u\n", cntb); + pr_info("Num. nodes per package:%3u\n", __num_nodes_per_package); + + cntb = domain_weight(TOPO_DIE_DOMAIN); __max_dies_per_package = 1U << (get_count_order(cntb) - get_count_order(cnta)); - pr_info("Max. logical packages: %3u\n", cnta); pr_info("Max. logical dies: %3u\n", cntb); pr_info("Max. dies per package: %3u\n", __max_dies_per_package); From 717b64d58cff6fb97f97be07e382ed7641167a56 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Mar 2026 11:55:42 +0100 Subject: [PATCH 6/9] x86/topo: Replace x86_has_numa_in_package .. with the brand spanking new topology_num_nodes_per_package(). Having the topology setup determine this value during MADT/SRAT parsing before SMP bringup avoids having to detect this situation when building the SMP topology masks. 
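As a rough user-space model (an assumption-laden sketch, not the kernel code), the per-CPU discovery that set_cpu_sibling_map() used to do reduces to a single predicate on the firmware-derived count; num_nodes_per_package below is a stand-in for topology_num_nodes_per_package() and the value 3 is an assumed SNC-3 configuration:

#include <stdbool.h>
#include <stdio.h>

/* stand-in for topology_num_nodes_per_package(); 3 assumed for SNC-3 */
static unsigned int num_nodes_per_package = 3;

static bool numa_in_package(void)
{
	/* replaces the old per-CPU x86_has_numa_in_package detection */
	return num_nodes_per_package > 1;
}

int main(void)
{
	printf("NUMA in package: %s\n", numa_in_package() ? "yes" : "no");
	return 0;
}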
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Tony Luck Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.123701837@infradead.org --- arch/x86/kernel/smpboot.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5cd6950ab672..db3e481cdbb2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -468,13 +468,6 @@ static int x86_cluster_flags(void) } #endif -/* - * Set if a package/die has multiple NUMA nodes inside. - * AMD Magny-Cours, Intel Cluster-on-Die, and Intel - * Sub-NUMA Clustering have this. - */ -static bool x86_has_numa_in_package; - static struct sched_domain_topology_level x86_topology[] = { SDTL_INIT(tl_smt_mask, cpu_smt_flags, SMT), #ifdef CONFIG_SCHED_CLUSTER @@ -496,7 +489,7 @@ static void __init build_sched_topology(void) * PKG domain since the NUMA domains will auto-magically create the * right spanning domains based on the SLIT. */ - if (x86_has_numa_in_package) { + if (topology_num_nodes_per_package() > 1) { unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); @@ -550,7 +543,7 @@ int arch_sched_node_distance(int from, int to) case INTEL_GRANITERAPIDS_X: case INTEL_ATOM_DARKMONT_X: - if (!x86_has_numa_in_package || topology_max_packages() == 1 || + if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 || d < REMOTE_DISTANCE) return d; @@ -606,7 +599,7 @@ void set_cpu_sibling_map(int cpu) o = &cpu_data(i); if (match_pkg(c, o) && !topology_same_node(c, o)) - x86_has_numa_in_package = true; + WARN_ON_ONCE(topology_num_nodes_per_package() == 1); if ((i == cpu) || (has_smt && match_smt(c, o))) link_mask(topology_sibling_cpumask, cpu, i); From 528d89a4707e5bfd86e30823c45dbb66877df900 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Mar 2026 11:55:43 +0100 Subject: [PATCH 7/9] x86/topo: Fix SNC topology mess Per 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode"), the original crazy SNC-3 SLIT table was: node distances: node 0 1 2 3 4 5 0: 10 15 17 21 28 26 1: 15 10 15 23 26 23 2: 17 15 10 26 23 21 3: 21 28 26 10 15 17 4: 23 26 23 15 10 15 5: 26 23 21 17 15 10 And per: https://lore.kernel.org/lkml/20250825075642.GQ3245006@noisy.programming.kicks-ass.net/ The suggestion was to average the off-trace clusters to restore sanity. 
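For illustration only (not part of the patch), a small user-space program that performs that averaging on the SLIT table quoted above: summing the 3x3 off-trace block in both directions gives 434, and 434 / 18 rounds down to the distance 24 that the sanitized table later in this patch uses.

#include <stdio.h>

int main(void)
{
	/* the SNC-3 SLIT table quoted in the commit message above */
	static const int slit[6][6] = {
		{ 10, 15, 17, 21, 28, 26 },
		{ 15, 10, 15, 23, 26, 23 },
		{ 17, 15, 10, 26, 23, 21 },
		{ 21, 28, 26, 10, 15, 17 },
		{ 23, 26, 23, 15, 10, 15 },
		{ 26, 23, 21, 17, 15, 10 },
	};
	int u = 3, sum = 0;	/* nodes per package */

	/* off-trace block between nodes 0-2 and nodes 3-5, both directions */
	for (int i = 0; i < u; i++) {
		for (int j = u; j < 2 * u; j++)
			sum += slit[i][j] + slit[j][i];
	}

	/* prints 24: the averaged off-trace distance */
	printf("averaged off-trace distance: %d\n", sum / (2 * u * u));
	return 0;
}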
However, 4d6dd05d07d0 implements this under various assumptions: - anything GNR/CWF with numa_in_package; - there will never be more than 2 packages; - the off-trace cluster will have distance >20 And then HPE shows up with a machine that matches the Vendor-Family-Model checks but looks like this: Here's an 8 socket (2 chassis) HPE system with SNC enabled: node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0: 10 12 16 16 16 16 18 18 40 40 40 40 40 40 40 40 1: 12 10 16 16 16 16 18 18 40 40 40 40 40 40 40 40 2: 16 16 10 12 18 18 16 16 40 40 40 40 40 40 40 40 3: 16 16 12 10 18 18 16 16 40 40 40 40 40 40 40 40 4: 16 16 18 18 10 12 16 16 40 40 40 40 40 40 40 40 5: 16 16 18 18 12 10 16 16 40 40 40 40 40 40 40 40 6: 18 18 16 16 16 16 10 12 40 40 40 40 40 40 40 40 7: 18 18 16 16 16 16 12 10 40 40 40 40 40 40 40 40 8: 40 40 40 40 40 40 40 40 10 12 16 16 16 16 18 18 9: 40 40 40 40 40 40 40 40 12 10 16 16 16 16 18 18 10: 40 40 40 40 40 40 40 40 16 16 10 12 18 18 16 16 11: 40 40 40 40 40 40 40 40 16 16 12 10 18 18 16 16 12: 40 40 40 40 40 40 40 40 16 16 18 18 10 12 16 16 13: 40 40 40 40 40 40 40 40 16 16 18 18 12 10 16 16 14: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 10 12 15: 40 40 40 40 40 40 40 40 18 18 16 16 16 16 12 10 10 = Same chassis and socket 12 = Same chassis and socket (SNC) 16 = Same chassis and adjacent socket 18 = Same chassis and non-adjacent socket 40 = Different chassis Turns out, the 'max 2 packages' thing is only relevant to the SNC-3 parts, the smaller parts do 8 sockets (like usual). The above SLIT table is sane, but violates the previous assumptions and trips a WARN. Now that the topology code has a sensible measure of nodes-per-package, we can use that to divinate the SNC mode at hand, and only fix up SNC-3 topologies. There is a 'healthy' amount of paranoia code validating the assumptions on the SLIT table, a simple pr_err(FW_BUG) print on failure and a fallback to using the regular table. Lets see how long this lasts :-) Fixes: 4d6dd05d07d0 ("sched/topology: Fix sched domain build error for GNR, CWF in SNC-3 mode") Reported-by: Kyle Meyer Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: K Prateek Nayak Tested-by: Zhang Rui Tested-by: Chen Yu Tested-by: Kyle Meyer Link: https://patch.msgid.link/20260303110100.238361290@infradead.org --- arch/x86/kernel/smpboot.c | 188 ++++++++++++++++++++++++++++---------- 1 file changed, 142 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index db3e481cdbb2..294a8ea60298 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -506,33 +506,149 @@ static void __init build_sched_topology(void) } #ifdef CONFIG_NUMA -static int sched_avg_remote_distance; -static int avg_remote_numa_distance(void) +/* + * Test if the on-trace cluster at (N,N) is symmetric. + * Uses upper triangle iteration to avoid obvious duplicates. 
+ */ +static bool slit_cluster_symmetric(int N) { - int i, j; - int distance, nr_remote, total_distance; + int u = topology_num_nodes_per_package(); - if (sched_avg_remote_distance > 0) - return sched_avg_remote_distance; - - nr_remote = 0; - total_distance = 0; - for_each_node_state(i, N_CPU) { - for_each_node_state(j, N_CPU) { - distance = node_distance(i, j); - - if (distance >= REMOTE_DISTANCE) { - nr_remote++; - total_distance += distance; - } + for (int k = 0; k < u; k++) { + for (int l = k; l < u; l++) { + if (node_distance(N + k, N + l) != + node_distance(N + l, N + k)) + return false; } } - if (nr_remote) - sched_avg_remote_distance = total_distance / nr_remote; - else - sched_avg_remote_distance = REMOTE_DISTANCE; - return sched_avg_remote_distance; + return true; +} + +/* + * Return the package-id of the cluster, or ~0 if indeterminate. + * Each node in the on-trace cluster should have the same package-id. + */ +static u32 slit_cluster_package(int N) +{ + int u = topology_num_nodes_per_package(); + u32 pkg_id = ~0; + + for (int n = 0; n < u; n++) { + const struct cpumask *cpus = cpumask_of_node(N + n); + int cpu; + + for_each_cpu(cpu, cpus) { + u32 id = topology_logical_package_id(cpu); + + if (pkg_id == ~0) + pkg_id = id; + if (pkg_id != id) + return ~0; + } + } + + return pkg_id; +} + +/* + * Validate the SLIT table is of the form expected for SNC, specifically: + * + * - each on-trace cluster should be symmetric, + * - each on-trace cluster should have a unique package-id. + * + * If you NUMA_EMU on top of SNC, you get to keep the pieces. + */ +static bool slit_validate(void) +{ + int u = topology_num_nodes_per_package(); + u32 pkg_id, prev_pkg_id = ~0; + + for (int pkg = 0; pkg < topology_max_packages(); pkg++) { + int n = pkg * u; + + /* + * Ensure the on-trace cluster is symmetric and each cluster + * has a different package id. + */ + if (!slit_cluster_symmetric(n)) + return false; + pkg_id = slit_cluster_package(n); + if (pkg_id == ~0) + return false; + if (pkg && pkg_id == prev_pkg_id) + return false; + + prev_pkg_id = pkg_id; + } + + return true; +} + +/* + * Compute a sanitized SLIT table for SNC; notably SNC-3 can end up with + * asymmetric off-trace clusters, reflecting physical assymmetries. However + * this leads to 'unfortunate' sched_domain configurations. + * + * For example dual socket GNR with SNC-3: + * + * node distances: + * node 0 1 2 3 4 5 + * 0: 10 15 17 21 28 26 + * 1: 15 10 15 23 26 23 + * 2: 17 15 10 26 23 21 + * 3: 21 28 26 10 15 17 + * 4: 23 26 23 15 10 15 + * 5: 26 23 21 17 15 10 + * + * Fix things up by averaging out the off-trace clusters; resulting in: + * + * node 0 1 2 3 4 5 + * 0: 10 15 17 24 24 24 + * 1: 15 10 15 24 24 24 + * 2: 17 15 10 24 24 24 + * 3: 24 24 24 10 15 17 + * 4: 24 24 24 15 10 15 + * 5: 24 24 24 17 15 10 + */ +static int slit_cluster_distance(int i, int j) +{ + static int slit_valid = -1; + int u = topology_num_nodes_per_package(); + long d = 0; + int x, y; + + if (slit_valid < 0) { + slit_valid = slit_validate(); + if (!slit_valid) + pr_err(FW_BUG "SLIT table doesn't have the expected form for SNC -- fixup disabled!\n"); + else + pr_info("Fixing up SNC SLIT table.\n"); + } + + /* + * Is this a unit cluster on the trace? + */ + if ((i / u) == (j / u) || !slit_valid) + return node_distance(i, j); + + /* + * Off-trace cluster. + * + * Notably average out the symmetric pair of off-trace clusters to + * ensure the resulting SLIT table is symmetric. 
+ */ + x = i - (i % u); + y = j - (j % u); + + for (i = x; i < x + u; i++) { + for (j = y; j < y + u; j++) { + d += node_distance(i, j); + d += node_distance(j, i); + } + } + + return d / (2*u*u); } int arch_sched_node_distance(int from, int to) @@ -542,34 +658,14 @@ int arch_sched_node_distance(int from, int to) switch (boot_cpu_data.x86_vfm) { case INTEL_GRANITERAPIDS_X: case INTEL_ATOM_DARKMONT_X: - - if (topology_max_packages() == 1 || topology_num_nodes_per_package() == 1 || - d < REMOTE_DISTANCE) + if (topology_max_packages() == 1 || + topology_num_nodes_per_package() < 3) return d; /* - * With SNC enabled, there could be too many levels of remote - * NUMA node distances, creating NUMA domain levels - * including local nodes and partial remote nodes. - * - * Trim finer distance tuning for NUMA nodes in remote package - * for the purpose of building sched domains. Group NUMA nodes - * in the remote package in the same sched group. - * Simplify NUMA domains and avoid extra NUMA levels including - * different remote NUMA nodes and local nodes. - * - * GNR and CWF don't expect systems with more than 2 packages - * and more than 2 hops between packages. Single average remote - * distance won't be appropriate if there are more than 2 - * packages as average distance to different remote packages - * could be different. + * Handle SNC-3 asymmetries. */ - WARN_ONCE(topology_max_packages() > 2, - "sched: Expect only up to 2 packages for GNR or CWF, " - "but saw %d packages when building sched domains.", - topology_max_packages()); - - d = avg_remote_numa_distance(); + return slit_cluster_distance(from, to); } return d; } From 59674fc9d0bfd96ce8a776680ee1cf22c28c9ac7 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 3 Mar 2026 11:55:44 +0100 Subject: [PATCH 8/9] x86/resctrl: Fix SNC detection Now that the x86 topology code has a sensible nodes-per-package measure that does not depend on the online status of CPUs, use this to divinate the SNC mode. Note that when Cluster on Die (CoD) is configured on older systems this will also show multiple NUMA nodes per package. Intel Resource Director Technology is incompatible with CoD. Print a warning and do not apply the MSR_RMID_SNC_CONFIG fixup. Signed-off-by: Tony Luck Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Tested-by: Zhang Rui Tested-by: Chen Yu Link: https://patch.msgid.link/aaCxbbgjL6OZ6VMd@agluck-desk3 Link: https://patch.msgid.link/20260303110100.367976706@infradead.org --- arch/x86/kernel/cpu/resctrl/monitor.c | 36 ++++----------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index e6a154240b8d..9bd87bae4983 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -364,7 +364,7 @@ void arch_mon_domain_online(struct rdt_resource *r, struct rdt_l3_mon_domain *d) msr_clear_bit(MSR_RMID_SNC_CONFIG, 0); } -/* CPU models that support MSR_RMID_SNC_CONFIG */ +/* CPU models that support SNC and MSR_RMID_SNC_CONFIG */ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { X86_MATCH_VFM(INTEL_ICELAKE_X, 0), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, 0), @@ -375,40 +375,14 @@ static const struct x86_cpu_id snc_cpu_ids[] __initconst = { {} }; -/* - * There isn't a simple hardware bit that indicates whether a CPU is running - * in Sub-NUMA Cluster (SNC) mode.
Infer the state by comparing the - * number of CPUs sharing the L3 cache with CPU0 to the number of CPUs in - * the same NUMA node as CPU0. - * It is not possible to accurately determine SNC state if the system is - * booted with a maxcpus=N parameter. That distorts the ratio of SNC nodes - * to L3 caches. It will be OK if system is booted with hyperthreading - * disabled (since this doesn't affect the ratio). - */ static __init int snc_get_config(void) { - struct cacheinfo *ci = get_cpu_cacheinfo_level(0, RESCTRL_L3_CACHE); - const cpumask_t *node0_cpumask; - int cpus_per_node, cpus_per_l3; - int ret; + int ret = topology_num_nodes_per_package(); - if (!x86_match_cpu(snc_cpu_ids) || !ci) + if (ret > 1 && !x86_match_cpu(snc_cpu_ids)) { + pr_warn("CoD enabled system? Resctrl not supported\n"); return 1; - - cpus_read_lock(); - if (num_online_cpus() != num_present_cpus()) - pr_warn("Some CPUs offline, SNC detection may be incorrect\n"); - cpus_read_unlock(); - - node0_cpumask = cpumask_of_node(cpu_to_node(0)); - - cpus_per_node = cpumask_weight(node0_cpumask); - cpus_per_l3 = cpumask_weight(&ci->shared_cpu_map); - - if (!cpus_per_node || !cpus_per_l3) - return 1; - - ret = cpus_per_l3 / cpus_per_node; + } /* sanity check: Only valid results are 1, 2, 3, 4, 6 */ switch (ret) { From b5ef09a77d0b5213268300eedd8a7d28b4e92d47 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 26 Feb 2026 17:03:07 -0800 Subject: [PATCH 9/9] x86/entry/vdso32: Work around libgcc unwinder bug The unwinder code in libgcc has a long standing bug which causes it to fail to pick up the signal frame CFI flag. This is a generic bug across all platforms. It affects the __kernel_sigreturn and __kernel_rt_sigreturn vdso entry points on i386. The x86-64 kernel doesn't provide a sigreturn stub, and so there is no kernel-provided code that is affected on x86-64. libgcc does have a legacy fallback path which happens to work as long as the bytes immediately before each of the sigreturn functions fall outside any function. This patch adds a nop before the ALIGN to each of the sigreturn stubs to ensure that this is, indeed, the case. The rest of the patch is just a comment which documents the invariants that need to be maintained for this legacy path to work correctly. This is a manifest bug: in the current vdso, __kernel_vsyscall is a multiple of 16 bytes long and thus __kernel_sigreturn does not have any padding in front of it. Closes: https://lore.kernel.org/lkml/f3412cc3e8f66d1853cc9d572c0f2fab076872b1.camel@xry111.site Fixes: 884961618ee5 ("x86/entry/vdso32: Remove open-coded DWARF in sigreturn.S") Reported-by: Xi Ruoyao Signed-off-by: H. Peter Anvin (Intel) Signed-off-by: Peter Zijlstra (Intel) Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124050 Link: https://patch.msgid.link/20260227010308.310342-1-hpa@zytor.com --- arch/x86/entry/vdso/vdso32/sigreturn.S | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index b433353bc8e3..b33fcc501ba3 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -35,9 +35,38 @@ #endif .endm +/* + * WARNING: + * + * A bug in the libgcc unwinder as of at least gcc 15.2 (2026) means that + * the unwinder fails to recognize the signal frame flag. + * + * There is a hacky legacy fallback path in libgcc which ends up + * getting invoked instead. It happens to work as long as BOTH of the + * following conditions are true: + * + * 1. 
There is at least one byte before each of the sigreturn + * functions which falls outside any function. This is enforced by + * an explicit nop instruction before the ALIGN. + * 2. The code sequences from each entry point up to and including + * the int $0x80 below need to match EXACTLY. Do not change them + * in any way. The exact byte sequences are: + * + * __kernel_sigreturn: + * 0: 58 pop %eax + * 1: b8 77 00 00 00 mov $0x77,%eax + * 6: cd 80 int $0x80 + * + * __kernel_rt_sigreturn: + * 0: b8 ad 00 00 00 mov $0xad,%eax + * 5: cd 80 int $0x80 + * + * For details, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=124050 + */ .text .globl __kernel_sigreturn .type __kernel_sigreturn,@function + nop /* libgcc hack: see comment above */ ALIGN __kernel_sigreturn: STARTPROC_SIGNAL_FRAME IA32_SIGFRAME_sigcontext @@ -52,6 +81,7 @@ SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function + nop /* libgcc hack: see comment above */ ALIGN __kernel_rt_sigreturn: STARTPROC_SIGNAL_FRAME IA32_RT_SIGFRAME_sigcontext