From 7537bae8b6eb635583e0e6260f61d13ddbd52087 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 9 Feb 2026 15:43:09 -0800 Subject: [PATCH 1/8] powercap: intel_rapl: Remove incorrect CPU check in PMU context The RAPL MSR read path incorrectly validates CPU context when called from the PMU subsystem: if (atomic) { if (unlikely(smp_processor_id() != cpu)) return -EIO; rdmsrq(ra->reg.msr, ra->value); } This check fails for package-scoped MSRs like RAPL energy counters, which are readable from any CPU within the package. The perf tool avoids hitting this check by validating against /sys/bus/event_source/devices/power/cpumask before opening events. However, turbostat does not perform this validation and may attempt reads from non-lead CPUs, causing the check to fail and return zero power values. Since package-scoped MSRs are architecturally accessible from any CPU in the package, remove the CPU matching check. Also rename 'atomic' to 'pmu_ctx' to clarify this indicates PMU context where rdmsrq() can be used directly instead of rdmsrq_safe_on_cpu(). Fixes: 748d6ba43afd ("powercap: intel_rapl: Enable MSR-based RAPL PMU support") Signed-off-by: Kuppuswamy Sathyanarayanan Tested-by: Furquim Ulisses Reviewed-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260209234310.1440722-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 6 +++--- drivers/powercap/intel_rapl_msr.c | 12 +++++------- include/linux/intel_rapl.h | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 3ff6da3bf4e6..3705d0608a0f 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -254,7 +254,7 @@ static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, bool xlate, u64 *data, - bool atomic); + bool pmu_ctx); static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); @@ -832,7 +832,7 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) */ static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, bool xlate, u64 *data, - bool atomic) + bool pmu_ctx) { u64 value; enum rapl_primitives prim_fixed = prim_fixups(rd, prim); @@ -854,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, pmu_ctx)) { pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index a2bc0a9c1e10..3d5e7f56d68a 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -110,16 +110,14 @@ static int rapl_cpu_down_prep(unsigned int cpu) return 0; } -static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) +static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool pmu_ctx) { /* - * When called from atomic-context (eg PMU event handler) - * perform MSR read directly using rdmsrq(). + * When called from PMU context, perform MSR read directly using + * rdmsrq() without IPI overhead. 
Package-scoped MSRs are readable + * from any CPU in the package. */ - if (atomic) { - if (unlikely(smp_processor_id() != cpu)) - return -EIO; - + if (pmu_ctx) { rdmsrq(ra->reg.msr, ra->value); goto out; } diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index f479ef5b3341..fa1f328d6712 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra, bool atomic); + int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; From 525e0064f3d81764277036036932e873608a47af Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Mon, 9 Feb 2026 15:43:10 -0800 Subject: [PATCH 2/8] powercap: intel_rapl: Expose all package CPUs in PMU cpumask Currently, the RAPL PMU cpumask only includes one CPU per package (typically the lead_cpu) for both MSR and TPMI interfaces. This forces tools to pin their operations to that specific CPU, even though package-scoped registers are readable from any CPU within the package. Change the cpumask to include all online CPUs in each package. This allows tools like perf and turbostat to read RAPL events from any CPU in the package without requiring special handling to find and use the designated lead_cpu. The change refactors get_pmu_cpu() into set_pmu_cpumask() which populates the cpumask with all CPUs belonging to each RAPL package instead of returning a single CPU. This improves flexibility for userspace tools while maintaining correctness since package-scoped RAPL MSRs are architecturally accessible from any CPU in the package. 
Signed-off-by: Kuppuswamy Sathyanarayanan Tested-by: Furquim Ulisses Reviewed-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260209234310.1440722-3-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 3705d0608a0f..539625531709 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1590,23 +1590,21 @@ static struct rapl_pmu rapl_pmu; /* PMU helpers */ -static int get_pmu_cpu(struct rapl_package *rp) +static void set_pmu_cpumask(struct rapl_package *rp, cpumask_var_t mask) { int cpu; if (!rp->has_pmu) - return nr_cpu_ids; + return; /* Only TPMI & MSR RAPL are supported for now */ if (rp->priv->type != RAPL_IF_TPMI && rp->priv->type != RAPL_IF_MSR) - return nr_cpu_ids; + return; /* TPMI/MSR RAPL uses any CPU in the package for PMU */ for_each_online_cpu(cpu) if (topology_physical_package_id(cpu) == rp->id) - return cpu; - - return nr_cpu_ids; + cpumask_set_cpu(cpu, mask); } static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu) @@ -1883,7 +1881,6 @@ static ssize_t cpumask_show(struct device *dev, { struct rapl_package *rp; cpumask_var_t cpu_mask; - int cpu; int ret; if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) @@ -1895,9 +1892,7 @@ static ssize_t cpumask_show(struct device *dev, /* Choose a cpu for each RAPL Package */ list_for_each_entry(rp, &rapl_packages, plist) { - cpu = get_pmu_cpu(rp); - if (cpu < nr_cpu_ids) - cpumask_set_cpu(cpu, cpu_mask); + set_pmu_cpumask(rp, cpu_mask); } cpus_read_unlock(); From c900e33e30e9d32fe8cfc89202ee339f9a66aabc Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 5 Feb 2026 12:53:54 +0000 Subject: [PATCH 3/8] Documentation: PM: Document intel_idle.table command line option Add the 'intel_idle.table' command line option description to the 
intel_idle document in admin-guide. Signed-off-by: Artem Bityutskiy [ rjw: Subject adjustment, changelog edits] Link: https://patch.msgid.link/20260205125354.632891-1-artem.bityutskiy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_idle.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index ed6f055d4b14..188d52cd26e8 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -260,6 +260,17 @@ mode to off when the CPU is in any one of the available idle states. This may help performance of a sibling CPU at the expense of a slightly higher wakeup latency for the idle CPU. +The ``table`` argument allows customization of idle state latency and target +residency. The syntax is a comma-separated list of ``name:latency:residency`` +entries, where ``name`` is the idle state name, ``latency`` is the exit latency +in microseconds, and ``residency`` is the target residency in microseconds. It +is not necessary to specify all idle states; only those to be customized. For +example, ``C1:1:3,C6:50:100`` sets the exit latency and target residency for +C1 and C6 to 1/3 and 50/100 microseconds, respectively. Remaining idle states +keep their default values. The driver verifies that deeper idle states have +higher latency and target residency than shallower ones. Also, target +residency cannot be smaller than exit latency. If any of these conditions is +not met, the driver ignores the entire ``table`` parameter. .. 
_intel-idle-core-and-package-idle-states: From c7d54dafa042cf379859dba265fe5afef6fa8770 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 11 Feb 2026 14:34:01 -0800 Subject: [PATCH 4/8] powercap: intel_rapl_tpmi: Remove FW_BUG from invalid version check On partitioned systems, multiple TPMI instances may exist per package, but RAPL registers are only valid on one instance since RAPL has package-scope control. Other instances return invalid versions during domain parsing, which is expected behavior on such systems. Currently this generates a firmware bug warning: intel_rapl_tpmi: [Firmware Bug]: Invalid version Remove the FW_BUG tag, downgrade to pr_debug(), and update the message to clarify that invalid versions are expected on partitioned systems where only one instance can be valid. Fixes: 9eef7f9da928 ("powercap: intel_rapl: Introduce RAPL TPMI interface driver") Reported-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260211223401.1575776-1-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_tpmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 0a0b85f4528b..0f8abdc592bc 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -157,7 +157,7 @@ static int parse_one_domain(struct tpmi_rapl_package *trp, u32 offset) tpmi_domain_flags = tpmi_domain_header >> 32 & 0xffff; if (tpmi_domain_version == TPMI_VERSION_INVALID) { - pr_warn(FW_BUG "Invalid version\n"); + pr_debug("Invalid version, other instances may be valid\n"); return -ENODEV; } From e5c9ffc6ae1bcdb1062527d611043681ac301aca Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Tue, 17 Feb 2026 00:20:02 +0530 Subject: [PATCH 5/8] cpuidle: Skip governor when only one idle state is available On certain platforms (PowerNV systems without a power-mgt DT node), cpuidle may register only a single idle state. In cases where that single state is a polling state (state 0), the ladder governor may incorrectly treat state 1 as the first usable state and pass an out-of-bounds index. This can lead to a NULL enter callback being invoked, ultimately resulting in a system crash. [ 13.342636] cpuidle-powernv : Only Snooze is available [ 13.351854] Faulting instruction address: 0x00000000 [ 13.376489] NIP [0000000000000000] 0x0 [ 13.378351] LR [c000000001e01974] cpuidle_enter_state+0x2c4/0x668 Fix this by adding a bail-out in cpuidle_select() that returns state 0 directly when state_count <= 1, bypassing the governor and keeping the tick running. Fixes: dc2251bf98c6 ("cpuidle: Eliminate the CPUIDLE_DRIVER_STATE_START symbol") Signed-off-by: Aboorva Devarajan Reviewed-by: Christian Loehle Link: https://patch.msgid.link/20260216185005.1131593-2-aboorvad@linux.ibm.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c7876e9e024f..65fbb8e807b9 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -359,6 +359,16 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev, int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { + /* + * If there is only a single idle state (or none), there is nothing + * meaningful for the governor to choose. Skip the governor and + * always use state 0 with the tick running. + */ + if (drv->state_count <= 1) { + *stop_tick = false; + return 0; + } + return cpuidle_curr_governor->select(drv, dev, stop_tick); } From 9b9c0ff095f04c27da1f761d77c19cd53594d18e Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Tue, 17 Feb 2026 00:20:03 +0530 Subject: [PATCH 6/8] cpuidle: haltpoll: Remove single state handling cpuidle systems where the governor has no choice because there's only a single idle state are now handled by cpuidle core and bypass the governor, so remove the related handling. Signed-off-by: Aboorva Devarajan Reviewed-by: Christian Loehle [ rjw: Extended the change to drop a redundant local variable ] Link: https://patch.msgid.link/20260216185005.1131593-3-aboorvad@linux.ibm.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/haltpoll.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/cpuidle/governors/haltpoll.c b/drivers/cpuidle/governors/haltpoll.c index 663b7f164d20..b367d10279c8 100644 --- a/drivers/cpuidle/governors/haltpoll.c +++ b/drivers/cpuidle/governors/haltpoll.c @@ -50,9 +50,7 @@ static int haltpoll_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - - if (!drv->state_count || latency_req == 0) { + if (cpuidle_governor_latency_req(dev->cpu) == 0) { *stop_tick = false; return 0; } From 825d5d347935d5fc339df969c572e382393f40ec Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 17 Feb 2026 00:20:04 +0530 Subject: [PATCH 7/8] cpuidle: teo: Remove single state handling cpuidle systems where the governor has no choice because there's only a single idle state are now handled by cpuidle core and bypass the governor, so remove the related handling. Signed-off-by: Christian Loehle Link: https://patch.msgid.link/20260216185005.1131593-4-aboorvad@linux.ibm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 80f3ba942a06..bec0142377b8 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -338,12 +338,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ cpu_data->sleep_length_ns = KTIME_MAX; - /* Check if there is any choice in the first place. 
*/ - if (drv->state_count < 2) { - idx = 0; - goto out_tick; - } - if (!dev->states_usage[0].disable) idx = 0; From 93983a9f3beea791c21d77c2425488ad327d4fda Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 17 Feb 2026 00:20:05 +0530 Subject: [PATCH 8/8] cpuidle: menu: Remove single state handling cpuidle systems where the governor has no choice because there's only a single idle state are now handled by cpuidle core and bypass the governor, so remove the related handling. Signed-off-by: Christian Loehle [ rjw: Rebase on top of the cpuidle changes merged recently ] Link: https://patch.msgid.link/20260216185005.1131593-5-aboorvad@linux.ibm.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c6052055ba0f..899ff16ff1fe 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -281,7 +281,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, data->bucket = BUCKETS - 1; } - if (drv->state_count <= 1 || latency_req == 0 || + if (latency_req == 0 || ((data->next_timer_ns < drv->states[1].target_residency_ns || latency_req < drv->states[1].exit_latency_ns) && !dev->states_usage[0].disable)) {