From f6bf47ab32e0863df50f5501d207dcdddb7fc507 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Feb 2026 22:10:10 +0000 Subject: [PATCH 1/9] arm64: io: Rename ioremap_prot() to __ioremap_prot() Rename our ioremap_prot() implementation to __ioremap_prot() and convert all arch-internal callers over to the new function. ioremap_prot() remains as a #define to __ioremap_prot() for generic_access_phys() and will be subsequently extended to handle user permissions in 'prot'. Cc: Zeng Heng Cc: Jinjiang Tu Cc: Catalin Marinas Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 11 ++++++----- arch/arm64/kernel/acpi.c | 2 +- arch/arm64/mm/ioremap.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 83e03abbb2ca..cd2fddfe814a 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -264,19 +264,20 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count) typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, pgprot_t *prot); int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); +void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot); -#define ioremap_prot ioremap_prot +#define ioremap_prot __ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) + __ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ - ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) #define ioremap_encrypted(addr, size) \ - ioremap_prot((addr), (size), PAGE_KERNEL) + __ioremap_prot((addr), (size), PAGE_KERNEL) /* * io{read,write}{16,32,64}be() macros @@ -297,7 +298,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size) if (pfn_is_map_memory(__phys_to_pfn(addr))) return (void __iomem *)__phys_to_virt(addr); - return ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); + return __ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); } /* diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index af90128cfed5..a9d884fd1d00 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -377,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot = __acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, prot); + return __ioremap_prot(phys, size, prot); } /* diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b12cbed9b5ad..6eef48ef6278 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -14,8 +14,8 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) return 0; } -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - pgprot_t pgprot) +void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t pgprot) { unsigned long last_addr = phys_addr + size - 1; @@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(ioremap_prot); +EXPORT_SYMBOL(__ioremap_prot); /* * Must be called after early_fixmap_init From 8f098037139b294050053123ab2bc0f819d08932 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 23 Feb 2026 22:10:11 +0000 Subject: [PATCH 2/9] arm64: io: Extract user memory type in ioremap_prot() The only caller of ioremap_prot() outside of the generic ioremap() implementation is generic_access_phys(), which passes a 'pgprot_t' value determined from the user mapping of the target 'pfn' being accessed by the kernel. On arm64, the 'pgprot_t' contains all of the non-address bits from the pte, including the permission controls, and so we end up returning a new user mapping from ioremap_prot() which faults when accessed from the kernel on systems with PAN: | Unable to handle kernel read from unreadable memory at virtual address ffff80008ea89000 | ... | Call trace: | __memcpy_fromio+0x80/0xf8 | generic_access_phys+0x20c/0x2b8 | __access_remote_vm+0x46c/0x5b8 | access_remote_vm+0x18/0x30 | environ_read+0x238/0x3e8 | vfs_read+0xe4/0x2b0 | ksys_read+0xcc/0x178 | __arm64_sys_read+0x4c/0x68 Extract only the memory type from the user 'pgprot_t' in ioremap_prot() and assert that we're being passed a user mapping, to protect us against any changes in future that may require additional handling. To avoid falsely flagging users of ioremap(), provide our own ioremap() macro which simply wraps __ioremap_prot(). Cc: Zeng Heng Cc: Jinjiang Tu Cc: Catalin Marinas Fixes: 893dea9ccd08 ("arm64: Add HAVE_IOREMAP_PROT support") Reported-by: Jinjiang Tu Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/io.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index cd2fddfe814a..8cbd1e96fd50 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -266,10 +266,23 @@ typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size, int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot); -#define ioremap_prot __ioremap_prot +static inline void __iomem *ioremap_prot(phys_addr_t phys, size_t size, + pgprot_t user_prot) +{ + pgprot_t prot; + ptdesc_t user_prot_val = pgprot_val(user_prot); -#define _PAGE_IOREMAP PROT_DEVICE_nGnRE + if (WARN_ON_ONCE(!(user_prot_val & PTE_USER))) + return NULL; + prot = __pgprot_modify(PAGE_KERNEL, PTE_ATTRINDX_MASK, + user_prot_val & PTE_ATTRINDX_MASK); + return __ioremap_prot(phys, size, prot); +} +#define ioremap_prot ioremap_prot + +#define ioremap(addr, size) \ + __ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_wc(addr, size) \ __ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ From 8a85b3131225a8c8143ba2ae29c0eef8c1f9117f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:30 +0000 Subject: [PATCH 3/9] arm64: gcs: Do not set PTE_SHARED on GCS mappings if FEAT_LPA2 is enabled When FEAT_LPA2 is enabled, bits 8-9 of the PTE replace the shareability attribute with bits 50-51 of the output address. The _PAGE_GCS{,_RO} definitions include the PTE_SHARED bits as 0b11 (this matches the other _PAGE_* definitions) but using this macro directly leads to the following panic when enabling GCS on a system/model with LPA2: Unable to handle kernel paging request at virtual address fffff1ffc32d8008 Mem abort info: ESR = 0x0000000096000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault Data abort info: ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 52-bit VAs, pgdp=0000000060f4d000 [fffff1ffc32d8008] pgd=100000006184b003, p4d=0000000000000000 Internal error: Oops: 0000000096000004 [#1] SMP CPU: 0 UID: 0 PID: 513 Comm: gcs_write_fault Tainted: G M 7.0.0-rc1 #1 PREEMPT Tainted: [M]=MACHINE_CHECK Hardware name: QEMU QEMU Virtual Machine, BIOS 2025.02-8+deb13u1 11/08/2025 pstate: 03402005 (nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : zap_huge_pmd+0x168/0x468 lr : zap_huge_pmd+0x2c/0x468 sp : ffff800080beb660 x29: ffff800080beb660 x28: fff00000c2058180 x27: ffff800080beb898 x26: fff00000c2058180 x25: ffff800080beb820 x24: 00c800010b600f41 x23: ffffc1ffc30af1a8 x22: fff00000c2058180 x21: 0000ffff8dc00000 x20: fff00000c2bc6370 x19: ffff800080beb898 x18: ffff800080bebb60 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000007 x14: 000000000000000a x13: 0000aaaacbbbffff x12: 0000000000000000 x11: 0000ffff8ddfffff x10: 00000000000001fe x9 : 0000ffff8ddfffff x8 : 0000ffff8de00000 x7 : 0000ffff8da00000 x6 : fff00000c2bc6370 x5 : 0000ffff8da00000 x4 : 000000010b600000 x3 : ffffc1ffc0000000 x2 : fff00000c2058180 x1 : fffff1ffc32d8000 x0 : 000000c00010b600 Call trace: zap_huge_pmd+0x168/0x468 (P) unmap_page_range+0xd70/0x1560 unmap_single_vma+0x48/0x80 unmap_vmas+0x90/0x180 unmap_region+0x88/0xe4 vms_complete_munmap_vmas+0xf8/0x1e0 do_vmi_align_munmap+0x158/0x180 do_vmi_munmap+0xac/0x160 __vm_munmap+0xb0/0x138 vm_munmap+0x14/0x20 gcs_free+0x70/0x80 mm_release+0x1c/0xc8 exit_mm_release+0x28/0x38 do_exit+0x190/0x8ec do_group_exit+0x34/0x90 get_signal+0x794/0x858 arch_do_signal_or_restart+0x11c/0x3e0 exit_to_user_mode_loop+0x10c/0x17c el0_da+0x8c/0x9c el0t_64_sync_handler+0xd0/0xf0 el0t_64_sync+0x198/0x19c Code: aa1603e2 d34cfc00 cb813001 8b011861 (f9400420) Similarly to how the kernel handles protection_map[], use a gcs_page_prot variable to store the protection bits and clear PTE_SHARED if LPA2 is enabled. Also remove the unused PAGE_GCS{,_RO} macros. Signed-off-by: Catalin Marinas Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack") Reported-by: Emanuele Rocca Cc: stable@vger.kernel.org Cc: Mark Brown Cc: Will Deacon Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable-prot.h | 3 --- arch/arm64/mm/mmap.c | 8 ++++++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index d27e8872fe3c..2b32639160de 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -164,9 +164,6 @@ static inline bool __pure lpa2_is_enabled(void) #define _PAGE_GCS (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER) #define _PAGE_GCS_RO (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER) -#define PAGE_GCS __pgprot(_PAGE_GCS) -#define PAGE_GCS_RO __pgprot(_PAGE_GCS_RO) - #define PIE_E0 ( \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS), PIE_GCS) | \ PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO), PIE_R) | \ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 08ee177432c2..75f343009b4b 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC }; +static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO; + /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. @@ -73,9 +75,11 @@ static int __init adjust_protection_map(void) protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY; } - if (lpa2_is_enabled()) + if (lpa2_is_enabled()) { for (int i = 0; i < ARRAY_SIZE(protection_map); i++) pgprot_val(protection_map[i]) &= ~PTE_SHARED; + gcs_page_prot &= ~PTE_SHARED; + } return 0; } @@ -87,7 +91,7 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = _PAGE_GCS_RO; + prot = gcs_page_prot; } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); From 47a8aad135ac1aed04b7b0c0a8157fd208075827 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:31 +0000 Subject: [PATCH 4/9] arm64: gcs: Honour mprotect(PROT_NONE) on shadow stack mappings vm_get_page_prot() short-circuits the protection_map[] lookup for a VM_SHADOW_STACK mapping since it uses a different PIE index from the typical read/write/exec permissions. However, the side effect is that it also ignores mprotect(PROT_NONE) by creating an accessible PTE. Special-case the !(vm_flags & VM_ACCESS_FLAGS) flags to use the protection_map[VM_NONE] permissions instead. No GCS attributes are required for an inaccessible PTE. Signed-off-by: Catalin Marinas Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack") Cc: stable@vger.kernel.org Cc: Mark Brown Cc: Will Deacon Cc: David Hildenbrand Reviewed-by: David Hildenbrand (Arm) Signed-off-by: Will Deacon --- arch/arm64/mm/mmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 75f343009b4b..92b2f5097a96 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags) /* Short circuit GCS to avoid bloating the table. */ if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) { - prot = gcs_page_prot; + /* Honour mprotect(PROT_NONE) on shadow stack mappings */ + if (vm_flags & VM_ACCESS_FLAGS) + prot = gcs_page_prot; + else + prot = pgprot_val(protection_map[VM_NONE]); } else { prot = pgprot_val(protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); From 9d1a7c4a457eac8a7e07e1685f95e54fa551db5e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 23 Feb 2026 17:45:32 +0000 Subject: [PATCH 5/9] kselftest: arm64: Check access to GCS after mprotect(PROT_NONE) A GCS mapping should not be accessible after mprotect(PROT_NONE). Add a kselftest for this scenario. Signed-off-by: Catalin Marinas Cc: Shuah Khan Cc: Mark Brown Cc: Will Deacon Cc: David Hildenbrand Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- .../signal/testcases/gcs_prot_none_fault.c | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c new file mode 100644 index 000000000000..2259f454a202 --- /dev/null +++ b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 ARM Limited + */ + +#include +#include +#include + +#include +#include + +#include "test_signals_utils.h" +#include "testcases.h" + +static uint64_t *gcs_page; +static bool post_mprotect; + +#ifndef __NR_map_shadow_stack +#define __NR_map_shadow_stack 453 +#endif + +static bool alloc_gcs(struct tdescr *td) +{ + long page_size = sysconf(_SC_PAGE_SIZE); + + gcs_page = (void *)syscall(__NR_map_shadow_stack, 0, + page_size, 0); + if (gcs_page == MAP_FAILED) { + fprintf(stderr, "Failed to map %ld byte GCS: %d\n", + page_size, errno); + return false; + } + + return true; +} + +static int gcs_prot_none_fault_trigger(struct tdescr *td) +{ + /* Verify that the page is readable (ie, not completely unmapped) */ + fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]); + + if (mprotect(gcs_page, sysconf(_SC_PAGE_SIZE), PROT_NONE) != 0) { + fprintf(stderr, "mprotect(PROT_NONE) failed: %d\n", errno); + return 0; + } + post_mprotect = true; + + /* This should trigger a fault if PROT_NONE is honoured for the GCS page */ + fprintf(stderr, "Read value after mprotect(PROT_NONE) 0x%lx\n", gcs_page[0]); + return 0; +} + +static int gcs_prot_none_fault_signal(struct tdescr *td, siginfo_t *si, + ucontext_t *uc) +{ + ASSERT_GOOD_CONTEXT(uc); + + /* A fault before mprotect(PROT_NONE) is unexpected. */ + if (!post_mprotect) + return 0; + + return 1; +} + +struct tdescr tde = { + .name = "GCS PROT_NONE fault", + .descr = "Read from GCS after mprotect(PROT_NONE) segfaults", + .feats_required = FEAT_GCS, + .timeout = 3, + .sig_ok = SIGSEGV, + .sanity_disabled = true, + .init = alloc_gcs, + .trigger = gcs_prot_none_fault_trigger, + .run = gcs_prot_none_fault_signal, +}; From bfd9c931d19aa59fb8371d557774fa169b15db9a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 18 Feb 2026 16:43:47 +0000 Subject: [PATCH 6/9] arm64: tlb: Allow XZR argument to TLBI ops The TLBI instruction accepts XZR as a register argument, and for TLBI operations with a register argument, there is no functional difference between using XZR or another GPR which contains zeroes. Operations without a register argument are encoded as if XZR were used. Allow the __TLBI_1() macro to use XZR when a register argument is all zeroes. Today this only results in a trivial code saving in __do_compat_cache_op()'s workaround for Neoverse-N1 erratum #1542419. In subsequent patches this pattern will be used more generally. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Marc Zyngier Cc: Oliver Upton Cc: Ryan Roberts Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a2d65d7d6aae..bf1cc9949dc8 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -38,12 +38,12 @@ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ - "tlbi " #op ", %0\n" \ + "tlbi " #op ", %x0\n" \ ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op ", %0", \ + "dsb ish\n tlbi " #op ", %x0", \ ARM64_WORKAROUND_REPEAT_TLBI, \ CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ - : : "r" (arg)) + : : "rZ" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) From a8f78680ee6bf795086384e8aea159a52814f827 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 18 Feb 2026 16:43:48 +0000 Subject: [PATCH 7/9] arm64: tlb: Optimize ARM64_WORKAROUND_REPEAT_TLBI The ARM64_WORKAROUND_REPEAT_TLBI workaround is used to mitigate several errata where broadcast TLBI;DSB sequences don't provide all the architecturally required synchronization. The workaround performs more work than necessary, and can have significant overhead. This patch optimizes the workaround, as explained below. The workaround was originally added for Qualcomm Falkor erratum 1009 in commit: d9ff80f83ecb ("arm64: Work around Falkor erratum 1009") As noted in the message for that commit, the workaround is applied even in cases where it is not strictly necessary. The workaround was later reused without changes for: * Arm Cortex-A76 erratum #1286807 SDEN v33: https://developer.arm.com/documentation/SDEN-885749/33-0/ * Arm Cortex-A55 erratum #2441007 SDEN v16: https://developer.arm.com/documentation/SDEN-859338/1600/ * Arm Cortex-A510 erratum #2441009 SDEN v19: https://developer.arm.com/documentation/SDEN-1873351/1900/ The important details to note are as follows: 1. All relevant errata only affect the ordering and/or completion of memory accesses which have been translated by an invalidated TLB entry. The actual invalidation of TLB entries is unaffected. 2. The existing workaround is applied to both broadcast and local TLB invalidation, whereas for all relevant errata it is only necessary to apply a workaround for broadcast invalidation. 3. The existing workaround replaces every TLBI with a TLBI;DSB;TLBI sequence, whereas for all relevant errata it is only necessary to execute a single additional TLBI;DSB sequence after any number of TLBIs are completed by a DSB. For example, for a sequence of batched TLBIs: TLBI [, ] TLBI [, ] TLBI [, ] DSB ISH ... the existing workaround will expand this to: TLBI [, ] DSB ISH // additional TLBI [, ] // additional TLBI [, ] DSB ISH // additional TLBI [, ] // additional TLBI [, ] DSB ISH // additional TLBI [, ] // additional DSB ISH ... whereas it is sufficient to have: TLBI [, ] TLBI [, ] TLBI [, ] DSB ISH TLBI [, ] // additional DSB ISH // additional Using a single additional TBLI and DSB at the end of the sequence can have significantly lower overhead as each DSB which completes a TLBI must synchronize with other PEs in the system, with potential performance effects both locally and system-wide. 4. The existing workaround repeats each specific TLBI operation, whereas for all relevant errata it is sufficient for the additional TLBI to use *any* operation which will be broadcast, regardless of which translation regime or stage of translation the operation applies to. For example, for a single TLBI: TLBI ALLE2IS DSB ISH ... the existing workaround will expand this to: TLBI ALLE2IS DSB ISH TLBI ALLE2IS // additional DSB ISH // additional ... whereas it is sufficient to have: TLBI ALLE2IS DSB ISH TLBI VALE1IS, XZR // additional DSB ISH // additional As the additional TLBI doesn't have to match a specific earlier TLBI, the additional TLBI can be implemented in separate code, with no memory of the earlier TLBIs. The additional TLBI can also use a cheaper TLBI operation. 5. The existing workaround is applied to both Stage-1 and Stage-2 TLB invalidation, whereas for all relevant errata it is only necessary to apply a workaround for Stage-1 invalidation. Architecturally, TLBI operations which invalidate only Stage-2 information (e.g. IPAS2E1IS) are not required to invalidate TLB entries which combine information from Stage-1 and Stage-2 translation table entries, and consequently may not complete memory accesses translated by those combined entries. In these cases, completion of memory accesses is only guaranteed after subsequent invalidation of Stage-1 information (e.g. VMALLE1IS). Taking the above points into account, this patch reworks the workaround logic to reduce overhead: * New __tlbi_sync_s1ish() and __tlbi_sync_s1ish_hyp() functions are added and used in place of any dsb(ish) which is used to complete broadcast Stage-1 TLB maintenance. When the ARM64_WORKAROUND_REPEAT_TLBI workaround is enabled, these helpers will execute an additional TLBI;DSB sequence. For consistency, it might make sense to add __tlbi_sync_*() helpers for local and stage 2 maintenance. For now I've left those with open-coded dsb() to keep the diff small. * The duplication of TLBIs in __TLBI_0() and __TLBI_1() is removed. This is no longer needed as the necessary synchronization will happen in __tlbi_sync_s1ish() or __tlbi_sync_s1ish_hyp(). * The additional TLBI operation is chosen to have minimal impact: - __tlbi_sync_s1ish() uses "TLBI VALE1IS, XZR". This is only used at EL1 or at EL2 with {E2H,TGE}=={1,1}, where it will target an unused entry for the reserved ASID in the kernel's own translation regime, and have no adverse affect. - __tlbi_sync_s1ish_hyp() uses "TLBI VALE2IS, XZR". This is only used in hyp code, where it will target an unused entry in the hyp code's TTBR0 mapping, and should have no adverse effect. * As __TLBI_0() and __TLBI_1() no longer replace each TLBI with a TLBI;DSB;TLBI sequence, batching TLBIs is worthwhile, and there's no need for arch_tlbbatch_should_defer() to consider ARM64_WORKAROUND_REPEAT_TLBI. When building defconfig with GCC 15.1.0, compared to v6.19-rc1, this patch saves ~1KiB of text, makes the vmlinux ~42KiB smaller, and makes the resulting Image 64KiB smaller: | [mark@lakrids:~/src/linux]% size vmlinux-* | text data bss dec hex filename | 21179831 19660919 708216 41548966 279fca6 vmlinux-after | 21181075 19660903 708216 41550194 27a0172 vmlinux-before | [mark@lakrids:~/src/linux]% ls -l vmlinux-* | -rwxr-xr-x 1 mark mark 157771472 Feb 4 12:05 vmlinux-after | -rwxr-xr-x 1 mark mark 157815432 Feb 4 12:05 vmlinux-before | [mark@lakrids:~/src/linux]% ls -l Image-* | -rw-r--r-- 1 mark mark 41007616 Feb 4 12:05 Image-after | -rw-r--r-- 1 mark mark 41073152 Feb 4 12:05 Image-before Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Marc Zyngier Cc: Oliver Upton Cc: Ryan Roberts Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 59 ++++++++++++++++++------------- arch/arm64/kernel/sys_compat.c | 2 +- arch/arm64/kvm/hyp/nvhe/mm.c | 2 +- arch/arm64/kvm/hyp/nvhe/tlb.c | 8 ++--- arch/arm64/kvm/hyp/pgtable.c | 2 +- arch/arm64/kvm/hyp/vhe/tlb.c | 10 +++--- 6 files changed, 47 insertions(+), 36 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bf1cc9949dc8..1416e652612b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -31,18 +31,10 @@ */ #define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op "\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op, \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : ) #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE \ "tlbi " #op ", %x0\n" \ - ALTERNATIVE("nop\n nop", \ - "dsb ish\n tlbi " #op ", %x0", \ - ARM64_WORKAROUND_REPEAT_TLBI, \ - CONFIG_ARM64_WORKAROUND_REPEAT_TLBI) \ : : "rZ" (arg)) #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg) @@ -181,6 +173,34 @@ static inline unsigned long get_trans_granule(void) (__pages >> (5 * (scale) + 1)) - 1; \ }) +#define __repeat_tlbi_sync(op, arg...) \ +do { \ + if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) \ + break; \ + __tlbi(op, ##arg); \ + dsb(ish); \ +} while (0) + +/* + * Complete broadcast TLB maintenance issued by the host which invalidates + * stage 1 information in the host's own translation regime. + */ +static inline void __tlbi_sync_s1ish(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale1is, 0); +} + +/* + * Complete broadcast TLB maintenance issued by hyp code which invalidates + * stage 1 translation information in any translation regime. + */ +static inline void __tlbi_sync_s1ish_hyp(void) +{ + dsb(ish); + __repeat_tlbi_sync(vale2is, 0); +} + /* * TLB Invalidation * ================ @@ -279,7 +299,7 @@ static inline void flush_tlb_all(void) { dsb(ishst); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -291,7 +311,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) asid = __TLBI_VADDR(0, ASID(mm)); __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); - dsb(ish); + __tlbi_sync_s1ish(); mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } @@ -345,20 +365,11 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { flush_tlb_page_nosync(vma, uaddr); - dsb(ish); + __tlbi_sync_s1ish(); } static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) { - /* - * TLB flush deferral is not required on systems which are affected by - * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation - * will have two consecutive TLBI instructions with a dsb(ish) in between - * defeating the purpose (i.e save overall 'dsb ish' cost). - */ - if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI)) - return false; - return true; } @@ -374,7 +385,7 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - dsb(ish); + __tlbi_sync_s1ish(); } /* @@ -509,7 +520,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, { __flush_tlb_range_nosync(vma->vm_mm, start, end, stride, last_level, tlb_level); - dsb(ish); + __tlbi_sync_s1ish(); } static inline void local_flush_tlb_contpte(struct vm_area_struct *vma, @@ -557,7 +568,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end dsb(ishst); __flush_tlb_range_op(vaale1is, start, pages, stride, 0, TLBI_TTL_UNKNOWN, false, lpa2_is_enabled()); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } @@ -571,7 +582,7 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) dsb(ishst); __tlbi(vaae1is, addr); - dsb(ish); + __tlbi_sync_s1ish(); isb(); } diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 4a609e9b65de..b9d4998c97ef 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -37,7 +37,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end) * We pick the reserved-ASID to minimise the impact. */ __tlbi(aside1is, __TLBI_VADDR(0, 0)); - dsb(ish); + __tlbi_sync_s1ish(); } ret = caches_clean_inval_user_pou(start, start + chunk); diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index ae8391baebc3..218976287d3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -271,7 +271,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot) */ dsb(ishst); __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); } diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 48da9ca9763f..3dc1ce0d27fe 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -169,7 +169,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -226,7 +226,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -240,7 +240,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt, false); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -266,5 +266,5 @@ void __kvm_flush_vm_context(void) /* Same remark as in enter_vmid_context() */ dsb(ish); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0e4ddd28ef5d..9b480f947da2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -501,7 +501,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, *unmapped += granule; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); mm_ops->put_page(ctx->ptep); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index ec2569818629..35855dadfb1b 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -115,7 +115,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, */ dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -176,7 +176,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, dsb(ish); __tlbi(vmalle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -192,7 +192,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) enter_vmid_context(mmu, &cxt); __tlbi(vmalls12e1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); exit_vmid_context(&cxt); @@ -217,7 +217,7 @@ void __kvm_flush_vm_context(void) { dsb(ishst); __tlbi(alle1is); - dsb(ish); + __tlbi_sync_s1ish_hyp(); } /* @@ -358,7 +358,7 @@ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding) default: ret = -EINVAL; } - dsb(ish); + __tlbi_sync_s1ish_hyp(); isb(); if (mmu) From e5cb94ba5f96d691d8885175d4696d6ae6bc5ec9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 26 Feb 2026 08:22:32 +0000 Subject: [PATCH 8/9] arm64: Fix sampling the "stable" virtual counter in preemptible section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ben reports that when running with CONFIG_DEBUG_PREEMPT, using __arch_counter_get_cntvct_stable() results in well deserves warnings, as we access a per-CPU variable without preemption disabled. Fix the issue by disabling preemption on reading the counter. We can probably do a lot better by not disabling preemption on systems that do not require horrible workarounds to return a valid counter value, but this plugs the issue for the time being. Fixes: 29cc0f3aa7c6 ("arm64: Force the use of CNTVCT_EL0 in __delay()") Reported-by: Ben Horgan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/aZw3EGs4rbQvbAzV@e134344.arm.com Tested-by: Ben Horgan Tested-by: André Draszik Signed-off-by: Will Deacon --- arch/arm64/lib/delay.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c index d02341303899..e278e060e78a 100644 --- a/arch/arm64/lib/delay.c +++ b/arch/arm64/lib/delay.c @@ -32,7 +32,11 @@ static inline unsigned long xloops_to_cycles(unsigned long xloops) * Note that userspace cannot change the offset behind our back either, * as the vcpu mutex is held as long as KVM_RUN is in progress. */ -#define __delay_cycles() __arch_counter_get_cntvct_stable() +static cycles_t notrace __delay_cycles(void) +{ + guard(preempt_notrace)(); + return __arch_counter_get_cntvct_stable(); +} void __delay(unsigned long cycles) { From df6e4ab654dc482c1d45776257a62ac10e14086c Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 26 Feb 2026 17:29:11 +0530 Subject: [PATCH 9/9] arm64: topology: Fix false warning in counters_read_on_cpu() for same-CPU reads The counters_read_on_cpu() function warns when called with IRQs disabled to prevent deadlock in smp_call_function_single(). However, this warning is spurious when reading counters on the current CPU, since no IPI is needed for same CPU reads. Commit 12eb8f4fff24 ("cpufreq: CPPC: Update FIE arch_freq_scale in ticks for non-PCC regs") changed the CPPC Frequency Invariance Engine to read AMU counters directly from the scheduler tick for non-PCC register spaces (like FFH), instead of deferring to a kthread. This means counters_read_on_cpu() is now called with IRQs disabled from the tick handler, triggering the warning. Fix this by restructuring the logic: when IRQs are disabled (tick context), call the function directly for same-CPU reads. Otherwise use smp_call_function_single(). Fixes: 997c021abc6e ("cpufreq: CPPC: Update FIE arch_freq_scale in ticks for non-PCC regs") Suggested-by: Will Deacon Signed-off-by: Sumit Gupta Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3fe1faab0362..b32f13358fbb 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -400,16 +400,25 @@ static inline int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) { /* - * Abort call on counterless CPU or when interrupts are - * disabled - can lead to deadlock in smp sync call. + * Abort call on counterless CPU. */ if (!cpu_has_amu_feat(cpu)) return -EOPNOTSUPP; - if (WARN_ON_ONCE(irqs_disabled())) - return -EPERM; - - smp_call_function_single(cpu, func, val, 1); + if (irqs_disabled()) { + /* + * When IRQs are disabled (tick path: sched_tick -> + * topology_scale_freq_tick or cppc_scale_freq_tick), only local + * CPU counter reads are allowed. Remote CPU counter read would + * require smp_call_function_single() which is unsafe with IRQs + * disabled. + */ + if (WARN_ON_ONCE(cpu != smp_processor_id())) + return -EPERM; + func(val); + } else { + smp_call_function_single(cpu, func, val, 1); + } return 0; }