From f6bf47ab32e0863df50f5501d207dcdddb7fc507 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 23 Feb 2026 22:10:10 +0000
Subject: [PATCH 1/9] arm64: io: Rename ioremap_prot() to __ioremap_prot()

Rename our ioremap_prot() implementation to __ioremap_prot() and convert
all arch-internal callers over to the new function.

ioremap_prot() remains as a #define to __ioremap_prot() for
generic_access_phys() and will be subsequently extended to handle user
permissions in 'prot'.

Cc: Zeng Heng <zengheng4@huawei.com>
Cc: Jinjiang Tu <tujinjiang@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/io.h | 11 ++++++-----
 arch/arm64/kernel/acpi.c    |  2 +-
 arch/arm64/mm/ioremap.c     |  6 +++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 83e03abbb2ca..cd2fddfe814a 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -264,19 +264,20 @@ __iowrite64_copy(void __iomem *to, const void *from, size_t count)
 typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size,
 				   pgprot_t *prot);
 int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook);
+void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot);
 
-#define ioremap_prot ioremap_prot
+#define ioremap_prot __ioremap_prot
 
 #define _PAGE_IOREMAP PROT_DEVICE_nGnRE
 
 #define ioremap_wc(addr, size)	\
-	ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC))
+	__ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC))
 #define ioremap_np(addr, size)	\
-	ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
+	__ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
 
 
 #define ioremap_encrypted(addr, size)	\
-	ioremap_prot((addr), (size), PAGE_KERNEL)
+	__ioremap_prot((addr), (size), PAGE_KERNEL)
 
 /*
  * io{read,write}{16,32,64}be() macros
@@ -297,7 +298,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size)
 	if (pfn_is_map_memory(__phys_to_pfn(addr)))
 		return (void __iomem *)__phys_to_virt(addr);
 
-	return ioremap_prot(addr, size, __pgprot(PROT_NORMAL));
+	return __ioremap_prot(addr, size, __pgprot(PROT_NORMAL));
 }
 
 /*
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index af90128cfed5..a9d884fd1d00 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -377,7 +377,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
 				prot = __acpi_get_writethrough_mem_attribute();
 		}
 	}
-	return ioremap_prot(phys, size, prot);
+	return __ioremap_prot(phys, size, prot);
 }
 
 /*
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index b12cbed9b5ad..6eef48ef6278 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -14,8 +14,8 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook)
 	return 0;
 }
 
-void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
-			   pgprot_t pgprot)
+void __iomem *__ioremap_prot(phys_addr_t phys_addr, size_t size,
+			     pgprot_t pgprot)
 {
 	unsigned long last_addr = phys_addr + size - 1;
 
@@ -39,7 +39,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 
 	return generic_ioremap_prot(phys_addr, size, pgprot);
 }
-EXPORT_SYMBOL(ioremap_prot);
+EXPORT_SYMBOL(__ioremap_prot);
 
 /*
  * Must be called after early_fixmap_init

From 8f098037139b294050053123ab2bc0f819d08932 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Mon, 23 Feb 2026 22:10:11 +0000
Subject: [PATCH 2/9] arm64: io: Extract user memory type in ioremap_prot()

The only caller of ioremap_prot() outside of the generic ioremap()
implementation is generic_access_phys(), which passes a 'pgprot_t' value
determined from the user mapping of the target 'pfn' being accessed by
the kernel. On arm64, the 'pgprot_t' contains all of the non-address
bits from the pte, including the permission controls, and so we end up
returning a new user mapping from ioremap_prot() which faults when
accessed from the kernel on systems with PAN:

  | Unable to handle kernel read from unreadable memory at virtual address ffff80008ea89000
  | ...
  | Call trace:
  |   __memcpy_fromio+0x80/0xf8
  |   generic_access_phys+0x20c/0x2b8
  |   __access_remote_vm+0x46c/0x5b8
  |   access_remote_vm+0x18/0x30
  |   environ_read+0x238/0x3e8
  |   vfs_read+0xe4/0x2b0
  |   ksys_read+0xcc/0x178
  |   __arm64_sys_read+0x4c/0x68

Extract only the memory type from the user 'pgprot_t' in ioremap_prot()
and assert that we're being passed a user mapping, to protect us against
any changes in future that may require additional handling. To avoid
falsely flagging users of ioremap(), provide our own ioremap() macro
which simply wraps __ioremap_prot().

Cc: Zeng Heng <zengheng4@huawei.com>
Cc: Jinjiang Tu <tujinjiang@huawei.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 893dea9ccd08 ("arm64: Add HAVE_IOREMAP_PROT support")
Reported-by: Jinjiang Tu <tujinjiang@huawei.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/io.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index cd2fddfe814a..8cbd1e96fd50 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -266,10 +266,23 @@ typedef int (*ioremap_prot_hook_t)(phys_addr_t phys_addr, size_t size,
 int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook);
 void __iomem *__ioremap_prot(phys_addr_t phys, size_t size, pgprot_t prot);
 
-#define ioremap_prot __ioremap_prot
+static inline void __iomem *ioremap_prot(phys_addr_t phys, size_t size,
+					 pgprot_t user_prot)
+{
+	pgprot_t prot;
+	ptdesc_t user_prot_val = pgprot_val(user_prot);
 
-#define _PAGE_IOREMAP PROT_DEVICE_nGnRE
+	if (WARN_ON_ONCE(!(user_prot_val & PTE_USER)))
+		return NULL;
 
+	prot = __pgprot_modify(PAGE_KERNEL, PTE_ATTRINDX_MASK,
+			       user_prot_val & PTE_ATTRINDX_MASK);
+	return __ioremap_prot(phys, size, prot);
+}
+#define ioremap_prot ioremap_prot
+
+#define ioremap(addr, size)	\
+	__ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
 #define ioremap_wc(addr, size)	\
 	__ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC))
 #define ioremap_np(addr, size)	\

From 8a85b3131225a8c8143ba2ae29c0eef8c1f9117f Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 23 Feb 2026 17:45:30 +0000
Subject: [PATCH 3/9] arm64: gcs: Do not set PTE_SHARED on GCS mappings if
 FEAT_LPA2 is enabled

When FEAT_LPA2 is enabled, bits 8-9 of the PTE replace the
shareability attribute with bits 50-51 of the output address. The
_PAGE_GCS{,_RO} definitions include the PTE_SHARED bits as 0b11 (this
matches the other _PAGE_* definitions) but using this macro directly
leads to the following panic when enabling GCS on a system/model with
LPA2:

  Unable to handle kernel paging request at virtual address fffff1ffc32d8008
  Mem abort info:
    ESR = 0x0000000096000004
    EC = 0x25: DABT (current EL), IL = 32 bits
    SET = 0, FnV = 0
    EA = 0, S1PTW = 0
    FSC = 0x04: level 0 translation fault
  Data abort info:
    ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
    CM = 0, WnR = 0, TnD = 0, TagAccess = 0
    GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
  swapper pgtable: 4k pages, 52-bit VAs, pgdp=0000000060f4d000
  [fffff1ffc32d8008] pgd=100000006184b003, p4d=0000000000000000
  Internal error: Oops: 0000000096000004 [#1]  SMP
  CPU: 0 UID: 0 PID: 513 Comm: gcs_write_fault Tainted: G   M                7.0.0-rc1 #1 PREEMPT
  Tainted: [M]=MACHINE_CHECK
  Hardware name: QEMU QEMU Virtual Machine, BIOS 2025.02-8+deb13u1 11/08/2025
  pstate: 03402005 (nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
  pc : zap_huge_pmd+0x168/0x468
  lr : zap_huge_pmd+0x2c/0x468
  sp : ffff800080beb660
  x29: ffff800080beb660 x28: fff00000c2058180 x27: ffff800080beb898
  x26: fff00000c2058180 x25: ffff800080beb820 x24: 00c800010b600f41
  x23: ffffc1ffc30af1a8 x22: fff00000c2058180 x21: 0000ffff8dc00000
  x20: fff00000c2bc6370 x19: ffff800080beb898 x18: ffff800080bebb60
  x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000007
  x14: 000000000000000a x13: 0000aaaacbbbffff x12: 0000000000000000
  x11: 0000ffff8ddfffff x10: 00000000000001fe x9 : 0000ffff8ddfffff
  x8 : 0000ffff8de00000 x7 : 0000ffff8da00000 x6 : fff00000c2bc6370
  x5 : 0000ffff8da00000 x4 : 000000010b600000 x3 : ffffc1ffc0000000
  x2 : fff00000c2058180 x1 : fffff1ffc32d8000 x0 : 000000c00010b600
  Call trace:
   zap_huge_pmd+0x168/0x468 (P)
   unmap_page_range+0xd70/0x1560
   unmap_single_vma+0x48/0x80
   unmap_vmas+0x90/0x180
   unmap_region+0x88/0xe4
   vms_complete_munmap_vmas+0xf8/0x1e0
   do_vmi_align_munmap+0x158/0x180
   do_vmi_munmap+0xac/0x160
   __vm_munmap+0xb0/0x138
   vm_munmap+0x14/0x20
   gcs_free+0x70/0x80
   mm_release+0x1c/0xc8
   exit_mm_release+0x28/0x38
   do_exit+0x190/0x8ec
   do_group_exit+0x34/0x90
   get_signal+0x794/0x858
   arch_do_signal_or_restart+0x11c/0x3e0
   exit_to_user_mode_loop+0x10c/0x17c
   el0_da+0x8c/0x9c
   el0t_64_sync_handler+0xd0/0xf0
   el0t_64_sync+0x198/0x19c
  Code: aa1603e2 d34cfc00 cb813001 8b011861 (f9400420)

Similarly to how the kernel handles protection_map[], use a
gcs_page_prot variable to store the protection bits and clear PTE_SHARED
if LPA2 is enabled.

Also remove the unused PAGE_GCS{,_RO} macros.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack")
Reported-by: Emanuele Rocca <emanuele.rocca@arm.com>
Cc: stable@vger.kernel.org
Cc: Mark Brown <broonie@kernel.org>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/pgtable-prot.h | 3 ---
 arch/arm64/mm/mmap.c                  | 8 ++++++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index d27e8872fe3c..2b32639160de 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -164,9 +164,6 @@ static inline bool __pure lpa2_is_enabled(void)
 #define _PAGE_GCS	(_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER)
 #define _PAGE_GCS_RO	(_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER)
 
-#define PAGE_GCS	__pgprot(_PAGE_GCS)
-#define PAGE_GCS_RO	__pgprot(_PAGE_GCS_RO)
-
 #define PIE_E0	( \
 	PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS),           PIE_GCS)  | \
 	PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO),        PIE_R)   | \
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 08ee177432c2..75f343009b4b 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -34,6 +34,8 @@ static pgprot_t protection_map[16] __ro_after_init = {
 	[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ]	= PAGE_SHARED_EXEC
 };
 
+static ptdesc_t gcs_page_prot __ro_after_init = _PAGE_GCS_RO;
+
 /*
  * You really shouldn't be using read() or write() on /dev/mem.  This might go
  * away in the future.
@@ -73,9 +75,11 @@ static int __init adjust_protection_map(void)
 		protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY;
 	}
 
-	if (lpa2_is_enabled())
+	if (lpa2_is_enabled()) {
 		for (int i = 0; i < ARRAY_SIZE(protection_map); i++)
 			pgprot_val(protection_map[i]) &= ~PTE_SHARED;
+		gcs_page_prot &= ~PTE_SHARED;
+	}
 
 	return 0;
 }
@@ -87,7 +91,7 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 
 	/* Short circuit GCS to avoid bloating the table. */
 	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
-		prot = _PAGE_GCS_RO;
+		prot = gcs_page_prot;
 	} else {
 		prot = pgprot_val(protection_map[vm_flags &
 				   (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);

From 47a8aad135ac1aed04b7b0c0a8157fd208075827 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 23 Feb 2026 17:45:31 +0000
Subject: [PATCH 4/9] arm64: gcs: Honour mprotect(PROT_NONE) on shadow stack
 mappings

vm_get_page_prot() short-circuits the protection_map[] lookup for a
VM_SHADOW_STACK mapping since it uses a different PIE index from the
typical read/write/exec permissions. However, the side effect is that it
also ignores mprotect(PROT_NONE) by creating an accessible PTE.

Special-case the !(vm_flags & VM_ACCESS_FLAGS) flags to use the
protection_map[VM_NONE] permissions instead. No GCS attributes are
required for an inaccessible PTE.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Fixes: 6497b66ba694 ("arm64/mm: Map pages for guarded control stack")
Cc: stable@vger.kernel.org
Cc: Mark Brown <broonie@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Reviewed-by: David Hildenbrand (Arm) <david@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/mm/mmap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index 75f343009b4b..92b2f5097a96 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -91,7 +91,11 @@ pgprot_t vm_get_page_prot(vm_flags_t vm_flags)
 
 	/* Short circuit GCS to avoid bloating the table. */
 	if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK)) {
-		prot = gcs_page_prot;
+		/* Honour mprotect(PROT_NONE) on shadow stack mappings */
+		if (vm_flags & VM_ACCESS_FLAGS)
+			prot = gcs_page_prot;
+		else
+			prot = pgprot_val(protection_map[VM_NONE]);
 	} else {
 		prot = pgprot_val(protection_map[vm_flags &
 				   (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);

From 9d1a7c4a457eac8a7e07e1685f95e54fa551db5e Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Mon, 23 Feb 2026 17:45:32 +0000
Subject: [PATCH 5/9] kselftest: arm64: Check access to GCS after
 mprotect(PROT_NONE)

A GCS mapping should not be accessible after mprotect(PROT_NONE). Add a
kselftest for this scenario.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../signal/testcases/gcs_prot_none_fault.c    | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c

diff --git a/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c
new file mode 100644
index 000000000000..2259f454a202
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/gcs_prot_none_fault.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2026 ARM Limited
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+static uint64_t *gcs_page;
+static bool post_mprotect;
+
+#ifndef __NR_map_shadow_stack
+#define __NR_map_shadow_stack 453
+#endif
+
+static bool alloc_gcs(struct tdescr *td)
+{
+	long page_size = sysconf(_SC_PAGE_SIZE);
+
+	gcs_page = (void *)syscall(__NR_map_shadow_stack, 0,
+				   page_size, 0);
+	if (gcs_page == MAP_FAILED) {
+		fprintf(stderr, "Failed to map %ld byte GCS: %d\n",
+			page_size, errno);
+		return false;
+	}
+
+	return true;
+}
+
+static int gcs_prot_none_fault_trigger(struct tdescr *td)
+{
+	/* Verify that the page is readable (ie, not completely unmapped) */
+	fprintf(stderr, "Read value 0x%lx\n", gcs_page[0]);
+
+	if (mprotect(gcs_page, sysconf(_SC_PAGE_SIZE), PROT_NONE) != 0) {
+		fprintf(stderr, "mprotect(PROT_NONE) failed: %d\n", errno);
+		return 0;
+	}
+	post_mprotect = true;
+
+	/* This should trigger a fault if PROT_NONE is honoured for the GCS page */
+	fprintf(stderr, "Read value after mprotect(PROT_NONE) 0x%lx\n", gcs_page[0]);
+	return 0;
+}
+
+static int gcs_prot_none_fault_signal(struct tdescr *td, siginfo_t *si,
+				      ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	/* A fault before mprotect(PROT_NONE) is unexpected. */
+	if (!post_mprotect)
+		return 0;
+
+	return 1;
+}
+
+struct tdescr tde = {
+	.name = "GCS PROT_NONE fault",
+	.descr = "Read from GCS after mprotect(PROT_NONE) segfaults",
+	.feats_required = FEAT_GCS,
+	.timeout = 3,
+	.sig_ok = SIGSEGV,
+	.sanity_disabled = true,
+	.init = alloc_gcs,
+	.trigger = gcs_prot_none_fault_trigger,
+	.run = gcs_prot_none_fault_signal,
+};

From bfd9c931d19aa59fb8371d557774fa169b15db9a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 18 Feb 2026 16:43:47 +0000
Subject: [PATCH 6/9] arm64: tlb: Allow XZR argument to TLBI ops

The TLBI instruction accepts XZR as a register argument, and for TLBI
operations with a register argument, there is no functional difference
between using XZR or another GPR which contains zeroes. Operations
without a register argument are encoded as if XZR were used.

Allow the __TLBI_1() macro to use XZR when a register argument is all
zeroes.

Today this only results in a trivial code saving in
__do_compat_cache_op()'s workaround for Neoverse-N1 erratum #1542419. In
subsequent patches this pattern will be used more generally.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oupton@kernel.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/tlbflush.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index a2d65d7d6aae..bf1cc9949dc8 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -38,12 +38,12 @@
 			    : : )
 
 #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE			       \
-			       "tlbi " #op ", %0\n"			       \
+			       "tlbi " #op ", %x0\n"			       \
 		   ALTERNATIVE("nop\n			nop",		       \
-			       "dsb ish\n		tlbi " #op ", %0",     \
+			       "dsb ish\n		tlbi " #op ", %x0",    \
 			       ARM64_WORKAROUND_REPEAT_TLBI,		       \
 			       CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)	       \
-			    : : "r" (arg))
+			    : : "rZ" (arg))
 
 #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg)
 

From a8f78680ee6bf795086384e8aea159a52814f827 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 18 Feb 2026 16:43:48 +0000
Subject: [PATCH 7/9] arm64: tlb: Optimize ARM64_WORKAROUND_REPEAT_TLBI

The ARM64_WORKAROUND_REPEAT_TLBI workaround is used to mitigate several
errata where broadcast TLBI;DSB sequences don't provide all the
architecturally required synchronization. The workaround performs more
work than necessary, and can have significant overhead. This patch
optimizes the workaround, as explained below.

The workaround was originally added for Qualcomm Falkor erratum 1009 in
commit:

  d9ff80f83ecb ("arm64: Work around Falkor erratum 1009")

As noted in the message for that commit, the workaround is applied even
in cases where it is not strictly necessary.

The workaround was later reused without changes for:

* Arm Cortex-A76 erratum #1286807
  SDEN v33: https://developer.arm.com/documentation/SDEN-885749/33-0/

* Arm Cortex-A55 erratum #2441007
  SDEN v16: https://developer.arm.com/documentation/SDEN-859338/1600/

* Arm Cortex-A510 erratum #2441009
  SDEN v19: https://developer.arm.com/documentation/SDEN-1873351/1900/

The important details to note are as follows:

1. All relevant errata only affect the ordering and/or completion of
   memory accesses which have been translated by an invalidated TLB
   entry. The actual invalidation of TLB entries is unaffected.

2. The existing workaround is applied to both broadcast and local TLB
   invalidation, whereas for all relevant errata it is only necessary to
   apply a workaround for broadcast invalidation.

3. The existing workaround replaces every TLBI with a TLBI;DSB;TLBI
   sequence, whereas for all relevant errata it is only necessary to
   execute a single additional TLBI;DSB sequence after any number of
   TLBIs are completed by a DSB.

   For example, for a sequence of batched TLBIs:

       TLBI <op1>[, <arg1>]
       TLBI <op2>[, <arg2>]
       TLBI <op3>[, <arg3>]
       DSB ISH

   ... the existing workaround will expand this to:

       TLBI <op1>[, <arg1>]
       DSB ISH                  // additional
       TLBI <op1>[, <arg1>]     // additional
       TLBI <op2>[, <arg2>]
       DSB ISH                  // additional
       TLBI <op2>[, <arg2>]     // additional
       TLBI <op3>[, <arg3>]
       DSB ISH                  // additional
       TLBI <op3>[, <arg3>]     // additional
       DSB ISH

   ... whereas it is sufficient to have:

       TLBI <op1>[, <arg1>]
       TLBI <op2>[, <arg2>]
       TLBI <op3>[, <arg3>]
       DSB ISH
       TLBI <opX>[, <argX>]     // additional
       DSB ISH                  // additional

   Using a single additional TBLI and DSB at the end of the sequence can
   have significantly lower overhead as each DSB which completes a TLBI
   must synchronize with other PEs in the system, with potential
   performance effects both locally and system-wide.

4. The existing workaround repeats each specific TLBI operation, whereas
   for all relevant errata it is sufficient for the additional TLBI to
   use *any* operation which will be broadcast, regardless of which
   translation regime or stage of translation the operation applies to.

   For example, for a single TLBI:

       TLBI ALLE2IS
       DSB ISH

   ... the existing workaround will expand this to:

       TLBI ALLE2IS
       DSB ISH
       TLBI ALLE2IS             // additional
       DSB ISH                  // additional

   ... whereas it is sufficient to have:

       TLBI ALLE2IS
       DSB ISH
       TLBI VALE1IS, XZR        // additional
       DSB ISH                  // additional

   As the additional TLBI doesn't have to match a specific earlier TLBI,
   the additional TLBI can be implemented in separate code, with no
   memory of the earlier TLBIs. The additional TLBI can also use a
   cheaper TLBI operation.

5. The existing workaround is applied to both Stage-1 and Stage-2 TLB
   invalidation, whereas for all relevant errata it is only necessary to
   apply a workaround for Stage-1 invalidation.

   Architecturally, TLBI operations which invalidate only Stage-2
   information (e.g. IPAS2E1IS) are not required to invalidate TLB
   entries which combine information from Stage-1 and Stage-2
   translation table entries, and consequently may not complete memory
   accesses translated by those combined entries. In these cases,
   completion of memory accesses is only guaranteed after subsequent
   invalidation of Stage-1 information (e.g. VMALLE1IS).

Taking the above points into account, this patch reworks the workaround
logic to reduce overhead:

* New __tlbi_sync_s1ish() and __tlbi_sync_s1ish_hyp() functions are
  added and used in place of any dsb(ish) which is used to complete
  broadcast Stage-1 TLB maintenance. When the
  ARM64_WORKAROUND_REPEAT_TLBI workaround is enabled, these helpers will
  execute an additional TLBI;DSB sequence.

  For consistency, it might make sense to add __tlbi_sync_*() helpers
  for local and stage 2 maintenance. For now I've left those with
  open-coded dsb() to keep the diff small.

* The duplication of TLBIs in __TLBI_0() and __TLBI_1() is removed. This
  is no longer needed as the necessary synchronization will happen in
  __tlbi_sync_s1ish() or __tlbi_sync_s1ish_hyp().

* The additional TLBI operation is chosen to have minimal impact:

  - __tlbi_sync_s1ish() uses "TLBI VALE1IS, XZR". This is only used at
    EL1 or at EL2 with {E2H,TGE}=={1,1}, where it will target an unused
    entry for the reserved ASID in the kernel's own translation regime,
    and have no adverse affect.

  - __tlbi_sync_s1ish_hyp() uses "TLBI VALE2IS, XZR". This is only used
    in hyp code, where it will target an unused entry in the hyp code's
    TTBR0 mapping, and should have no adverse effect.

* As __TLBI_0() and __TLBI_1() no longer replace each TLBI with a
  TLBI;DSB;TLBI sequence, batching TLBIs is worthwhile, and there's no
  need for arch_tlbbatch_should_defer() to consider
  ARM64_WORKAROUND_REPEAT_TLBI.

When building defconfig with GCC 15.1.0, compared to v6.19-rc1, this
patch saves ~1KiB of text, makes the vmlinux ~42KiB smaller, and makes
the resulting Image 64KiB smaller:

| [mark@lakrids:~/src/linux]% size vmlinux-*
|    text    data     bss     dec     hex filename
| 21179831        19660919         708216 41548966        279fca6 vmlinux-after
| 21181075        19660903         708216 41550194        27a0172 vmlinux-before
| [mark@lakrids:~/src/linux]% ls -l vmlinux-*
| -rwxr-xr-x 1 mark mark 157771472 Feb  4 12:05 vmlinux-after
| -rwxr-xr-x 1 mark mark 157815432 Feb  4 12:05 vmlinux-before
| [mark@lakrids:~/src/linux]% ls -l Image-*
| -rw-r--r-- 1 mark mark 41007616 Feb  4 12:05 Image-after
| -rw-r--r-- 1 mark mark 41073152 Feb  4 12:05 Image-before

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Oliver Upton <oupton@kernel.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/tlbflush.h | 59 ++++++++++++++++++-------------
 arch/arm64/kernel/sys_compat.c    |  2 +-
 arch/arm64/kvm/hyp/nvhe/mm.c      |  2 +-
 arch/arm64/kvm/hyp/nvhe/tlb.c     |  8 ++---
 arch/arm64/kvm/hyp/pgtable.c      |  2 +-
 arch/arm64/kvm/hyp/vhe/tlb.c      | 10 +++---
 6 files changed, 47 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bf1cc9949dc8..1416e652612b 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -31,18 +31,10 @@
  */
 #define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE			       \
 			       "tlbi " #op "\n"				       \
-		   ALTERNATIVE("nop\n			nop",		       \
-			       "dsb ish\n		tlbi " #op,	       \
-			       ARM64_WORKAROUND_REPEAT_TLBI,		       \
-			       CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)	       \
 			    : : )
 
 #define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE			       \
 			       "tlbi " #op ", %x0\n"			       \
-		   ALTERNATIVE("nop\n			nop",		       \
-			       "dsb ish\n		tlbi " #op ", %x0",    \
-			       ARM64_WORKAROUND_REPEAT_TLBI,		       \
-			       CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)	       \
 			    : : "rZ" (arg))
 
 #define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg)
@@ -181,6 +173,34 @@ static inline unsigned long get_trans_granule(void)
 		(__pages >> (5 * (scale) + 1)) - 1;			\
 	})
 
+#define __repeat_tlbi_sync(op, arg...)						\
+do {										\
+	if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI))	\
+		break;								\
+	__tlbi(op, ##arg);							\
+	dsb(ish);								\
+} while (0)
+
+/*
+ * Complete broadcast TLB maintenance issued by the host which invalidates
+ * stage 1 information in the host's own translation regime.
+ */
+static inline void __tlbi_sync_s1ish(void)
+{
+	dsb(ish);
+	__repeat_tlbi_sync(vale1is, 0);
+}
+
+/*
+ * Complete broadcast TLB maintenance issued by hyp code which invalidates
+ * stage 1 translation information in any translation regime.
+ */
+static inline void __tlbi_sync_s1ish_hyp(void)
+{
+	dsb(ish);
+	__repeat_tlbi_sync(vale2is, 0);
+}
+
 /*
  *	TLB Invalidation
  *	================
@@ -279,7 +299,7 @@ static inline void flush_tlb_all(void)
 {
 	dsb(ishst);
 	__tlbi(vmalle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish();
 	isb();
 }
 
@@ -291,7 +311,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
 	asid = __TLBI_VADDR(0, ASID(mm));
 	__tlbi(aside1is, asid);
 	__tlbi_user(aside1is, asid);
-	dsb(ish);
+	__tlbi_sync_s1ish();
 	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
 }
 
@@ -345,20 +365,11 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 				  unsigned long uaddr)
 {
 	flush_tlb_page_nosync(vma, uaddr);
-	dsb(ish);
+	__tlbi_sync_s1ish();
 }
 
 static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
 {
-	/*
-	 * TLB flush deferral is not required on systems which are affected by
-	 * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation
-	 * will have two consecutive TLBI instructions with a dsb(ish) in between
-	 * defeating the purpose (i.e save overall 'dsb ish' cost).
-	 */
-	if (alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI))
-		return false;
-
 	return true;
 }
 
@@ -374,7 +385,7 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
  */
 static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 {
-	dsb(ish);
+	__tlbi_sync_s1ish();
 }
 
 /*
@@ -509,7 +520,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 {
 	__flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
 				 last_level, tlb_level);
-	dsb(ish);
+	__tlbi_sync_s1ish();
 }
 
 static inline void local_flush_tlb_contpte(struct vm_area_struct *vma,
@@ -557,7 +568,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
 	dsb(ishst);
 	__flush_tlb_range_op(vaale1is, start, pages, stride, 0,
 			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
-	dsb(ish);
+	__tlbi_sync_s1ish();
 	isb();
 }
 
@@ -571,7 +582,7 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
 
 	dsb(ishst);
 	__tlbi(vaae1is, addr);
-	dsb(ish);
+	__tlbi_sync_s1ish();
 	isb();
 }
 
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index 4a609e9b65de..b9d4998c97ef 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -37,7 +37,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
 			 * We pick the reserved-ASID to minimise the impact.
 			 */
 			__tlbi(aside1is, __TLBI_VADDR(0, 0));
-			dsb(ish);
+			__tlbi_sync_s1ish();
 		}
 
 		ret = caches_clean_inval_user_pou(start, start + chunk);
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index ae8391baebc3..218976287d3f 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -271,7 +271,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
 	 */
 	dsb(ishst);
 	__tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 }
 
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 48da9ca9763f..3dc1ce0d27fe 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -169,7 +169,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	 */
 	dsb(ish);
 	__tlbi(vmalle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -226,7 +226,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
 
 	dsb(ish);
 	__tlbi(vmalle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -240,7 +240,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 	enter_vmid_context(mmu, &cxt, false);
 
 	__tlbi(vmalls12e1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -266,5 +266,5 @@ void __kvm_flush_vm_context(void)
 	/* Same remark as in enter_vmid_context() */
 	dsb(ish);
 	__tlbi(alle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 }
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 0e4ddd28ef5d..9b480f947da2 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -501,7 +501,7 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
 		*unmapped += granule;
 	}
 
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 	mm_ops->put_page(ctx->ptep);
 
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index ec2569818629..35855dadfb1b 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -115,7 +115,7 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
 	 */
 	dsb(ish);
 	__tlbi(vmalle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -176,7 +176,7 @@ void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
 
 	dsb(ish);
 	__tlbi(vmalle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -192,7 +192,7 @@ void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
 	enter_vmid_context(mmu, &cxt);
 
 	__tlbi(vmalls12e1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	exit_vmid_context(&cxt);
@@ -217,7 +217,7 @@ void __kvm_flush_vm_context(void)
 {
 	dsb(ishst);
 	__tlbi(alle1is);
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 }
 
 /*
@@ -358,7 +358,7 @@ int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding)
 	default:
 		ret = -EINVAL;
 	}
-	dsb(ish);
+	__tlbi_sync_s1ish_hyp();
 	isb();
 
 	if (mmu)

From e5cb94ba5f96d691d8885175d4696d6ae6bc5ec9 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 26 Feb 2026 08:22:32 +0000
Subject: [PATCH 8/9] arm64: Fix sampling the "stable" virtual counter in
 preemptible section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ben reports that when running with CONFIG_DEBUG_PREEMPT, using
__arch_counter_get_cntvct_stable() results in well deserves warnings,
as we access a per-CPU variable without preemption disabled.

Fix the issue by disabling preemption on reading the counter. We can
probably do a lot better by not disabling preemption on systems that
do not require horrible workarounds to return a valid counter value,
but this plugs the issue for the time being.

Fixes: 29cc0f3aa7c6 ("arm64: Force the use of CNTVCT_EL0 in __delay()")
Reported-by: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/aZw3EGs4rbQvbAzV@e134344.arm.com
Tested-by: Ben Horgan <ben.horgan@arm.com>
Tested-by: André Draszik <andre.draszik@linaro.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/lib/delay.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/lib/delay.c b/arch/arm64/lib/delay.c
index d02341303899..e278e060e78a 100644
--- a/arch/arm64/lib/delay.c
+++ b/arch/arm64/lib/delay.c
@@ -32,7 +32,11 @@ static inline unsigned long xloops_to_cycles(unsigned long xloops)
  * Note that userspace cannot change the offset behind our back either,
  * as the vcpu mutex is held as long as KVM_RUN is in progress.
  */
-#define __delay_cycles()	__arch_counter_get_cntvct_stable()
+static cycles_t notrace __delay_cycles(void)
+{
+	guard(preempt_notrace)();
+	return __arch_counter_get_cntvct_stable();
+}
 
 void __delay(unsigned long cycles)
 {

From df6e4ab654dc482c1d45776257a62ac10e14086c Mon Sep 17 00:00:00 2001
From: Sumit Gupta <sumitg@nvidia.com>
Date: Thu, 26 Feb 2026 17:29:11 +0530
Subject: [PATCH 9/9] arm64: topology: Fix false warning in
 counters_read_on_cpu() for same-CPU reads

The counters_read_on_cpu() function warns when called with IRQs
disabled to prevent deadlock in smp_call_function_single(). However,
this warning is spurious when reading counters on the current CPU,
since no IPI is needed for same CPU reads.

Commit 12eb8f4fff24 ("cpufreq: CPPC: Update FIE arch_freq_scale in
ticks for non-PCC regs") changed the CPPC Frequency Invariance Engine
to read AMU counters directly from the scheduler tick for non-PCC
register spaces (like FFH), instead of deferring to a kthread. This
means counters_read_on_cpu() is now called with IRQs disabled from the
tick handler, triggering the warning.

Fix this by restructuring the logic: when IRQs are disabled (tick
context), call the function directly for same-CPU reads. Otherwise
use smp_call_function_single().

Fixes: 997c021abc6e ("cpufreq: CPPC: Update FIE arch_freq_scale in ticks for non-PCC regs")
Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/topology.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 3fe1faab0362..b32f13358fbb 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -400,16 +400,25 @@ static inline
 int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val)
 {
 	/*
-	 * Abort call on counterless CPU or when interrupts are
-	 * disabled - can lead to deadlock in smp sync call.
+	 * Abort call on counterless CPU.
 	 */
 	if (!cpu_has_amu_feat(cpu))
 		return -EOPNOTSUPP;
 
-	if (WARN_ON_ONCE(irqs_disabled()))
-		return -EPERM;
-
-	smp_call_function_single(cpu, func, val, 1);
+	if (irqs_disabled()) {
+		/*
+		 * When IRQs are disabled (tick path: sched_tick ->
+		 * topology_scale_freq_tick or cppc_scale_freq_tick), only local
+		 * CPU counter reads are allowed. Remote CPU counter read would
+		 * require smp_call_function_single() which is unsafe with IRQs
+		 * disabled.
+		 */
+		if (WARN_ON_ONCE(cpu != smp_processor_id()))
+			return -EPERM;
+		func(val);
+	} else {
+		smp_call_function_single(cpu, func, val, 1);
+	}
 
 	return 0;
 }