- gmap rewrite: completely new memory management for kvm/s390

- vSIE improvement
 - maintainership change for s390 vfio-pci
 - small quality of life improvement for protected guests
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEoWuZBM6M3lCBSfTnuARItAMU6BMFAmmLOKAACgkQuARItAMU
 6BOkhQ/9G0Sr1bfkcSQvbszvjSoDsOCSm+oAm39679Dr4v+7SsItknTutYK0M7dM
 n6oY2kU1dveFsF0FwBAALh4LC0lYNEmz34eHxaFPFfgqQ9lX2fBAQuBSPq8uOz1L
 Pk6IIlqTls8TAvKF/KqTXCEULnPGVXD2KP4WaSir+T2vZr6N/mqB7cZPR23/rMAi
 +PF/UtmgbfG+eRjqA8QRdm8nnzTrM5cWe9roZXYsAXDLZh+EYYeG4d96GFTV8udY
 /6mV1YKP0Aa+youC5p4oIh1Iv7p/Yjv6RxPoEbW1O31M9yBDJmFpz4W5C/rdkzwI
 nOjQj9i7ZINXf83kAZMvFb8MdXlJzaw8rBUlWzxNfrsL4ga8Rp3xMNEdufWd3T5x
 zNFXr0ANuBifi0B0EasBWlYDRbK4WGAC4vnkgmxqP5t2JiAN+d0FXB8LaRyZvgs/
 tiwEDenCk1eDWEBcWbLnX7fGGDKDUNXVMAFTrGM1BMNZe6/IL/h/sypLuYSJ/d3Y
 VXDgZZyAWUVqjidDxrwurdjyzvbPd69GDbKjhTuUu4OdqUMucjjQf74w6m857Wn/
 9oLoR0p+8deb1SQ2RuB8sujcJiO9YHczwL8PLDa+bGw3jH6TRiMVVrt1HOw5QmfG
 QpwhKvrF2yPTgv5VZbFvYEvtITnBBfaepQe97pDDEfsHqPeStmI=
 =2eiy
 -----END PGP SIGNATURE-----

Merge tag 'kvm-s390-next-7.0-1' of https://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux into HEAD

- gmap rewrite: completely new memory management for kvm/s390
- vSIE improvement
- maintainership change for s390 vfio-pci
- small quality of life improvement for protected guests
This commit is contained in:
Paolo Bonzini 2026-02-11 18:52:27 +01:00
commit b1195183ed
51 changed files with 5930 additions and 5512 deletions

View file

@ -6518,6 +6518,40 @@ the capability to be present.
`flags` must currently be zero.
4.144 KVM_S390_KEYOP
--------------------
:Capability: KVM_CAP_S390_KEYOP
:Architectures: s390
:Type: vm ioctl
:Parameters: struct kvm_s390_keyop (in/out)
:Returns: 0 in case of success, < 0 on error
The specified key operation is performed on the given guest address. The
previous storage key (or the relevant part thereof) will be returned in
`key`.
::
struct kvm_s390_keyop {
__u64 guest_addr;
__u8 key;
__u8 operation;
};
Currently supported values for ``operation``:
KVM_S390_KEYOP_ISKE
Returns the storage key for the guest address ``guest_addr`` in ``key``.
KVM_S390_KEYOP_RRBE
Resets the reference bit for the guest address ``guest_addr``, returning the
R and C bits of the old storage key in ``key``; the remaining fields of
the storage key will be set to 0.
KVM_S390_KEYOP_SSKE
Sets the storage key for the guest address ``guest_addr`` to the key
specified in ``key``, returning the previous value in ``key``.
.. _kvm_run:
@ -9384,6 +9418,14 @@ The presence of this capability indicates that KVM_RUN will update the
KVM_RUN_X86_GUEST_MODE bit in kvm_run.flags to indicate whether the
vCPU was executing nested guest code when it exited.
8.46 KVM_CAP_S390_KEYOP
-----------------------
:Architectures: s390
The presence of this capability indicates that the KVM_S390_KEYOP ioctl is
available.
KVM exits with the register state of either the L1 or L2 guest
depending on which executed at the time of an exit. Userspace must
take care to differentiate between these cases.

View file

@ -13914,14 +13914,12 @@ L: kvm@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git
F: Documentation/virt/kvm/s390*
F: arch/s390/include/asm/gmap.h
F: arch/s390/include/asm/gmap_helpers.h
F: arch/s390/include/asm/kvm*
F: arch/s390/include/uapi/asm/kvm*
F: arch/s390/include/uapi/asm/uvdevice.h
F: arch/s390/kernel/uv.c
F: arch/s390/kvm/
F: arch/s390/mm/gmap.c
F: arch/s390/mm/gmap_helpers.c
F: drivers/s390/char/uvdevice.c
F: tools/testing/selftests/drivers/s390x/uvdevice/
@ -23111,7 +23109,8 @@ F: include/uapi/linux/vfio_ccw.h
S390 VFIO-PCI DRIVER
M: Matthew Rosato <mjrosato@linux.ibm.com>
M: Eric Farman <farman@linux.ibm.com>
M: Farhan Ali <alifm@linux.ibm.com>
R: Eric Farman <farman@linux.ibm.com>
L: linux-s390@vger.kernel.org
L: kvm@vger.kernel.org
S: Supported

View file

@ -32,9 +32,6 @@ config GENERIC_BUG_RELATIVE_POINTERS
config GENERIC_LOCKBREAK
def_bool y if PREEMPTION
config PGSTE
def_bool y if KVM
config AUDIT_ARCH
def_bool y

View file

@ -9,6 +9,32 @@
#ifndef _S390_DAT_BITS_H
#define _S390_DAT_BITS_H
/*
* vaddress union in order to easily decode a virtual address into its
* region first index, region second index etc. parts.
*/
union vaddress {
unsigned long addr;
struct {
unsigned long rfx : 11;
unsigned long rsx : 11;
unsigned long rtx : 11;
unsigned long sx : 11;
unsigned long px : 8;
unsigned long bx : 12;
};
struct {
unsigned long rfx01 : 2;
unsigned long : 9;
unsigned long rsx01 : 2;
unsigned long : 9;
unsigned long rtx01 : 2;
unsigned long : 9;
unsigned long sx01 : 2;
unsigned long : 29;
};
};
union asce {
unsigned long val;
struct {
@ -98,7 +124,8 @@ union region3_table_entry {
struct {
unsigned long : 53;
unsigned long fc: 1; /* Format-Control */
unsigned long : 4;
unsigned long p : 1; /* DAT-Protection Bit */
unsigned long : 3;
unsigned long i : 1; /* Region-Invalid Bit */
unsigned long cr: 1; /* Common-Region Bit */
unsigned long tt: 2; /* Table-Type Bits */
@ -140,7 +167,8 @@ union segment_table_entry {
struct {
unsigned long : 53;
unsigned long fc: 1; /* Format-Control */
unsigned long : 4;
unsigned long p : 1; /* DAT-Protection Bit */
unsigned long : 3;
unsigned long i : 1; /* Segment-Invalid Bit */
unsigned long cs: 1; /* Common-Segment Bit */
unsigned long tt: 2; /* Table-Type Bits */

View file

@ -1,174 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* KVM guest address space mapping code
*
* Copyright IBM Corp. 2007, 2016
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
#ifndef _ASM_S390_GMAP_H
#define _ASM_S390_GMAP_H
#include <linux/radix-tree.h>
#include <linux/refcount.h>
/* Generic bits for GMAP notification on DAT table entry changes. */
#define GMAP_NOTIFY_SHADOW 0x2
#define GMAP_NOTIFY_MPROT 0x1
/* Status bits only for huge segment entries */
#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */
#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */
/**
* struct gmap_struct - guest address space
* @list: list head for the mm->context gmap list
* @mm: pointer to the parent mm_struct
* @guest_to_host: radix tree with guest to host address translation
* @host_to_guest: radix tree with pointer to segment table entries
* @guest_table_lock: spinlock to protect all entries in the guest page table
* @ref_count: reference counter for the gmap structure
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
* @guest_handle: protected virtual machine handle for the ultravisor
* @host_to_rmap: radix tree with gmap_rmap lists
* @children: list of shadow gmap structures
* @shadow_lock: spinlock to protect the shadow gmap list
* @parent: pointer to the parent gmap for shadow guest address spaces
* @orig_asce: ASCE for which the shadow page table has been created
* @edat_level: edat level to be used for the shadow translation
* @removed: flag to indicate if a shadow guest address space has been removed
* @initialized: flag to indicate if a shadow guest address space can be used
*/
struct gmap {
struct list_head list;
struct mm_struct *mm;
struct radix_tree_root guest_to_host;
struct radix_tree_root host_to_guest;
spinlock_t guest_table_lock;
refcount_t ref_count;
unsigned long *table;
unsigned long asce;
unsigned long asce_end;
void *private;
bool pfault_enabled;
/* only set for protected virtual machines */
unsigned long guest_handle;
/* Additional data for shadow guest address spaces */
struct radix_tree_root host_to_rmap;
struct list_head children;
spinlock_t shadow_lock;
struct gmap *parent;
unsigned long orig_asce;
int edat_level;
bool removed;
bool initialized;
};
/**
* struct gmap_rmap - reverse mapping for shadow page table entries
* @next: pointer to next rmap in the list
* @raddr: virtual rmap address in the shadow guest address space
*/
struct gmap_rmap {
struct gmap_rmap *next;
unsigned long raddr;
};
#define gmap_for_each_rmap(pos, head) \
for (pos = (head); pos; pos = pos->next)
#define gmap_for_each_rmap_safe(pos, n, head) \
for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
/**
* struct gmap_notifier - notify function block for page invalidation
* @notifier_call: address of callback function
*/
struct gmap_notifier {
struct list_head list;
struct rcu_head rcu;
void (*notifier_call)(struct gmap *gmap, unsigned long start,
unsigned long end);
};
static inline int gmap_is_shadow(struct gmap *gmap)
{
return !!gmap->parent;
}
struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
void gmap_remove(struct gmap *gmap);
struct gmap *gmap_get(struct gmap *gmap);
void gmap_put(struct gmap *gmap);
void gmap_free(struct gmap *gmap);
struct gmap *gmap_alloc(unsigned long limit);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
unsigned long __gmap_translate(struct gmap *, unsigned long gaddr);
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr);
void __gmap_zap(struct gmap *, unsigned long gaddr);
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
void gmap_unshadow(struct gmap *sg);
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake);
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake);
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake);
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake);
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
void gmap_register_pte_notifier(struct gmap_notifier *);
void gmap_unregister_pte_notifier(struct gmap_notifier *);
int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits);
void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4],
unsigned long gaddr, unsigned long vmaddr);
int s390_replace_asce(struct gmap *gmap);
void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns);
int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end, bool interruptible);
unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level);
/**
* s390_uv_destroy_range - Destroy a range of pages in the given mm.
* @mm: the mm on which to operate on
* @start: the start of the range
* @end: the end of the range
*
* This function will call cond_sched, so it should not generate stalls, but
* it will otherwise only return when it completed.
*/
static inline void s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
(void)__s390_uv_destroy_range(mm, start, end, false);
}
/**
* s390_uv_destroy_range_interruptible - Destroy a range of pages in the
* given mm, but stop when a fatal signal is received.
* @mm: the mm on which to operate on
* @start: the start of the range
* @end: the end of the range
*
* This function will call cond_sched, so it should not generate stalls. If
* a fatal signal is received, it will return with -EINTR immediately,
* without finishing destroying the whole range. Upon successful
* completion, 0 is returned.
*/
static inline int s390_uv_destroy_range_interruptible(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
return __s390_uv_destroy_range(mm, start, end, true);
}
#endif /* _ASM_S390_GMAP_H */

View file

@ -11,5 +11,6 @@
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr);
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end);
int gmap_helper_disable_cow_sharing(void);
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr);
#endif /* _ASM_S390_GMAP_HELPERS_H */

View file

@ -37,12 +37,6 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return __huge_ptep_get_and_clear(mm, addr, ptep);
}
static inline void arch_clear_hugetlb_flags(struct folio *folio)
{
clear_bit(PG_arch_1, &folio->flags.f);
}
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#define __HAVE_ARCH_HUGE_PTE_CLEAR
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long sz)

View file

@ -27,6 +27,7 @@
#include <asm/isc.h>
#include <asm/guarded_storage.h>
#define KVM_HAVE_MMU_RWLOCK
#define KVM_MAX_VCPUS 255
#define KVM_INTERNAL_MEM_SLOTS 1
@ -441,6 +442,7 @@ struct kvm_vcpu_arch {
bool acrs_loaded;
struct kvm_s390_pv_vcpu pv;
union diag318_info diag318_info;
struct kvm_s390_mmu_cache *mc;
};
struct kvm_vm_stat {
@ -630,8 +632,12 @@ struct kvm_s390_pv {
void *set_aside;
struct list_head need_cleanup;
struct mmu_notifier mmu_notifier;
/* Protects against concurrent import-like operations */
struct mutex import_lock;
};
struct kvm_s390_mmu_cache;
struct kvm_arch {
struct esca_block *sca;
debug_info_t *dbf;
@ -671,6 +677,7 @@ struct kvm_arch {
struct kvm_s390_pv pv;
struct list_head kzdev_list;
spinlock_t kzdev_list_lock;
struct kvm_s390_mmu_cache *mc;
};
#define KVM_HVA_ERR_BAD (-1UL)

View file

@ -18,24 +18,11 @@ typedef struct {
unsigned long vdso_base;
/* The mmu context belongs to a secure guest. */
atomic_t protected_count;
/*
* The following bitfields need a down_write on the mm
* semaphore when they are written to. As they are only
* written once, they can be read without a lock.
*/
/* The mmu context uses extended page tables. */
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int uses_skeys:1;
/* The mmu context uses CMM. */
unsigned int uses_cmm:1;
/*
* The mmu context allows COW-sharing of memory pages (KSM, zeropage).
* Note that COW-sharing during fork() is currently always allowed.
*/
unsigned int allow_cow_sharing:1;
/* The gmaps associated with this context are allowed to use huge pages. */
unsigned int allow_gmap_hpage_1m:1;
} mm_context_t;
#define INIT_MM_CONTEXT(name) \

View file

@ -29,12 +29,8 @@ static inline int init_new_context(struct task_struct *tsk,
atomic_set(&mm->context.protected_count, 0);
mm->context.gmap_asce = 0;
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
mm->context.has_pgste = 0;
mm->context.uses_skeys = 0;
mm->context.uses_cmm = 0;
#if IS_ENABLED(CONFIG_KVM)
mm->context.allow_cow_sharing = 1;
mm->context.allow_gmap_hpage_1m = 0;
#endif
switch (mm->context.asce_limit) {
default:

View file

@ -78,7 +78,6 @@ static inline void copy_page(void *to, void *from)
#ifdef STRICT_MM_TYPECHECKS
typedef struct { unsigned long pgprot; } pgprot_t;
typedef struct { unsigned long pgste; } pgste_t;
typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
typedef struct { unsigned long pud; } pud_t;
@ -94,7 +93,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \
#else /* STRICT_MM_TYPECHECKS */
typedef unsigned long pgprot_t;
typedef unsigned long pgste_t;
typedef unsigned long pte_t;
typedef unsigned long pmd_t;
typedef unsigned long pud_t;
@ -110,7 +108,6 @@ static __always_inline unsigned long name ## _val(name ## _t name) \
#endif /* STRICT_MM_TYPECHECKS */
DEFINE_PGVAL_FUNC(pgprot)
DEFINE_PGVAL_FUNC(pgste)
DEFINE_PGVAL_FUNC(pte)
DEFINE_PGVAL_FUNC(pmd)
DEFINE_PGVAL_FUNC(pud)
@ -120,7 +117,6 @@ DEFINE_PGVAL_FUNC(pgd)
typedef pte_t *pgtable_t;
#define __pgprot(x) ((pgprot_t) { (x) } )
#define __pgste(x) ((pgste_t) { (x) } )
#define __pte(x) ((pte_t) { (x) } )
#define __pmd(x) ((pmd_t) { (x) } )
#define __pud(x) ((pud_t) { (x) } )

View file

@ -27,10 +27,6 @@ unsigned long *page_table_alloc_noprof(struct mm_struct *);
#define page_table_alloc(...) alloc_hooks(page_table_alloc_noprof(__VA_ARGS__))
void page_table_free(struct mm_struct *, unsigned long *);
struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm);
#define page_table_alloc_pgste(...) alloc_hooks(page_table_alloc_pgste_noprof(__VA_ARGS__))
void page_table_free_pgste(struct ptdesc *ptdesc);
static inline void crst_table_init(unsigned long *crst, unsigned long entry)
{
memset64((u64 *)crst, entry, _CRST_ENTRIES);

View file

@ -413,28 +413,6 @@ void setup_protection_map(void);
* SW-bits: y young, d dirty, r read, w write
*/
/* Page status table bits for virtualization */
#define PGSTE_ACC_BITS 0xf000000000000000UL
#define PGSTE_FP_BIT 0x0800000000000000UL
#define PGSTE_PCL_BIT 0x0080000000000000UL
#define PGSTE_HR_BIT 0x0040000000000000UL
#define PGSTE_HC_BIT 0x0020000000000000UL
#define PGSTE_GR_BIT 0x0004000000000000UL
#define PGSTE_GC_BIT 0x0002000000000000UL
#define PGSTE_ST2_MASK 0x0000ffff00000000UL
#define PGSTE_UC_BIT 0x0000000000008000UL /* user dirty (migration) */
#define PGSTE_IN_BIT 0x0000000000004000UL /* IPTE notify bit */
#define PGSTE_VSIE_BIT 0x0000000000002000UL /* ref'd in a shadow table */
/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
#define _PGSTE_GPS_NODAT 0x0000000040000000UL
#define _PGSTE_GPS_USAGE_MASK 0x0000000003000000UL
#define _PGSTE_GPS_USAGE_STABLE 0x0000000000000000UL
#define _PGSTE_GPS_USAGE_UNUSED 0x0000000001000000UL
#define _PGSTE_GPS_USAGE_POT_VOLATILE 0x0000000002000000UL
#define _PGSTE_GPS_USAGE_VOLATILE _PGSTE_GPS_USAGE_MASK
/*
* A user page table pointer has the space-switch-event bit, the
* private-space-control bit and the storage-alteration-event-control
@ -566,34 +544,15 @@ static inline bool mm_pmd_folded(struct mm_struct *mm)
}
#define mm_pmd_folded(mm) mm_pmd_folded(mm)
static inline int mm_has_pgste(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
if (unlikely(mm->context.has_pgste))
return 1;
#endif
return 0;
}
static inline int mm_is_protected(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
#if IS_ENABLED(CONFIG_KVM)
if (unlikely(atomic_read(&mm->context.protected_count)))
return 1;
#endif
return 0;
}
static inline pgste_t clear_pgste_bit(pgste_t pgste, unsigned long mask)
{
return __pgste(pgste_val(pgste) & ~mask);
}
static inline pgste_t set_pgste_bit(pgste_t pgste, unsigned long mask)
{
return __pgste(pgste_val(pgste) | mask);
}
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
return __pte(pte_val(pte) & ~pgprot_val(prot));
@ -632,22 +591,13 @@ static inline pud_t set_pud_bit(pud_t pud, pgprot_t prot)
#define mm_forbids_zeropage mm_forbids_zeropage
static inline int mm_forbids_zeropage(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
#if IS_ENABLED(CONFIG_KVM)
if (!mm->context.allow_cow_sharing)
return 1;
#endif
return 0;
}
static inline int mm_uses_skeys(struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
if (mm->context.uses_skeys)
return 1;
#endif
return 0;
}
/**
* cspg() - Compare and Swap and Purge (CSPG)
* @ptr: Pointer to the value to be exchanged
@ -1136,6 +1086,13 @@ static inline pte_t pte_mkhuge(pte_t pte)
}
#endif
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
asm volatile("sske %[skey],%[addr],1"
: [addr] "+a" (addr) : [skey] "d" (skey));
return addr;
}
#define IPTE_GLOBAL 0
#define IPTE_LOCAL 1
@ -1232,7 +1189,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
/* At this point the reference through the mapping is still present */
if (mm_is_protected(mm) && pte_present(res))
uv_convert_from_secure_pte(res);
WARN_ON_ONCE(uv_convert_from_secure_pte(res));
return res;
}
@ -1250,7 +1207,7 @@ static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID));
/* At this point the reference through the mapping is still present */
if (mm_is_protected(vma->vm_mm) && pte_present(res))
uv_convert_from_secure_pte(res);
WARN_ON_ONCE(uv_convert_from_secure_pte(res));
return res;
}
@ -1287,9 +1244,10 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
/*
* If something went wrong and the page could not be destroyed, or
* if this is not a mm teardown, the slower export is used as
* fallback instead.
* fallback instead. If even that fails, print a warning and leak
* the page, to avoid crashing the whole system.
*/
uv_convert_from_secure_pte(res);
WARN_ON_ONCE(uv_convert_from_secure_pte(res));
return res;
}
@ -1348,50 +1306,13 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
{
if (pte_same(*ptep, entry))
return 0;
if (cpu_has_rdp() && !mm_has_pgste(vma->vm_mm) && pte_allow_rdp(*ptep, entry))
if (cpu_has_rdp() && pte_allow_rdp(*ptep, entry))
ptep_reset_dat_prot(vma->vm_mm, addr, ptep, entry);
else
ptep_xchg_direct(vma->vm_mm, addr, ptep, entry);
return 1;
}
/*
* Additional functions to handle KVM guest page tables
*/
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry);
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void ptep_notify(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long bits);
int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
pte_t *ptep, int prot, unsigned long bit);
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , int reset);
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
pte_t *sptep, pte_t *tptep, pte_t pte);
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long address,
pte_t *ptep);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, unsigned char *oldkey,
bool nq, bool mr, bool mc);
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char *key);
int set_pgste_bits(struct mm_struct *mm, unsigned long addr,
unsigned long bits, unsigned long value);
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste);
void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr);
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr);
#define pgprot_writecombine pgprot_writecombine
pgprot_t pgprot_writecombine(pgprot_t prot);
@ -1406,23 +1327,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
{
if (pte_present(entry))
entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED));
if (mm_has_pgste(mm)) {
for (;;) {
ptep_set_pte_at(mm, addr, ptep, entry);
if (--nr == 0)
break;
ptep++;
entry = __pte(pte_val(entry) + PAGE_SIZE);
addr += PAGE_SIZE;
}
} else {
for (;;) {
set_pte(ptep, entry);
if (--nr == 0)
break;
ptep++;
entry = __pte(pte_val(entry) + PAGE_SIZE);
}
for (;;) {
set_pte(ptep, entry);
if (--nr == 0)
break;
ptep++;
entry = __pte(pte_val(entry) + PAGE_SIZE);
}
}
#define set_ptes set_ptes
@ -2015,9 +1925,6 @@ extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t p
extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot);
extern void vmem_unmap_4k_page(unsigned long addr);
extern pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc);
extern int s390_enable_sie(void);
extern int s390_enable_skey(void);
extern void s390_reset_cmma(struct mm_struct *mm);
/* s390 has a private copy of get unmapped area to deal with cache synonyms */
#define HAVE_ARCH_UNMAPPED_AREA
@ -2026,40 +1933,4 @@ extern void s390_reset_cmma(struct mm_struct *mm);
#define pmd_pgtable(pmd) \
((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE))
static inline unsigned long gmap_pgste_get_pgt_addr(unsigned long *pgt)
{
unsigned long *pgstes, res;
pgstes = pgt + _PAGE_ENTRIES;
res = (pgstes[0] & PGSTE_ST2_MASK) << 16;
res |= pgstes[1] & PGSTE_ST2_MASK;
res |= (pgstes[2] & PGSTE_ST2_MASK) >> 16;
res |= (pgstes[3] & PGSTE_ST2_MASK) >> 32;
return res;
}
static inline pgste_t pgste_get_lock(pte_t *ptep)
{
unsigned long value = 0;
#ifdef CONFIG_PGSTE
unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE);
do {
value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr);
} while (value & PGSTE_PCL_BIT);
value |= PGSTE_PCL_BIT;
#endif
return __pgste(value);
}
static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
barrier();
WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT);
#endif
}
#endif /* _S390_PAGE_H */

View file

@ -36,7 +36,6 @@ static inline bool __tlb_remove_folio_pages(struct mmu_gather *tlb,
#include <asm/tlbflush.h>
#include <asm-generic/tlb.h>
#include <asm/gmap.h>
/*
* Release the page cache reference for a pte removed by
@ -85,8 +84,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_pmds = 1;
if (mm_has_pgste(tlb->mm))
gmap_unlink(tlb->mm, (unsigned long *)pte, address);
tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte));
}

View file

@ -471,65 +471,15 @@ do { \
#define arch_get_kernel_nofault __mvc_kernel_nofault
#define arch_put_kernel_nofault __mvc_kernel_nofault
void __cmpxchg_user_key_called_with_bad_pointer(void);
int __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
unsigned char old, unsigned char new, unsigned long key);
int __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
unsigned short old, unsigned short new, unsigned long key);
int __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
unsigned int old, unsigned int new, unsigned long key);
int __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
unsigned long old, unsigned long new, unsigned long key);
int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
__uint128_t old, __uint128_t new, unsigned long key);
static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval,
__uint128_t old, __uint128_t new,
unsigned long key, int size)
{
switch (size) {
case 1: return __cmpxchg_user_key1(address, uval, old, new, key);
case 2: return __cmpxchg_user_key2(address, uval, old, new, key);
case 4: return __cmpxchg_user_key4(address, uval, old, new, key);
case 8: return __cmpxchg_user_key8(address, uval, old, new, key);
case 16: return __cmpxchg_user_key16(address, uval, old, new, key);
default: __cmpxchg_user_key_called_with_bad_pointer();
}
return 0;
}
/**
* cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys
* @ptr: User space address of value to compare to @old and exchange with
* @new. Must be aligned to sizeof(*@ptr).
* @uval: Address where the old value of *@ptr is written to.
* @old: Old value. Compared to the content pointed to by @ptr in order to
* determine if the exchange occurs. The old value read from *@ptr is
* written to *@uval.
* @new: New value to place at *@ptr.
* @key: Access key to use for checking storage key protection.
*
* Perform a cmpxchg on a user space target, honoring storage key protection.
* @key alone determines how key checking is performed, neither
* storage-protection-override nor fetch-protection-override apply.
* The caller must compare *@uval and @old to determine if values have been
* exchanged. In case of an exception *@uval is set to zero.
*
* Return: 0: cmpxchg executed
* -EFAULT: an exception happened when trying to access *@ptr
* -EAGAIN: maxed out number of retries (byte and short only)
*/
#define cmpxchg_user_key(ptr, uval, old, new, key) \
({ \
__typeof__(ptr) __ptr = (ptr); \
__typeof__(uval) __uval = (uval); \
\
BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \
might_fault(); \
__chk_user_ptr(__ptr); \
_cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \
(old), (new), (key), sizeof(*(__ptr))); \
})
int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old,
unsigned char new, unsigned long key);
int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old,
unsigned short new, unsigned long key);
int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
unsigned int new, unsigned long key);
int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
unsigned long new, unsigned long key);
int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
__uint128_t new, unsigned long key);
#endif /* __S390_UACCESS_H */

View file

@ -631,7 +631,8 @@ int uv_pin_shared(unsigned long paddr);
int uv_destroy_folio(struct folio *folio);
int uv_destroy_pte(pte_t pte);
int uv_convert_from_secure_pte(pte_t pte);
int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb);
int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio);
int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb);
int uv_convert_from_secure(unsigned long paddr);
int uv_convert_from_secure_folio(struct folio *folio);

View file

@ -134,14 +134,15 @@ static int uv_destroy(unsigned long paddr)
*/
int uv_destroy_folio(struct folio *folio)
{
unsigned long i;
int rc;
/* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
folio_get(folio);
rc = uv_destroy(folio_to_phys(folio));
for (i = 0; i < (1 << folio_order(folio)); i++) {
rc = uv_destroy(folio_to_phys(folio) + i * PAGE_SIZE);
if (rc)
break;
}
if (!rc)
clear_bit(PG_arch_1, &folio->flags.f);
folio_put(folio);
@ -183,14 +184,15 @@ EXPORT_SYMBOL_GPL(uv_convert_from_secure);
*/
int uv_convert_from_secure_folio(struct folio *folio)
{
unsigned long i;
int rc;
/* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
folio_get(folio);
rc = uv_convert_from_secure(folio_to_phys(folio));
for (i = 0; i < (1 << folio_order(folio)); i++) {
rc = uv_convert_from_secure(folio_to_phys(folio) + i * PAGE_SIZE);
if (rc)
break;
}
if (!rc)
clear_bit(PG_arch_1, &folio->flags.f);
folio_put(folio);
@ -207,39 +209,6 @@ int uv_convert_from_secure_pte(pte_t pte)
return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte)));
}
/**
* should_export_before_import - Determine whether an export is needed
* before an import-like operation
* @uvcb: the Ultravisor control block of the UVC to be performed
* @mm: the mm of the process
*
* Returns whether an export is needed before every import-like operation.
* This is needed for shared pages, which don't trigger a secure storage
* exception when accessed from a different guest.
*
* Although considered as one, the Unpin Page UVC is not an actual import,
* so it is not affected.
*
* No export is needed also when there is only one protected VM, because the
* page cannot belong to the wrong VM in that case (there is no "other VM"
* it can belong to).
*
* Return: true if an export is needed before every import, otherwise false.
*/
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
{
/*
* The misc feature indicates, among other things, that importing a
* shared page from a different protected VM will automatically also
* transfer its ownership.
*/
if (uv_has_feature(BIT_UV_FEAT_MISC))
return false;
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
return false;
return atomic_read(&mm->context.protected_count) > 1;
}
/*
* Calculate the expected ref_count for a folio that would otherwise have no
* further pins. This was cribbed from similar functions in other places in
@ -279,7 +248,7 @@ static int expected_folio_refs(struct folio *folio)
* (it's the same logic as split_folio()), and the folio must be
* locked.
*/
static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
{
int expected, cc = 0;
@ -309,20 +278,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb)
return -EAGAIN;
return uvcb->rc == 0x10a ? -ENXIO : -EINVAL;
}
/*
 * make_folio_secure() - lock the folio and try to make it secure.
 * @mm:    the mm of the process owning the folio.
 * @folio: the folio to make secure.
 * @uvcb:  the Ultravisor control block of the UVC to be performed.
 *
 * If needed (see should_export_before_import()), the folio is exported
 * before the import-like operation is attempted.
 *
 * Return: -EAGAIN if the folio lock could not be taken immediately,
 *         otherwise the return value of __make_folio_secure().
 */
static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb)
{
	int rc;
	/* Callers retry on -EAGAIN, so do not sleep waiting for the lock. */
	if (!folio_trylock(folio))
		return -EAGAIN;
	if (should_export_before_import(uvcb, mm))
		uv_convert_from_secure(folio_to_phys(folio));
	rc = __make_folio_secure(folio, uvcb);
	folio_unlock(folio);
	return rc;
}
EXPORT_SYMBOL(__make_folio_secure);
/**
* s390_wiggle_split_folio() - try to drain extra references to a folio and
@ -337,7 +293,7 @@ static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct u
* but another attempt can be made;
* -EINVAL in case of other folio splitting errors. See split_folio().
*/
static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
{
int rc, tried_splits;
@ -409,56 +365,7 @@ static int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio)
}
return -EAGAIN;
}
/*
 * make_hva_secure() - make the page backing the given host virtual address
 * secure.
 * @mm:   the mm of the process owning the page.
 * @hva:  the host virtual address whose backing page is to be made secure.
 * @uvcb: the Ultravisor control block of the UVC to be performed.
 *
 * Looks up the folio backing @hva under the mmap read lock and attempts to
 * make it secure. Large folios are rejected (secure pages cannot be huge);
 * for -E2BIG/-EBUSY an attempt is made to split the folio so the caller
 * can retry.
 *
 * Return: 0 on success, a negative error code otherwise (-EAGAIN means the
 *         caller should retry).
 */
int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb)
{
	struct vm_area_struct *vma;
	struct folio_walk fw;
	struct folio *folio;
	int rc;
	mmap_read_lock(mm);
	vma = vma_lookup(mm, hva);
	if (!vma) {
		mmap_read_unlock(mm);
		return -EFAULT;
	}
	folio = folio_walk_start(&fw, vma, hva, 0);
	if (!folio) {
		mmap_read_unlock(mm);
		return -ENXIO;
	}
	/* Hold a reference so the folio stays valid after the walk ends. */
	folio_get(folio);
	/*
	 * Secure pages cannot be huge and userspace should not combine both.
	 * In case userspace does it anyway this will result in an -EFAULT for
	 * the unpack. The guest is thus never reaching secure mode.
	 * If userspace plays dirty tricks and decides to map huge pages at a
	 * later point in time, it will receive a segmentation fault or
	 * KVM_RUN will return -EFAULT.
	 */
	if (folio_test_hugetlb(folio))
		rc = -EFAULT;
	else if (folio_test_large(folio))
		rc = -E2BIG;
	else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID))
		rc = -ENXIO;
	else
		rc = make_folio_secure(mm, folio, uvcb);
	folio_walk_end(&fw, vma);
	mmap_read_unlock(mm);
	/* Large or overly-referenced folio: try to split it, then retry. */
	if (rc == -E2BIG || rc == -EBUSY) {
		rc = s390_wiggle_split_folio(mm, folio);
		if (!rc)
			rc = -EAGAIN;
	}
	folio_put(folio);
	return rc;
}
EXPORT_SYMBOL_GPL(make_hva_secure);
EXPORT_SYMBOL_GPL(s390_wiggle_split_folio);
/*
* To be called with the folio locked or with an extra reference! This will
@ -470,21 +377,18 @@ int arch_make_folio_accessible(struct folio *folio)
{
int rc = 0;
/* Large folios cannot be secure */
if (unlikely(folio_test_large(folio)))
return 0;
/*
* PG_arch_1 is used in 2 places:
* 1. for storage keys of hugetlb folios and KVM
* 2. As an indication that this small folio might be secure. This can
* overindicate, e.g. we set the bit before calling
* convert_to_secure.
* As secure pages are never large folios, both variants can co-exists.
* PG_arch_1 is used as an indication that this small folio might be
* secure. This can overindicate, e.g. we set the bit before calling
* convert_to_secure.
*/
if (!test_bit(PG_arch_1, &folio->flags.f))
return 0;
/* Large folios cannot be secure. */
if (WARN_ON_ONCE(folio_test_large(folio)))
return -EFAULT;
rc = uv_pin_shared(folio_to_phys(folio));
if (!rc) {
clear_bit(PG_arch_1, &folio->flags.f);

View file

@ -30,6 +30,8 @@ config KVM
select KVM_VFIO
select MMU_NOTIFIER
select VIRT_XFER_TO_GUEST_WORK
select KVM_GENERIC_MMU_NOTIFIER
select KVM_MMU_LOCKLESS_AGING
help
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work

View file

@ -8,7 +8,8 @@ include $(srctree)/virt/kvm/Makefile.kvm
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o
kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o
kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o
kvm-y += dat.o gmap.o faultin.o
kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
obj-$(CONFIG_KVM) += kvm.o

1391
arch/s390/kvm/dat.c Normal file

File diff suppressed because it is too large Load diff

970
arch/s390/kvm/dat.h Normal file
View file

@ -0,0 +1,970 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* KVM guest address space mapping code
*
* Copyright IBM Corp. 2024, 2025
* Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
*/
#ifndef __KVM_S390_DAT_H
#define __KVM_S390_DAT_H
#include <linux/radix-tree.h>
#include <linux/refcount.h>
#include <linux/io.h>
#include <linux/kvm_types.h>
#include <linux/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/dat-bits.h>
/*
* Base address and length must be sent at the start of each block, therefore
* it's cheaper to send some clean data, as long as it's less than the size of
* two longs.
*/
#define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *))
/* For consistency */
#define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
#define _ASCE(x) ((union asce) { .val = (x), })
#define NULL_ASCE _ASCE(0)
enum {
_DAT_TOKEN_NONE = 0,
_DAT_TOKEN_PIC,
};
#define _CRSTE_TOK(l, t, p) ((union crste) { \
.tok.i = 1, \
.tok.tt = (l), \
.tok.type = (t), \
.tok.par = (p) \
})
#define _CRSTE_PIC(l, p) _CRSTE_TOK(l, _DAT_TOKEN_PIC, p)
#define _CRSTE_HOLE(l) _CRSTE_PIC(l, PGM_ADDRESSING)
#define _CRSTE_EMPTY(l) _CRSTE_TOK(l, _DAT_TOKEN_NONE, 0)
#define _PMD_EMPTY _CRSTE_EMPTY(TABLE_TYPE_SEGMENT)
#define _PTE_TOK(t, p) ((union pte) { .tok.i = 1, .tok.type = (t), .tok.par = (p) })
#define _PTE_EMPTY _PTE_TOK(_DAT_TOKEN_NONE, 0)
/* This fake table type is used for page table walks (both for normal page tables and vSIE) */
#define TABLE_TYPE_PAGE_TABLE -1
enum dat_walk_flags {
DAT_WALK_USES_SKEYS = 0x40,
DAT_WALK_CONTINUE = 0x20,
DAT_WALK_IGN_HOLES = 0x10,
DAT_WALK_SPLIT = 0x08,
DAT_WALK_ALLOC = 0x04,
DAT_WALK_ANY = 0x02,
DAT_WALK_LEAF = 0x01,
DAT_WALK_DEFAULT = 0
};
#define DAT_WALK_SPLIT_ALLOC (DAT_WALK_SPLIT | DAT_WALK_ALLOC)
#define DAT_WALK_ALLOC_CONTINUE (DAT_WALK_CONTINUE | DAT_WALK_ALLOC)
#define DAT_WALK_LEAF_ALLOC (DAT_WALK_LEAF | DAT_WALK_ALLOC)
union pte {
unsigned long val;
union page_table_entry h;
struct {
unsigned long :56; /* Hardware bits */
unsigned long u : 1; /* Page unused */
unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable */
unsigned long r : 1; /* Readable */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
unsigned long sd: 1; /* Soft dirty */
unsigned long pr: 1; /* Present */
} s;
struct {
unsigned char hwbytes[7];
unsigned char swbyte;
};
union {
struct {
unsigned long type :16; /* Token type */
unsigned long par :16; /* Token parameter */
unsigned long :20;
unsigned long : 1; /* Must be 0 */
unsigned long i : 1; /* Must be 1 */
unsigned long : 2;
unsigned long : 7;
unsigned long pr : 1; /* Must be 0 */
};
struct {
unsigned long token:32; /* Token and parameter */
unsigned long :32;
};
} tok;
};
/* Soft dirty, needed as macro for atomic operations on ptes */
#define _PAGE_SD 0x002
/* Needed as macro to perform atomic operations */
#define PGSTE_PCL_BIT 0x0080000000000000UL /* PCL lock, HW bit */
#define PGSTE_CMMA_D_BIT 0x0000000000008000UL /* CMMA dirty soft-bit */
enum pgste_gps_usage {
PGSTE_GPS_USAGE_STABLE = 0,
PGSTE_GPS_USAGE_UNUSED,
PGSTE_GPS_USAGE_POT_VOLATILE,
PGSTE_GPS_USAGE_VOLATILE,
};
union pgste {
unsigned long val;
struct {
unsigned long acc : 4;
unsigned long fp : 1;
unsigned long : 3;
unsigned long pcl : 1;
unsigned long hr : 1;
unsigned long hc : 1;
unsigned long : 2;
unsigned long gr : 1;
unsigned long gc : 1;
unsigned long : 1;
unsigned long :16; /* val16 */
unsigned long zero : 1;
unsigned long nodat : 1;
unsigned long : 4;
unsigned long usage : 2;
unsigned long : 8;
unsigned long cmma_d : 1; /* Dirty flag for CMMA bits */
unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
unsigned long : 5;
unsigned long : 8;
};
struct {
unsigned short hwbytes0;
unsigned short val16; /* Used to store chunked values, see dat_{s,g}et_ptval() */
unsigned short hwbytes4;
unsigned char flags; /* Maps to the software bits */
unsigned char hwbyte7;
} __packed;
};
union pmd {
unsigned long val;
union segment_table_entry h;
struct {
struct {
unsigned long :44; /* HW */
unsigned long : 3; /* Unused */
unsigned long : 1; /* HW */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
} fc1;
} s;
};
union pud {
unsigned long val;
union region3_table_entry h;
struct {
struct {
unsigned long :33; /* HW */
unsigned long :14; /* Unused */
unsigned long : 1; /* HW */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
} fc1;
} s;
};
union p4d {
unsigned long val;
union region2_table_entry h;
};
union pgd {
unsigned long val;
union region1_table_entry h;
};
union crste {
unsigned long val;
union {
struct {
unsigned long :52;
unsigned long : 1;
unsigned long fc: 1;
unsigned long p : 1;
unsigned long : 1;
unsigned long : 2;
unsigned long i : 1;
unsigned long : 1;
unsigned long tt: 2;
unsigned long : 2;
};
struct {
unsigned long to:52;
unsigned long : 1;
unsigned long fc: 1;
unsigned long p : 1;
unsigned long : 1;
unsigned long tf: 2;
unsigned long i : 1;
unsigned long : 1;
unsigned long tt: 2;
unsigned long tl: 2;
} fc0;
struct {
unsigned long :47;
unsigned long av : 1; /* ACCF-Validity Control */
unsigned long acc: 4; /* Access-Control Bits */
unsigned long f : 1; /* Fetch-Protection Bit */
unsigned long fc : 1; /* Format-Control */
unsigned long p : 1; /* DAT-Protection Bit */
unsigned long iep: 1; /* Instruction-Execution-Protection */
unsigned long : 2;
unsigned long i : 1; /* Segment-Invalid Bit */
unsigned long cs : 1; /* Common-Segment Bit */
unsigned long tt : 2; /* Table-Type Bits */
unsigned long : 2;
} fc1;
} h;
struct {
struct {
unsigned long :47;
unsigned long : 1; /* HW (should be 0) */
unsigned long w : 1; /* Writable */
unsigned long r : 1; /* Readable */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
unsigned long : 1;
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
} fc1;
} s;
union {
struct {
unsigned long type :16; /* Token type */
unsigned long par :16; /* Token parameter */
unsigned long :26;
unsigned long i : 1; /* Must be 1 */
unsigned long : 1;
unsigned long tt : 2;
unsigned long : 1;
unsigned long pr : 1; /* Must be 0 */
};
struct {
unsigned long token:32; /* Token and parameter */
unsigned long :32;
};
} tok;
union pmd pmd;
union pud pud;
union p4d p4d;
union pgd pgd;
};
union skey {
unsigned char skey;
struct {
unsigned char acc :4;
unsigned char fp :1;
unsigned char r :1;
unsigned char c :1;
unsigned char zero:1;
};
};
static_assert(sizeof(union pgste) == sizeof(unsigned long));
static_assert(sizeof(union pte) == sizeof(unsigned long));
static_assert(sizeof(union pmd) == sizeof(unsigned long));
static_assert(sizeof(union pud) == sizeof(unsigned long));
static_assert(sizeof(union p4d) == sizeof(unsigned long));
static_assert(sizeof(union pgd) == sizeof(unsigned long));
static_assert(sizeof(union crste) == sizeof(unsigned long));
static_assert(sizeof(union skey) == sizeof(char));
struct segment_table {
union pmd pmds[_CRST_ENTRIES];
};
struct region3_table {
union pud puds[_CRST_ENTRIES];
};
struct region2_table {
union p4d p4ds[_CRST_ENTRIES];
};
struct region1_table {
union pgd pgds[_CRST_ENTRIES];
};
struct crst_table {
union {
union crste crstes[_CRST_ENTRIES];
struct segment_table segment;
struct region3_table region3;
struct region2_table region2;
struct region1_table region1;
};
};
struct page_table {
union pte ptes[_PAGE_ENTRIES];
union pgste pgstes[_PAGE_ENTRIES];
};
static_assert(sizeof(struct crst_table) == _CRST_TABLE_SIZE);
static_assert(sizeof(struct page_table) == PAGE_SIZE);
struct dat_walk;
typedef long (*dat_walk_op)(union crste *crste, gfn_t gfn, gfn_t next, struct dat_walk *w);
struct dat_walk_ops {
union {
dat_walk_op crste_ops[4];
struct {
dat_walk_op pmd_entry;
dat_walk_op pud_entry;
dat_walk_op p4d_entry;
dat_walk_op pgd_entry;
};
};
long (*pte_entry)(union pte *pte, gfn_t gfn, gfn_t next, struct dat_walk *w);
};
struct dat_walk {
const struct dat_walk_ops *ops;
union crste *last;
union pte *last_pte;
union asce asce;
gfn_t start;
gfn_t end;
int flags;
void *priv;
};
struct ptval_param {
unsigned char offset : 6;
unsigned char len : 2;
};
/**
* _pte() - Useful constructor for union pte
* @pfn: the pfn this pte should point to.
* @writable: whether the pte should be writable.
* @dirty: whether the pte should be dirty.
* @special: whether the pte should be marked as special
*
* The pte is also marked as young and present. If the pte is marked as dirty,
* it gets marked as soft-dirty too. If the pte is not dirty, the hardware
* protect bit is set (independently of the write softbit); this way proper
* dirty tracking can be performed.
*
* Return: a union pte value.
*/
static inline union pte _pte(kvm_pfn_t pfn, bool writable, bool dirty, bool special)
{
	/* Build a present, young pte pointing at @pfn. */
	union pte pte = { .val = PFN_PHYS(pfn) };

	pte.s.pr = 1;
	pte.s.y = 1;
	pte.s.s = special;
	pte.s.w = writable;
	pte.s.d = dirty;
	pte.s.sd = dirty;
	/* Keep hardware write protection until the first dirtying access. */
	pte.h.p = !dirty;
	return pte;
}
/*
 * _crste_fc0() - construct a non-leaf (FC=0) region/segment table entry.
 * @pfn: the pfn of the next-level table this entry should point to.
 * @tt:  the table type of the entry.
 *
 * The entry covers the full next-level table (table length set to the
 * maximum, table offset 0).
 *
 * Return: a union crste value.
 */
static inline union crste _crste_fc0(kvm_pfn_t pfn, int tt)
{
	union crste res = { .val = PFN_PHYS(pfn) };
	res.h.tt = tt;
	res.h.fc0.tl = _REGION_ENTRY_LENGTH;
	res.h.fc0.tf = 0;
	return res;
}
/**
* _crste() - Useful constructor for union crste with FC=1
* @pfn: the pfn this pte should point to.
* @tt: the table type
* @writable: whether the pte should be writable.
* @dirty: whether the pte should be dirty.
*
* The crste is also marked as young and present. If the crste is marked as
* dirty, it gets marked as soft-dirty too. If the crste is not dirty, the
* hardware protect bit is set (independently of the write softbit); this way
* proper dirty tracking can be performed.
*
* Return: a union crste value.
*/
static inline union crste _crste_fc1(kvm_pfn_t pfn, int tt, bool writable, bool dirty)
{
	/* Build a present, young large-page (FC=1) entry for @pfn. */
	union crste crste = { .val = PFN_PHYS(pfn) & _SEGMENT_MASK };

	crste.h.tt = tt;
	crste.h.fc = 1;
	/* Keep hardware write protection until the first dirtying access. */
	crste.h.p = !dirty;
	crste.s.fc1.pr = 1;
	crste.s.fc1.y = 1;
	crste.s.fc1.w = writable;
	crste.s.fc1.d = dirty;
	crste.s.fc1.sd = dirty;
	return crste;
}
union essa_state {
unsigned char val;
struct {
unsigned char : 2;
unsigned char nodat : 1;
unsigned char exception : 1;
unsigned char usage : 2;
unsigned char content : 2;
};
};
/**
* struct vsie_rmap - reverse mapping for shadow page table entries
* @next: pointer to next rmap in the list
* @r_gfn: virtual rmap address in the shadow guest address space
*/
struct vsie_rmap {
struct vsie_rmap *next;
union {
unsigned long val;
struct {
long level: 8;
unsigned long : 4;
unsigned long r_gfn:52;
};
};
};
static_assert(sizeof(struct vsie_rmap) == 2 * sizeof(long));
#define KVM_S390_MMU_CACHE_N_CRSTS 6
#define KVM_S390_MMU_CACHE_N_PTS 2
#define KVM_S390_MMU_CACHE_N_RMAPS 16
struct kvm_s390_mmu_cache {
void *crsts[KVM_S390_MMU_CACHE_N_CRSTS];
void *pts[KVM_S390_MMU_CACHE_N_PTS];
void *rmaps[KVM_S390_MMU_CACHE_N_RMAPS];
short int n_crsts;
short int n_pts;
short int n_rmaps;
};
struct guest_fault {
gfn_t gfn; /* Guest frame */
kvm_pfn_t pfn; /* Host PFN */
struct page *page; /* Host page */
union pte *ptep; /* Used to resolve the fault, or NULL */
union crste *crstep; /* Used to resolve the fault, or NULL */
bool writable; /* Mapping is writable */
bool write_attempt; /* Write access attempted */
bool attempt_pfault; /* Attempt a pfault first */
bool valid; /* This entry contains valid data */
void (*callback)(struct guest_fault *f);
void *priv;
};
/*
* 0 1 2 3 4 5 6 7
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | | PGT_ADDR |
* 8 | VMADDR | |
* 16 | |
* 24 | |
*/
#define MKPTVAL(o, l) ((struct ptval_param) { .offset = (o), .len = ((l) + 1) / 2 - 1})
#define PTVAL_PGT_ADDR MKPTVAL(4, 8)
#define PTVAL_VMADDR MKPTVAL(8, 6)
union pgste __must_check __dat_ptep_xchg(union pte *ptep, union pgste pgste, union pte new,
gfn_t gfn, union asce asce, bool uses_skeys);
bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
union asce asce);
void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce);
long _dat_walk_gfn_range(gfn_t start, gfn_t end, union asce asce,
const struct dat_walk_ops *ops, int flags, void *priv);
int dat_entry_walk(struct kvm_s390_mmu_cache *mc, gfn_t gfn, union asce asce, int flags,
int walk_level, union crste **last, union pte **ptepp);
void dat_free_level(struct crst_table *table, bool owns_ptes);
struct crst_table *dat_alloc_crst_sleepable(unsigned long init);
int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newtype);
int dat_get_storage_key(union asce asce, gfn_t gfn, union skey *skey);
int dat_set_storage_key(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
union skey skey, bool nq);
int dat_cond_set_storage_key(struct kvm_s390_mmu_cache *mmc, union asce asce, gfn_t gfn,
union skey skey, union skey *oldkey, bool nq, bool mr, bool mc);
int dat_reset_reference_bit(union asce asce, gfn_t gfn);
long dat_reset_skeys(union asce asce, gfn_t start);
unsigned long dat_get_ptval(struct page_table *table, struct ptval_param param);
void dat_set_ptval(struct page_table *table, struct ptval_param param, unsigned long val);
int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gfn_t end,
u16 type, u16 param);
int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
bool uses_skeys, struct guest_fault *f);
int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
long dat_reset_cmma(union asce asce, gfn_t start_gfn);
int dat_peek_cmma(gfn_t start, union asce asce, unsigned int *count, u8 *values);
int dat_get_cmma(union asce asce, gfn_t *start, unsigned int *count, u8 *values, atomic64_t *rem);
int dat_set_cmma_bits(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t gfn,
unsigned long count, unsigned long mask, const uint8_t *bits);
int kvm_s390_mmu_cache_topup(struct kvm_s390_mmu_cache *mc);
#define GFP_KVM_S390_MMU_CACHE (GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN)
static inline struct page_table *kvm_s390_mmu_cache_alloc_pt(struct kvm_s390_mmu_cache *mc)
{
if (mc->n_pts)
return mc->pts[--mc->n_pts];
return (void *)__get_free_page(GFP_KVM_S390_MMU_CACHE);
}
static inline struct crst_table *kvm_s390_mmu_cache_alloc_crst(struct kvm_s390_mmu_cache *mc)
{
if (mc->n_crsts)
return mc->crsts[--mc->n_crsts];
return (void *)__get_free_pages(GFP_KVM_S390_MMU_CACHE | __GFP_COMP, CRST_ALLOC_ORDER);
}
static inline struct vsie_rmap *kvm_s390_mmu_cache_alloc_rmap(struct kvm_s390_mmu_cache *mc)
{
if (mc->n_rmaps)
return mc->rmaps[--mc->n_rmaps];
return kzalloc(sizeof(struct vsie_rmap), GFP_KVM_S390_MMU_CACHE);
}
static inline struct crst_table *crste_table_start(union crste *crstep)
{
return (struct crst_table *)ALIGN_DOWN((unsigned long)crstep, _CRST_TABLE_SIZE);
}
static inline struct page_table *pte_table_start(union pte *ptep)
{
return (struct page_table *)ALIGN_DOWN((unsigned long)ptep, _PAGE_TABLE_SIZE);
}
static inline bool crdte_crste(union crste *crstep, union crste old, union crste new, gfn_t gfn,
union asce asce)
{
unsigned long dtt = 0x10 | new.h.tt << 2;
void *table = crste_table_start(crstep);
return crdte(old.val, new.val, table, dtt, gfn_to_gpa(gfn), asce.val);
}
/**
* idte_crste() - invalidate a crste entry using idte
* @crstep: pointer to the crste to be invalidated
* @gfn: a gfn mapped by the crste
* @opt: options for the idte instruction
* @asce: the asce
* @local: whether the operation is cpu-local
*/
static __always_inline void idte_crste(union crste *crstep, gfn_t gfn, unsigned long opt,
				       union asce asce, int local)
{
	/* IDTE takes the origin of the table containing the entry ... */
	unsigned long table_origin = __pa(crste_table_start(crstep));
	/* ... and a guest address mapped by that entry. */
	unsigned long gaddr = gfn_to_gpa(gfn) & HPAGE_MASK;
	/*
	 * When @opt is a compile-time zero, use the cheaper form of the
	 * instruction that does not take the guest ASCE. The function is
	 * __always_inline so this check folds away at compile time.
	 */
	if (__builtin_constant_p(opt) && opt == 0) {
		/* flush without guest asce */
		asm volatile("idte %[table_origin],0,%[gaddr],%[local]"
			     : "+m" (*crstep)
			     : [table_origin] "a" (table_origin), [gaddr] "a" (gaddr),
			       [local] "i" (local)
			     : "cc");
	} else {
		/* flush with guest asce */
		asm volatile("idte %[table_origin],%[asce],%[gaddr_opt],%[local]"
			     : "+m" (*crstep)
			     : [table_origin] "a" (table_origin), [gaddr_opt] "a" (gaddr | opt),
			       [asce] "a" (asce.val), [local] "i" (local)
			     : "cc");
	}
}
static inline void dat_init_pgstes(struct page_table *pt, unsigned long val)
{
memset64((void *)pt->pgstes, val, PTRS_PER_PTE);
}
static inline void dat_init_page_table(struct page_table *pt, unsigned long ptes,
unsigned long pgstes)
{
memset64((void *)pt->ptes, ptes, PTRS_PER_PTE);
dat_init_pgstes(pt, pgstes);
}
/*
 * asce_end() - return the first gfn beyond the range addressable by @asce.
 *
 * NOTE(review): formula assumes each additional designation type level
 * (asce.dt) adds 11 bits of addressable space on top of the segment
 * granularity -- matches the 2048-entry table layout, but confirm against
 * the architecture documentation.
 */
static inline gfn_t asce_end(union asce asce)
{
	return 1ULL << ((asce.dt + 1) * 11 + _SEGMENT_SHIFT - PAGE_SHIFT);
}
#define _CRSTE(x) ((union crste) { .val = _Generic((x), \
union pgd : (x).val, \
union p4d : (x).val, \
union pud : (x).val, \
union pmd : (x).val, \
union crste : (x).val)})
#define _CRSTEP(x) ((union crste *)_Generic((*(x)), \
union pgd : (x), \
union p4d : (x), \
union pud : (x), \
union pmd : (x), \
union crste : (x)))
#define _CRSTP(x) ((struct crst_table *)_Generic((*(x)), \
struct crst_table : (x), \
struct segment_table : (x), \
struct region3_table : (x), \
struct region2_table : (x), \
struct region1_table : (x)))
static inline bool asce_contains_gfn(union asce asce, gfn_t gfn)
{
return gfn < asce_end(asce);
}
static inline bool is_pmd(union crste crste)
{
return crste.h.tt == TABLE_TYPE_SEGMENT;
}
static inline bool is_pud(union crste crste)
{
return crste.h.tt == TABLE_TYPE_REGION3;
}
static inline bool is_p4d(union crste crste)
{
return crste.h.tt == TABLE_TYPE_REGION2;
}
static inline bool is_pgd(union crste crste)
{
return crste.h.tt == TABLE_TYPE_REGION1;
}
static inline phys_addr_t pmd_origin_large(union pmd pmd)
{
return pmd.val & _SEGMENT_ENTRY_ORIGIN_LARGE;
}
static inline phys_addr_t pud_origin_large(union pud pud)
{
return pud.val & _REGION3_ENTRY_ORIGIN_LARGE;
}
/**
* crste_origin_large() - Return the large frame origin of a large crste
* @crste: The crste whose origin is to be returned. Should be either a
* region-3 table entry or a segment table entry, in both cases with
* FC set to 1 (large pages).
*
* Return: The origin of the large frame pointed to by @crste, or -1 if the
* crste was not large (wrong table type, or FC==0)
*/
static inline phys_addr_t crste_origin_large(union crste crste)
{
	/* Only FC=1 segment or region-3 entries map a large frame. */
	if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3))
		return -1;
	return is_pmd(crste) ? pmd_origin_large(crste.pmd)
			     : pud_origin_large(crste.pud);
}
#define crste_origin(x) (_Generic((x), \
union pmd : (x).val & _SEGMENT_ENTRY_ORIGIN, \
union pud : (x).val & _REGION_ENTRY_ORIGIN, \
union p4d : (x).val & _REGION_ENTRY_ORIGIN, \
union pgd : (x).val & _REGION_ENTRY_ORIGIN))
static inline unsigned long pte_origin(union pte pte)
{
return pte.val & PAGE_MASK;
}
static inline bool pmd_prefix(union pmd pmd)
{
return pmd.h.fc && pmd.s.fc1.prefix_notif;
}
static inline bool pud_prefix(union pud pud)
{
return pud.h.fc && pud.s.fc1.prefix_notif;
}
static inline bool crste_leaf(union crste crste)
{
return (crste.h.tt <= TABLE_TYPE_REGION3) && crste.h.fc;
}
static inline bool crste_prefix(union crste crste)
{
return crste_leaf(crste) && crste.s.fc1.prefix_notif;
}
static inline bool crste_dirty(union crste crste)
{
return crste_leaf(crste) && crste.s.fc1.d;
}
static inline union pgste *pgste_of(union pte *pte)
{
return (union pgste *)(pte + _PAGE_ENTRIES);
}
static inline bool pte_hole(union pte pte)
{
return pte.h.i && !pte.tok.pr && pte.tok.type != _DAT_TOKEN_NONE;
}
static inline bool _crste_hole(union crste crste)
{
return crste.h.i && !crste.tok.pr && crste.tok.type != _DAT_TOKEN_NONE;
}
#define crste_hole(x) _crste_hole(_CRSTE(x))
static inline bool _crste_none(union crste crste)
{
return crste.h.i && !crste.tok.pr && crste.tok.type == _DAT_TOKEN_NONE;
}
#define crste_none(x) _crste_none(_CRSTE(x))
static inline phys_addr_t large_pud_to_phys(union pud pud, gfn_t gfn)
{
return pud_origin_large(pud) | (gfn_to_gpa(gfn) & ~_REGION3_MASK);
}
static inline phys_addr_t large_pmd_to_phys(union pmd pmd, gfn_t gfn)
{
return pmd_origin_large(pmd) | (gfn_to_gpa(gfn) & ~_SEGMENT_MASK);
}
static inline phys_addr_t large_crste_to_phys(union crste crste, gfn_t gfn)
{
if (unlikely(!crste.h.fc || crste.h.tt > TABLE_TYPE_REGION3))
return -1;
if (is_pmd(crste))
return large_pmd_to_phys(crste.pmd, gfn);
return large_pud_to_phys(crste.pud, gfn);
}
static inline bool cspg_crste(union crste *crstep, union crste old, union crste new)
{
return cspg(&crstep->val, old.val, new.val);
}
static inline struct page_table *dereference_pmd(union pmd pmd)
{
return phys_to_virt(crste_origin(pmd));
}
static inline struct segment_table *dereference_pud(union pud pud)
{
return phys_to_virt(crste_origin(pud));
}
static inline struct region3_table *dereference_p4d(union p4d p4d)
{
return phys_to_virt(crste_origin(p4d));
}
static inline struct region2_table *dereference_pgd(union pgd pgd)
{
return phys_to_virt(crste_origin(pgd));
}
static inline struct crst_table *_dereference_crste(union crste crste)
{
if (unlikely(is_pmd(crste)))
return NULL;
return phys_to_virt(crste_origin(crste.pud));
}
#define dereference_crste(x) (_Generic((x), \
union pud : _dereference_crste(_CRSTE(x)), \
union p4d : _dereference_crste(_CRSTE(x)), \
union pgd : _dereference_crste(_CRSTE(x)), \
union crste : _dereference_crste(_CRSTE(x))))
static inline struct crst_table *dereference_asce(union asce asce)
{
return phys_to_virt(asce.val & _ASCE_ORIGIN);
}
static inline void asce_flush_tlb(union asce asce)
{
__tlb_flush_idte(asce.val);
}
/*
 * pgste_get_trylock() - try to take the PCL lock of the pgste belonging
 * to @ptep.
 * @ptep: the pte whose pgste is to be locked.
 * @res:  on success, receives the pgste value with the PCL bit set.
 *
 * Return: true if the lock was taken, false if it was already held.
 */
static inline bool pgste_get_trylock(union pte *ptep, union pgste *res)
{
	union pgste *pgstep = pgste_of(ptep);
	union pgste old_pgste;
	/* Cheap non-atomic check first to avoid a wasted atomic RMW. */
	if (READ_ONCE(pgstep->val) & PGSTE_PCL_BIT)
		return false;
	old_pgste.val = __atomic64_or_barrier(PGSTE_PCL_BIT, &pgstep->val);
	/* Someone else won the race for the lock bit. */
	if (old_pgste.pcl)
		return false;
	/* Report the in-memory state, i.e. with the lock bit set. */
	old_pgste.pcl = 1;
	*res = old_pgste;
	return true;
}
static inline union pgste pgste_get_lock(union pte *ptep)
{
union pgste res;
while (!pgste_get_trylock(ptep, &res))
cpu_relax();
return res;
}
/*
 * pgste_set_unlock() - write back @pgste and release the PCL lock.
 * @ptep:  the pte whose pgste is to be unlocked.
 * @pgste: the pgste value to store (its PCL bit is cleared here).
 */
static inline void pgste_set_unlock(union pte *ptep, union pgste pgste)
{
	pgste.pcl = 0;
	/* Keep the critical-section stores before the releasing store. */
	barrier();
	WRITE_ONCE(*pgste_of(ptep), pgste);
}
static inline void dat_ptep_xchg(union pte *ptep, union pte new, gfn_t gfn, union asce asce,
bool has_skeys)
{
union pgste pgste;
pgste = pgste_get_lock(ptep);
pgste = __dat_ptep_xchg(ptep, pgste, new, gfn, asce, has_skeys);
pgste_set_unlock(ptep, pgste);
}
static inline void dat_ptep_clear(union pte *ptep, gfn_t gfn, union asce asce, bool has_skeys)
{
dat_ptep_xchg(ptep, _PTE_EMPTY, gfn, asce, has_skeys);
}
static inline void dat_free_pt(struct page_table *pt)
{
free_page((unsigned long)pt);
}
static inline void _dat_free_crst(struct crst_table *table)
{
free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
#define dat_free_crst(x) _dat_free_crst(_CRSTP(x))
/* Free all cached tables and rmaps of @mc, then @mc itself. NULL is a no-op. */
static inline void kvm_s390_free_mmu_cache(struct kvm_s390_mmu_cache *mc)
{
	int i;

	if (!mc)
		return;
	for (i = 0; i < mc->n_pts; i++)
		dat_free_pt(mc->pts[i]);
	for (i = 0; i < mc->n_crsts; i++)
		_dat_free_crst(mc->crsts[i]);
	for (i = 0; i < mc->n_rmaps; i++)
		kfree(mc->rmaps[i]);
	kfree(mc);
}
DEFINE_FREE(kvm_s390_mmu_cache, struct kvm_s390_mmu_cache *, if (_T) kvm_s390_free_mmu_cache(_T))
/*
 * kvm_s390_new_mmu_cache() - allocate and fill a new mmu cache.
 *
 * Return: a pointer to the new, topped-up mmu cache, or NULL on
 *         allocation failure.
 */
static inline struct kvm_s390_mmu_cache *kvm_s390_new_mmu_cache(void)
{
	/* __free() auto-frees the cache unless ownership is transferred. */
	struct kvm_s390_mmu_cache *mc __free(kvm_s390_mmu_cache) = NULL;
	mc = kzalloc(sizeof(*mc), GFP_KERNEL_ACCOUNT);
	if (mc && !kvm_s390_mmu_cache_topup(mc))
		return_ptr(mc); /* success: hand the cache to the caller */
	return NULL;
}
static inline bool dat_pmdp_xchg_atomic(union pmd *pmdp, union pmd old, union pmd new,
gfn_t gfn, union asce asce)
{
return dat_crstep_xchg_atomic(_CRSTEP(pmdp), _CRSTE(old), _CRSTE(new), gfn, asce);
}
static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pud new,
gfn_t gfn, union asce asce)
{
return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
}
/*
 * dat_crstep_clear() - replace *@crstep with an empty entry of the same
 * table type, flushing as needed.
 * @crstep: the entry to clear.
 * @gfn:    a gfn mapped by the entry.
 * @asce:   the asce of the address space.
 */
static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
{
	/* Preserve the current table type in the empty replacement. */
	union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
	dat_crstep_xchg(crstep, newcrste, gfn, asce);
}
/* Return the walk level: the pte pseudo-level if @ptep is set, else the crste table type. */
static inline int get_level(union crste *crstep, union pte *ptep)
{
	if (ptep)
		return TABLE_TYPE_PAGE_TABLE;
	return crstep->h.tt;
}
static inline int dat_delete_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
unsigned long npages)
{
return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_PIC, PGM_ADDRESSING);
}
static inline int dat_create_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start,
unsigned long npages)
{
return dat_set_slot(mc, asce, start, start + npages, _DAT_TOKEN_NONE, 0);
}
static inline bool crste_is_ucas(union crste crste)
{
return is_pmd(crste) && crste.h.i && crste.h.fc0.tl == 1 && crste.h.fc == 0;
}
#endif /* __KVM_S390_DAT_H */

View file

@ -10,13 +10,13 @@
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/gmap.h>
#include <asm/gmap_helpers.h>
#include <asm/virtio-ccw.h>
#include "kvm-s390.h"
#include "trace.h"
#include "trace-s390.h"
#include "gaccess.h"
#include "gmap.h"
static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end)
{

148
arch/s390/kvm/faultin.c Normal file
View file

@ -0,0 +1,148 @@
// SPDX-License-Identifier: GPL-2.0
/*
* KVM guest fault handling.
*
* Copyright IBM Corp. 2025
* Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
*/
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
#include "gmap.h"
#include "trace.h"
#include "faultin.h"
bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
/**
 * kvm_s390_faultin_gfn() - handle a dat fault.
 * @vcpu: The vCPU whose gmap is to be fixed up, or NULL if operating on the VM.
 * @kvm: The VM whose gmap is to be fixed up, or NULL if operating on a vCPU.
 * @f: The guest fault that needs to be resolved.
 *
 * Return:
 * * 0 on success
 * * < 0 in case of error
 * * > 0 in case of guest exceptions
 *
 * Context:
 * * The mm lock must not be held before calling
 * * kvm->srcu must be held
 * * may sleep
 */
int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f)
{
	struct kvm_s390_mmu_cache *local_mc __free(kvm_s390_mmu_cache) = NULL;
	struct kvm_s390_mmu_cache *mc = NULL;
	struct kvm_memory_slot *slot;
	unsigned long inv_seq;
	int foll, rc = 0;

	foll = f->write_attempt ? FOLL_WRITE : 0;
	foll |= f->attempt_pfault ? FOLL_NOWAIT : 0;
	/* When called for a vCPU, operate on its VM and use its mmu cache. */
	if (vcpu) {
		kvm = vcpu->kvm;
		mc = vcpu->arch.mc;
	}
	lockdep_assert_held(&kvm->srcu);

	/* Fast path: try to resolve the fault without faulting in a page. */
	scoped_guard(read_lock, &kvm->mmu_lock) {
		if (gmap_try_fixup_minor(kvm->arch.gmap, f) == 0)
			return 0;
	}
	while (1) {
		f->valid = false;
		/* Snapshot the invalidation sequence before faulting in. */
		inv_seq = kvm->mmu_invalidate_seq;
		/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
		smp_rmb();
		if (vcpu)
			slot = kvm_vcpu_gfn_to_memslot(vcpu, f->gfn);
		else
			slot = gfn_to_memslot(kvm, f->gfn);
		f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
		/* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT). */
		if (f->pfn == KVM_PFN_ERR_NEEDS_IO) {
			if (unlikely(!f->attempt_pfault))
				return -EAGAIN;
			/* Async pfaults need a vCPU to deliver the notification. */
			if (unlikely(!vcpu))
				return -EINVAL;
			trace_kvm_s390_major_guest_pfault(vcpu);
			if (kvm_arch_setup_async_pf(vcpu))
				return 0;
			vcpu->stat.pfault_sync++;
			/* Could not setup async pfault, try again synchronously. */
			foll &= ~FOLL_NOWAIT;
			f->pfn = __kvm_faultin_pfn(slot, f->gfn, foll, &f->writable, &f->page);
		}
		/* Access outside memory, addressing exception. */
		if (is_noslot_pfn(f->pfn))
			return PGM_ADDRESSING;
		/* Signal pending: try again. */
		if (f->pfn == KVM_PFN_ERR_SIGPENDING)
			return -EAGAIN;
		/* Check if it's read-only memory; don't try to actually handle that case. */
		if (f->pfn == KVM_PFN_ERR_RO_FAULT)
			return -EOPNOTSUPP;
		/* Any other error. */
		if (is_error_pfn(f->pfn))
			return -EFAULT;
		/* Allocate a local mmu cache if the caller did not provide one. */
		if (!mc) {
			local_mc = kvm_s390_new_mmu_cache();
			if (!local_mc)
				return -ENOMEM;
			mc = local_mc;
		}
		/* Loop, will automatically release the faulted page. */
		if (mmu_invalidate_retry_gfn_unsafe(kvm, inv_seq, f->gfn)) {
			kvm_release_faultin_page(kvm, f->page, true, false);
			continue;
		}
		scoped_guard(read_lock, &kvm->mmu_lock) {
			/* Re-check the sequence under the lock before linking. */
			if (!mmu_invalidate_retry_gfn(kvm, inv_seq, f->gfn)) {
				f->valid = true;
				rc = gmap_link(mc, kvm->arch.gmap, f);
				kvm_release_faultin_page(kvm, f->page, !!rc, f->write_attempt);
				f->page = NULL;
			}
		}
		/* f->page was cleared above if it was consumed under the lock. */
		kvm_release_faultin_page(kvm, f->page, true, false);
		if (rc == -ENOMEM) {
			/* gmap_link() ran out of cached pages: top up and retry. */
			rc = kvm_s390_mmu_cache_topup(mc);
			if (rc)
				return rc;
		} else if (rc != -EAGAIN) {
			return rc;
		}
	}
}
/*
 * Fault in a single guest page, without linking it into the gmap.
 *
 * On success the fault structure is marked valid and holds the pfn and
 * page reference. Returns PGM_ADDRESSING for accesses outside memory,
 * -EINTR on pending signal, -EAGAIN if I/O would be needed, -EFAULT on
 * any other error.
 */
int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w)
{
	struct kvm_memory_slot *memslot;
	int flags;

	flags = w ? FOLL_WRITE : 0;
	memslot = gfn_to_memslot(kvm, gfn);

	f->write_attempt = w;
	f->gfn = gfn;
	f->pfn = __kvm_faultin_pfn(memslot, gfn, flags, &f->writable, &f->page);

	if (is_noslot_pfn(f->pfn))
		return PGM_ADDRESSING;
	if (is_sigpending_pfn(f->pfn))
		return -EINTR;
	if (f->pfn == KVM_PFN_ERR_NEEDS_IO)
		return -EAGAIN;
	if (is_error_pfn(f->pfn))
		return -EFAULT;

	f->valid = true;
	return 0;
}

92
arch/s390/kvm/faultin.h Normal file
View file

@ -0,0 +1,92 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* KVM guest fault handling.
*
* Copyright IBM Corp. 2025
* Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
*/
#ifndef __KVM_S390_FAULTIN_H
#define __KVM_S390_FAULTIN_H
#include <linux/kvm_host.h>
#include "dat.h"
int kvm_s390_faultin_gfn(struct kvm_vcpu *vcpu, struct kvm *kvm, struct guest_fault *f);
int kvm_s390_get_guest_page(struct kvm *kvm, struct guest_fault *f, gfn_t gfn, bool w);
/*
 * Resolve a fault for a single gfn, discarding the fault details.
 *
 * Convenience wrapper around kvm_s390_faultin_gfn() for callers that
 * only care about the return code.
 */
static inline int kvm_s390_faultin_gfn_simple(struct kvm_vcpu *vcpu, struct kvm *kvm,
					      gfn_t gfn, bool wr)
{
	struct guest_fault fault = {
		.gfn = gfn,
		.write_attempt = wr,
	};

	return kvm_s390_faultin_gfn(vcpu, kvm, &fault);
}
/*
 * Fault in the page backing @gaddr (read-only) and read one
 * unsigned long from it into @val.
 *
 * On success the page reference is kept in @f; the caller is
 * responsible for releasing it.
 */
static inline int kvm_s390_get_guest_page_and_read_gpa(struct kvm *kvm, struct guest_fault *f,
						       gpa_t gaddr, unsigned long *val)
{
	unsigned long phys;
	int rc;

	rc = kvm_s390_get_guest_page(kvm, f, gpa_to_gfn(gaddr), false);
	if (rc)
		return rc;

	phys = pfn_to_phys(f->pfn) | offset_in_page(gaddr);
	*val = *(unsigned long *)phys_to_virt(phys);
	return 0;
}
/*
 * Release the page references held by an array of guest faults.
 *
 * Each entry's page pointer is cleared after release, so the function
 * is safe to call again on the same array.
 */
static inline void kvm_s390_release_multiple(struct kvm *kvm, struct guest_fault *guest_faults,
					     int n, bool ignore)
{
	int idx;

	for (idx = 0; idx < n; idx++) {
		struct guest_fault *cur = guest_faults + idx;

		kvm_release_faultin_page(kvm, cur->page, ignore, cur->write_attempt);
		cur->page = NULL;
	}
}
/*
 * Check whether any valid fault in the array raced with an mmu
 * invalidation and needs to be retried.
 *
 * With @unsafe true the lock-free sequence check is used, otherwise the
 * variant that must run under the mmu lock.
 */
static inline bool kvm_s390_multiple_faults_need_retry(struct kvm *kvm, unsigned long seq,
						       struct guest_fault *guest_faults, int n,
						       bool unsafe)
{
	int i;

	for (i = 0; i < n; i++) {
		bool retry;

		if (!guest_faults[i].valid)
			continue;
		if (unsafe)
			retry = mmu_invalidate_retry_gfn_unsafe(kvm, seq, guest_faults[i].gfn);
		else
			retry = mmu_invalidate_retry_gfn(kvm, seq, guest_faults[i].gfn);
		if (retry)
			return true;
	}
	return false;
}
/*
 * Fault in @n_pages consecutive guest pages starting at gfn @start.
 *
 * Stops at the first failure and returns its return code; entries
 * faulted in so far keep their page references. Returns 0 on success,
 * including the degenerate case of n_pages <= 0.
 */
static inline int kvm_s390_get_guest_pages(struct kvm *kvm, struct guest_fault *guest_faults,
					   gfn_t start, int n_pages, bool write_attempt)
{
	/*
	 * rc must be initialized: with n_pages <= 0 the loop body never
	 * runs, and returning an uninitialized rc is undefined behavior.
	 */
	int i, rc = 0;

	for (i = 0; i < n_pages; i++) {
		rc = kvm_s390_get_guest_page(kvm, guest_faults + i, start + i, write_attempt);
		if (rc)
			break;
	}
	return rc;
}
/* Convenience wrappers for fixed-size arrays of struct guest_fault. */
#define kvm_s390_release_faultin_array(kvm, array, ignore) \
	kvm_s390_release_multiple(kvm, array, ARRAY_SIZE(array), ignore)
#define kvm_s390_array_needs_retry_unsafe(kvm, seq, array) \
	kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), true)
#define kvm_s390_array_needs_retry_safe(kvm, seq, array) \
	kvm_s390_multiple_faults_need_retry(kvm, seq, array, ARRAY_SIZE(array), false)
#endif /* __KVM_S390_FAULTIN_H */

File diff suppressed because it is too large Load diff

View file

@ -206,8 +206,8 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
__uint128_t new, u8 access_key, bool *success);
int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old,
union kvm_s390_quad new, u8 access_key, bool *success);
/**
* write_guest_with_key - copy data from kernel space to guest space
@ -450,11 +450,17 @@ void ipte_unlock(struct kvm *kvm);
int ipte_lock_held(struct kvm *kvm);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
/* MVPG PEI indication bits */
#define PEI_DAT_PROT 2
#define PEI_NOT_PTE 4
/*
 * Decoded MVPG partial-execution interception (PEI) indication.
 * addr carries the address portion; the single-bit flags report the
 * not-a-pte, DAT-protection and real-space conditions.
 * NOTE(review): exact bit positions depend on the target's bitfield
 * layout - confirm against the architected PEI format before relying
 * on them.
 */
union mvpg_pei {
	unsigned long val;
	struct {
		unsigned long addr : 61;
		unsigned long not_pte : 1;
		unsigned long dat_prot: 1;
		unsigned long real : 1;
	};
};
int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
unsigned long saddr, unsigned long *datptr);
int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr,
union mvpg_pei *datptr, bool wr);
#endif /* __KVM_S390_GACCESS_H */

View file

@ -1,141 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Guest memory management for KVM/s390 nested VMs.
*
* Copyright IBM Corp. 2008, 2020, 2024
*
* Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com>
* Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
*/
#include <linux/compiler.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/pgtable.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <asm/lowcore.h>
#include <asm/gmap.h>
#include <asm/uv.h>
#include "kvm-s390.h"
/**
 * gmap_find_shadow - find a specific asce in the list of shadow tables
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * Returns the pointer to a gmap if a shadow table with the given asce is
 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
 * otherwise NULL
 *
 * Context: Called with parent->shadow_lock held
 */
static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level)
{
	struct gmap *sg;

	lockdep_assert_held(&parent->shadow_lock);
	list_for_each_entry(sg, &parent->children, list) {
		if (gmap_shadow_valid(sg, asce, edat_level)) {
			/* Still being set up by another CPU: caller must retry. */
			if (!sg->initialized)
				return ERR_PTR(-EAGAIN);
			/* Found: hand out a new reference. */
			refcount_inc(&sg->ref_count);
			return sg;
		}
	}
	return NULL;
}
/**
 * gmap_shadow - create/find a shadow guest address space
 * @parent: pointer to the parent gmap
 * @asce: ASCE for which the shadow table is created
 * @edat_level: edat level to be used for the shadow translation
 *
 * The pages of the top level page table referred by the asce parameter
 * will be set to read-only and marked in the PGSTEs of the kvm process.
 * The shadow table will be removed automatically on any change to the
 * PTE mapping for the source table.
 *
 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
 * parent gmap table could not be protected.
 */
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level)
{
	struct gmap *sg, *new;
	unsigned long limit;
	int rc;

	/* Shadowing 1M-hugepage guests or shadows of shadows is not supported. */
	if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) ||
	    KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private))
		return ERR_PTR(-EFAULT);
	/* Fast path: a matching shadow may already exist. */
	spin_lock(&parent->shadow_lock);
	sg = gmap_find_shadow(parent, asce, edat_level);
	spin_unlock(&parent->shadow_lock);
	if (sg)
		return sg;
	/* Create a new shadow gmap */
	limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
	if (asce & _ASCE_REAL_SPACE)
		limit = -1UL;
	new = gmap_alloc(limit);
	if (!new)
		return ERR_PTR(-ENOMEM);
	new->mm = parent->mm;
	new->parent = gmap_get(parent);
	new->private = parent->private;
	new->orig_asce = asce;
	new->edat_level = edat_level;
	/* Not visible as usable until protection below has succeeded. */
	new->initialized = false;
	spin_lock(&parent->shadow_lock);
	/* Recheck if another CPU created the same shadow */
	sg = gmap_find_shadow(parent, asce, edat_level);
	if (sg) {
		spin_unlock(&parent->shadow_lock);
		gmap_free(new);
		return sg;
	}
	if (asce & _ASCE_REAL_SPACE) {
		/* only allow one real-space gmap shadow */
		list_for_each_entry(sg, &parent->children, list) {
			if (sg->orig_asce & _ASCE_REAL_SPACE) {
				spin_lock(&sg->guest_table_lock);
				gmap_unshadow(sg);
				spin_unlock(&sg->guest_table_lock);
				list_del(&sg->list);
				gmap_put(sg);
				break;
			}
		}
	}
	/* Two references: presumably one for the list, one for the caller. */
	refcount_set(&new->ref_count, 2);
	list_add(&new->list, &parent->children);
	if (asce & _ASCE_REAL_SPACE) {
		/* nothing to protect, return right away */
		new->initialized = true;
		spin_unlock(&parent->shadow_lock);
		return new;
	}
	spin_unlock(&parent->shadow_lock);
	/* protect after insertion, so it will get properly invalidated */
	mmap_read_lock(parent->mm);
	rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN,
				      ((asce & _ASCE_TABLE_LENGTH) + 1),
				      PROT_READ, GMAP_NOTIFY_SHADOW);
	mmap_read_unlock(parent->mm);
	spin_lock(&parent->shadow_lock);
	/* Mark initialized even on failure so waiters stop seeing -EAGAIN. */
	new->initialized = true;
	if (rc) {
		list_del(&new->list);
		gmap_free(new);
		new = ERR_PTR(rc);
	}
	spin_unlock(&parent->shadow_lock);
	return new;
}

1244
arch/s390/kvm/gmap.c Normal file

File diff suppressed because it is too large Load diff

244
arch/s390/kvm/gmap.h Normal file
View file

@ -0,0 +1,244 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* KVM guest address space mapping code
*
* Copyright IBM Corp. 2007, 2016, 2025
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* Claudio Imbrenda <imbrenda@linux.ibm.com>
*/
#ifndef ARCH_KVM_S390_GMAP_H
#define ARCH_KVM_S390_GMAP_H
#include "dat.h"
/**
* enum gmap_flags - Flags of a gmap.
*
* @GMAP_FLAG_SHADOW: The gmap is a vsie shadow gmap.
* @GMAP_FLAG_OWNS_PAGETABLES: The gmap owns all dat levels; normally 1, is 0
* only for ucontrol per-cpu gmaps, since they
* share the page tables with the main gmap.
* @GMAP_FLAG_IS_UCONTROL: The gmap is ucontrol (main gmap or per-cpu gmap).
* @GMAP_FLAG_ALLOW_HPAGE_1M: 1M hugepages are allowed for this gmap,
* independently of the page size used by userspace.
* @GMAP_FLAG_ALLOW_HPAGE_2G: 2G hugepages are allowed for this gmap,
* independently of the page size used by userspace.
* @GMAP_FLAG_PFAULT_ENABLED: Pfault is enabled for the gmap.
* @GMAP_FLAG_USES_SKEYS: If the guest uses storage keys.
* @GMAP_FLAG_USES_CMM: Whether the guest uses CMMA.
* @GMAP_FLAG_EXPORT_ON_UNMAP: Whether to export guest pages when unmapping.
*/
enum gmap_flags {
GMAP_FLAG_SHADOW = 0,
GMAP_FLAG_OWNS_PAGETABLES,
GMAP_FLAG_IS_UCONTROL,
GMAP_FLAG_ALLOW_HPAGE_1M,
GMAP_FLAG_ALLOW_HPAGE_2G,
GMAP_FLAG_PFAULT_ENABLED,
GMAP_FLAG_USES_SKEYS,
GMAP_FLAG_USES_CMM,
GMAP_FLAG_EXPORT_ON_UNMAP,
};
/**
 * struct gmap - Guest address space.
 *
 * @flags: GMAP_FLAG_* flags.
 * @edat_level: The edat level of this shadow gmap.
 * @kvm: The vm.
 * @asce: The ASCE used by this gmap.
 * @list: List head used in children gmaps for the children gmap list.
 * @children_lock: Protects children and scb_users.
 * @children: List of child gmaps of this gmap.
 * @scb_users: List of vsie_scb that use this shadow gmap.
 * @parent: Parent gmap of a child gmap.
 * @guest_asce: Original ASCE of this shadow gmap.
 * @host_to_rmap_lock: Protects host_to_rmap.
 * @host_to_rmap: Radix tree mapping host addresses to guest addresses.
 * @refcount: Reference count; the gmap is disposed when it drops to zero
 *            (see gmap_put()).
 */
struct gmap {
	unsigned long flags;
	unsigned char edat_level;
	struct kvm *kvm;
	union asce asce;
	struct list_head list;
	spinlock_t children_lock; /* Protects: children, scb_users */
	struct list_head children;
	struct list_head scb_users;
	struct gmap *parent;
	union asce guest_asce;
	spinlock_t host_to_rmap_lock; /* Protects host_to_rmap */
	struct radix_tree_root host_to_rmap;
	refcount_t refcount;
};

/* List node carrying a pointer to a gmap. */
struct gmap_cache {
	struct list_head list;
	struct gmap *gmap;
};
#define gmap_for_each_rmap_safe(pos, n, head) \
for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
int s390_replace_asce(struct gmap *gmap);
bool _gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end, bool hint);
bool gmap_age_gfn(struct gmap *gmap, gfn_t start, gfn_t end);
bool gmap_unmap_gfn_range(struct gmap *gmap, struct kvm_memory_slot *slot, gfn_t start, gfn_t end);
int gmap_try_fixup_minor(struct gmap *gmap, struct guest_fault *fault);
struct gmap *gmap_new(struct kvm *kvm, gfn_t limit);
struct gmap *gmap_new_child(struct gmap *parent, gfn_t limit);
void gmap_remove_child(struct gmap *child);
void gmap_dispose(struct gmap *gmap);
int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *fault);
void gmap_sync_dirty_log(struct gmap *gmap, gfn_t start, gfn_t end);
int gmap_set_limit(struct gmap *gmap, gfn_t limit);
int gmap_ucas_translate(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, gpa_t *gaddr);
int gmap_ucas_map(struct gmap *gmap, gfn_t p_gfn, gfn_t c_gfn, unsigned long count);
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count);
int gmap_enable_skeys(struct gmap *gmap);
int gmap_pv_destroy_range(struct gmap *gmap, gfn_t start, gfn_t end, bool interruptible);
int gmap_insert_rmap(struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn, int level);
int gmap_protect_rmap(struct kvm_s390_mmu_cache *mc, struct gmap *sg, gfn_t p_gfn, gfn_t r_gfn,
kvm_pfn_t pfn, int level, bool wr);
void gmap_set_cmma_all_dirty(struct gmap *gmap);
void _gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn);
struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
union asce asce, int edat_level);
void gmap_split_huge_pages(struct gmap *gmap);
/* The guest uses storage keys. */
static inline bool uses_skeys(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_USES_SKEYS, &gmap->flags);
}

/* The guest uses CMM(A). */
static inline bool uses_cmm(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_USES_CMM, &gmap->flags);
}

/* Pfault is enabled for this gmap. */
static inline bool pfault_enabled(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_PFAULT_ENABLED, &gmap->flags);
}

/* The gmap is a ucontrol gmap (main or per-cpu). */
static inline bool is_ucontrol(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_IS_UCONTROL, &gmap->flags);
}

/* The gmap is a vsie shadow gmap. */
static inline bool is_shadow(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_SHADOW, &gmap->flags);
}

/* The gmap owns its dat tables (false for per-cpu ucontrol gmaps). */
static inline bool owns_page_tables(struct gmap *gmap)
{
	return test_bit(GMAP_FLAG_OWNS_PAGETABLES, &gmap->flags);
}
/*
 * Drop a reference to a gmap; dispose of it when the last one is gone.
 *
 * Always returns NULL, so callers can clear their pointer in the same
 * statement: gmap = gmap_put(gmap);
 */
static inline struct gmap *gmap_put(struct gmap *gmap)
{
	if (refcount_dec_and_test(&gmap->refcount))
		gmap_dispose(gmap);
	return NULL;
}
/*
 * Take an additional reference to a gmap.
 *
 * Warns once if the refcount was already zero, i.e. someone tried to
 * revive a gmap that is being disposed of.
 */
static inline void gmap_get(struct gmap *gmap)
{
	WARN_ON_ONCE(unlikely(!refcount_inc_not_zero(&gmap->refcount)));
}
/*
 * Handle a vsie unshadow event, taking @parent's children_lock around
 * the locked worker _gmap_handle_vsie_unshadow_event().
 */
static inline void gmap_handle_vsie_unshadow_event(struct gmap *parent, gfn_t gfn)
{
	scoped_guard(spinlock, &parent->children_lock)
		_gmap_handle_vsie_unshadow_event(parent, gfn);
}
/* Wrapper around _gmap_unmap_prefix() with hint == true (age/mkold). */
static inline bool gmap_mkold_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
{
	return _gmap_unmap_prefix(gmap, gfn, end, true);
}

/* Wrapper around _gmap_unmap_prefix() with hint == false (real unmap). */
static inline bool gmap_unmap_prefix(struct gmap *gmap, gfn_t gfn, gfn_t end)
{
	return _gmap_unmap_prefix(gmap, gfn, end, false);
}
/*
 * Exchange a pte in the gmap, handling prefix and vsie notification bits.
 *
 * If the pgste carries the prefix-notification bit and the new pte is
 * protected or invalid, the prefix is unmapped and the bit cleared.
 * If the pgste carries the vsie-notification bit and the protection
 * changes (or the new pte is invalid), the vsie unshadow event is
 * raised and the bit cleared.
 *
 * @needs_lock: true if gmap->children_lock still needs to be taken by
 * the unshadow path, false if the caller already holds it (enforced
 * via lockdep). The mmu lock must be held in either case.
 */
static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
					  union pgste pgste, gfn_t gfn, bool needs_lock)
{
	lockdep_assert_held(&gmap->kvm->mmu_lock);
	if (!needs_lock)
		lockdep_assert_held(&gmap->children_lock);
	else
		lockdep_assert_not_held(&gmap->children_lock);
	if (pgste.prefix_notif && (newpte.h.p || newpte.h.i)) {
		pgste.prefix_notif = 0;
		gmap_unmap_prefix(gmap, gfn, gfn + 1);
	}
	if (pgste.vsie_notif && (ptep->h.p != newpte.h.p || newpte.h.i)) {
		pgste.vsie_notif = 0;
		if (needs_lock)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		else
			_gmap_handle_vsie_unshadow_event(gmap, gfn);
	}
	return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
}
/* Like _gmap_ptep_xchg(), for callers not holding gmap->children_lock. */
static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, union pte newpte,
					 union pgste pgste, gfn_t gfn)
{
	return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}
/*
 * Exchange a crste in the gmap, handling prefix and vsie notification
 * bits analogously to _gmap_ptep_xchg().
 *
 * NOTE(review): "align" is computed as 8 for a pmd and 19 otherwise,
 * which looks like a shift amount (2^8 = 256 pages = one segment), yet
 * it is used directly as the ALIGN_DOWN() alignment and as the range
 * length for gmap_unmap_prefix() - verify this is intended and not a
 * missing "1UL << align".
 *
 * @needs_lock: true if gmap->children_lock still needs to be taken by
 * the unshadow path, false if the caller already holds it.
 */
static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
				     gfn_t gfn, bool needs_lock)
{
	unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);

	lockdep_assert_held(&gmap->kvm->mmu_lock);
	if (!needs_lock)
		lockdep_assert_held(&gmap->children_lock);
	gfn = ALIGN_DOWN(gfn, align);
	/* Prefix notification: unmap the prefix if it becomes unusable. */
	if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
		ne.s.fc1.prefix_notif = 0;
		gmap_unmap_prefix(gmap, gfn, gfn + align);
	}
	/* Vsie notification: raise the unshadow event if it gets dropped. */
	if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
	    (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
		ne.s.fc1.vsie_notif = 0;
		if (needs_lock)
			gmap_handle_vsie_unshadow_event(gmap, gfn);
		else
			_gmap_handle_vsie_unshadow_event(gmap, gfn);
	}
	dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
}
/*
 * Like _gmap_crstep_xchg(), for callers not holding gmap->children_lock.
 *
 * Note: the previous "return _gmap_crstep_xchg(...)" returned a void
 * expression from a void function, which is an ISO C constraint
 * violation (C11 6.8.6.4); call it as a plain statement instead.
 */
static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
				    gfn_t gfn)
{
	_gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
}
/**
* gmap_is_shadow_valid() - check if a shadow guest address space matches the
* given properties and is still valid.
* @sg: Pointer to the shadow guest address space structure.
* @asce: ASCE for which the shadow table is requested.
* @edat_level: Edat level to be used for the shadow translation.
*
* Return: true if the gmap shadow is still valid and matches the given
* properties and the caller can continue using it; false otherwise, the
* caller has to request a new shadow gmap in this case.
*/
static inline bool gmap_is_shadow_valid(struct gmap *sg, union asce asce, int edat_level)
{
return sg->guest_asce.val == asce.val && sg->edat_level == edat_level;
}
#endif /* ARCH_KVM_S390_GMAP_H */

View file

@ -21,6 +21,7 @@
#include "gaccess.h"
#include "trace.h"
#include "trace-s390.h"
#include "faultin.h"
u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
{
@ -367,8 +368,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg2, &srcaddr, GACC_FETCH, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0);
if (rc != 0)
do {
rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false);
} while (rc == -EAGAIN);
if (rc)
return rc;
/* Ensure that the source is paged-in, no actual access -> no key checking */
@ -376,8 +380,11 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
reg1, &dstaddr, GACC_STORE, 0);
if (rc)
return kvm_s390_inject_prog_cond(vcpu, rc);
rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE);
if (rc != 0)
do {
rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true);
} while (rc == -EAGAIN);
if (rc)
return rc;
kvm_s390_retry_instr(vcpu);

View file

@ -26,7 +26,6 @@
#include <linux/uaccess.h>
#include <asm/sclp.h>
#include <asm/isc.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/airq.h>
#include <asm/tpi.h>
@ -34,6 +33,7 @@
#include "gaccess.h"
#include "trace-s390.h"
#include "pci.h"
#include "gmap.h"
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
@ -2632,12 +2632,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
case KVM_DEV_FLIC_APF_ENABLE:
if (kvm_is_ucontrol(dev->kvm))
return -EINVAL;
dev->kvm->arch.gmap->pfault_enabled = 1;
set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
break;
case KVM_DEV_FLIC_APF_DISABLE_WAIT:
if (kvm_is_ucontrol(dev->kvm))
return -EINVAL;
dev->kvm->arch.gmap->pfault_enabled = 0;
clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags);
/*
* Make sure no async faults are in transition when
* clearing the queues. So we don't need to worry
@ -2768,13 +2768,13 @@ static int adapter_indicators_set(struct kvm *kvm,
bit = get_ind_bit(adapter_int->ind_addr,
adapter_int->ind_offset, adapter->swap);
set_bit(bit, map);
mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT);
mark_page_dirty(kvm, adapter_int->ind_gaddr >> PAGE_SHIFT);
set_page_dirty_lock(ind_page);
map = page_address(summary_page);
bit = get_ind_bit(adapter_int->summary_addr,
adapter_int->summary_offset, adapter->swap);
summary_set = test_and_set_bit(bit, map);
mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT);
mark_page_dirty(kvm, adapter_int->summary_gaddr >> PAGE_SHIFT);
set_page_dirty_lock(summary_page);
srcu_read_unlock(&kvm->srcu, idx);
@ -2870,7 +2870,9 @@ int kvm_set_routing_entry(struct kvm *kvm,
if (kvm_is_error_hva(uaddr_s) || kvm_is_error_hva(uaddr_i))
return -EFAULT;
e->adapter.summary_addr = uaddr_s;
e->adapter.summary_gaddr = ue->u.adapter.summary_addr;
e->adapter.ind_addr = uaddr_i;
e->adapter.ind_gaddr = ue->u.adapter.ind_addr;
e->adapter.summary_offset = ue->u.adapter.summary_offset;
e->adapter.ind_offset = ue->u.adapter.ind_offset;
e->adapter.adapter_id = ue->u.adapter.adapter_id;

File diff suppressed because it is too large Load diff

View file

@ -19,9 +19,19 @@
#include <asm/facility.h>
#include <asm/processor.h>
#include <asm/sclp.h>
#include "dat.h"
#include "gmap.h"
#define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
union kvm_s390_quad {
__uint128_t sixteen;
unsigned long eight;
unsigned int four;
unsigned short two;
unsigned char one;
};
static inline void kvm_s390_fpu_store(struct kvm_run *run)
{
fpu_stfpc(&run->s.regs.fpc);
@ -106,9 +116,7 @@ static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
static inline int kvm_is_ucontrol(struct kvm *kvm)
{
#ifdef CONFIG_KVM_S390_UCONTROL
if (kvm->arch.gmap)
return 0;
return 1;
return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags);
#else
return 0;
#endif
@ -432,14 +440,9 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu);
/* implemented in vsie.c */
int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end);
void kvm_s390_vsie_init(struct kvm *kvm);
void kvm_s390_vsie_destroy(struct kvm *kvm);
int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
/* implemented in gmap-vsie.c */
struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level);
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
@ -461,14 +464,10 @@ void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc);
int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags);
int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot,
unsigned long bits);
static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags)
{
return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags);
}
bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu);
/* implemented in diag.c */
int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);

View file

@ -21,13 +21,14 @@
#include <asm/ebcdic.h>
#include <asm/sysinfo.h>
#include <asm/page-states.h>
#include <asm/gmap.h>
#include <asm/ptrace.h>
#include <asm/sclp.h>
#include <asm/ap.h>
#include <asm/gmap_helpers.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
#include "gmap.h"
static int handle_ri(struct kvm_vcpu *vcpu)
{
@ -222,7 +223,7 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
if (vcpu->arch.skey_enabled)
return 0;
rc = s390_enable_skey();
rc = gmap_enable_skeys(vcpu->arch.gmap);
VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
if (rc)
return rc;
@ -255,10 +256,9 @@ static int try_handle_skey(struct kvm_vcpu *vcpu)
static int handle_iske(struct kvm_vcpu *vcpu)
{
unsigned long gaddr, vmaddr;
unsigned char key;
unsigned long gaddr;
int reg1, reg2;
bool unlocked;
union skey key;
int rc;
vcpu->stat.instruction_iske++;
@ -275,37 +275,21 @@ static int handle_iske(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
mmap_read_lock(current->mm);
rc = get_guest_storage_key(current->mm, vmaddr, &key);
if (rc) {
rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
mmap_read_unlock(current->mm);
goto retry;
}
}
mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key);
if (rc > 0)
return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
vcpu->run->s.regs.gprs[reg1] &= ~0xff;
vcpu->run->s.regs.gprs[reg1] |= key;
vcpu->run->s.regs.gprs[reg1] |= key.skey;
return 0;
}
static int handle_rrbe(struct kvm_vcpu *vcpu)
{
unsigned long vmaddr, gaddr;
unsigned long gaddr;
int reg1, reg2;
bool unlocked;
int rc;
vcpu->stat.instruction_rrbe++;
@ -322,24 +306,10 @@ static int handle_rrbe(struct kvm_vcpu *vcpu)
gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
gaddr = kvm_s390_logical_to_effective(vcpu, gaddr);
gaddr = kvm_s390_real_to_abs(vcpu, gaddr);
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
retry:
unlocked = false;
mmap_read_lock(current->mm);
rc = reset_guest_reference_bit(current->mm, vmaddr);
if (rc < 0) {
rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
if (!rc) {
mmap_read_unlock(current->mm);
goto retry;
}
}
mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr));
if (rc > 0)
return kvm_s390_inject_program_int(vcpu, rc);
if (rc < 0)
return rc;
kvm_s390_set_psw_cc(vcpu, rc);
@ -354,9 +324,8 @@ static int handle_sske(struct kvm_vcpu *vcpu)
{
unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
unsigned long start, end;
unsigned char key, oldkey;
union skey key, oldkey;
int reg1, reg2;
bool unlocked;
int rc;
vcpu->stat.instruction_sske++;
@ -377,7 +346,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
if (m3 & SSKE_MB) {
@ -389,27 +358,17 @@ static int handle_sske(struct kvm_vcpu *vcpu)
}
while (start != end) {
unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
unlocked = false;
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR,
m3 & SSKE_MC);
if (rc < 0) {
rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
gpa_to_gfn(start), key, &oldkey,
m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC);
}
mmap_read_unlock(current->mm);
if (rc == -EFAULT)
if (rc > 1)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc == -EAGAIN)
if (rc == -ENOMEM) {
kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
}
if (rc < 0)
return rc;
start += PAGE_SIZE;
@ -422,7 +381,7 @@ static int handle_sske(struct kvm_vcpu *vcpu)
} else {
kvm_s390_set_psw_cc(vcpu, rc);
vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8;
}
}
if (m3 & SSKE_MB) {
@ -1082,7 +1041,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
unsigned char key;
union skey key;
vcpu->stat.instruction_pfmf++;
@ -1110,7 +1069,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
@ -1141,14 +1100,6 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
while (start != end) {
unsigned long vmaddr;
bool unlocked = false;
/* Translate guest address to host address */
vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(vmaddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@ -1159,19 +1110,17 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
mmap_read_lock(current->mm);
rc = cond_set_guest_storage_key(current->mm, vmaddr,
key, NULL, nq, mr, mc);
if (rc < 0) {
rc = fixup_user_fault(current->mm, vmaddr,
FAULT_FLAG_WRITE, &unlocked);
rc = !rc ? -EAGAIN : rc;
scoped_guard(read_lock, &vcpu->kvm->mmu_lock) {
rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce,
gpa_to_gfn(start), key,
NULL, nq, mr, mc);
}
mmap_read_unlock(current->mm);
if (rc == -EFAULT)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
if (rc == -EAGAIN)
if (rc > 1)
return kvm_s390_inject_program_int(vcpu, rc);
if (rc == -ENOMEM) {
kvm_s390_mmu_cache_topup(vcpu->arch.mc);
continue;
}
if (rc < 0)
return rc;
}
@ -1195,8 +1144,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
{
int r1, r2, nappended, entries;
unsigned long gfn, hva, res, pgstev, ptev;
union essa_state state;
unsigned long *cbrlo;
unsigned long gfn;
bool dirtied;
/*
* We don't need to set SD.FPF.SK to 1 here, because if we have a
@ -1205,33 +1156,12 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
kvm_s390_get_regs_rre(vcpu, &r1, &r2);
gfn = vcpu->run->s.regs.gprs[r2] >> PAGE_SHIFT;
hva = gfn_to_hva(vcpu->kvm, gfn);
entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
if (kvm_is_error_hva(hva))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev);
if (nappended < 0) {
res = orc ? 0x10 : 0;
vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */
nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied);
vcpu->run->s.regs.gprs[r1] = state.val;
if (nappended < 0)
return 0;
}
res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22;
/*
* Set the block-content state part of the result. 0 means resident, so
* nothing to do if the page is valid. 2 is for preserved pages
* (non-present and non-zero), and 3 for zero pages (non-present and
* zero).
*/
if (ptev & _PAGE_INVALID) {
res |= 2;
if (pgstev & _PGSTE_GPS_ZERO)
res |= 1;
}
if (pgstev & _PGSTE_GPS_NODAT)
res |= 0x20;
vcpu->run->s.regs.gprs[r1] = res;
/*
* It is possible that all the normal 511 slots were full, in which case
* we will now write in the 512th slot, which is reserved for host use.
@ -1243,17 +1173,34 @@ static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
cbrlo[entries] = gfn << PAGE_SHIFT;
}
if (orc) {
struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
/* Increment only if we are really flipping the bit */
if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
}
if (dirtied)
atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
return nappended;
}
static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len)
{
union crste *crstep;
union pgste pgste;
union pte *ptep;
int i;
lockdep_assert_held(&vcpu->kvm->mmu_lock);
for (i = 0; i < len; i++) {
if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce,
0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep))
continue;
if (!ptep || ptep->s.pr)
continue;
pgste = pgste_get_lock(ptep);
if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero)
gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]);
pgste_set_unlock(ptep, pgste);
}
}
static int handle_essa(struct kvm_vcpu *vcpu)
{
lockdep_assert_held(&vcpu->kvm->srcu);
@ -1261,11 +1208,9 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* entries expected to be 1FF */
int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3;
unsigned long *cbrlo;
struct gmap *gmap;
int i, orc;
VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries);
gmap = vcpu->arch.gmap;
vcpu->stat.instruction_essa++;
if (!vcpu->kvm->arch.use_cmma)
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
@ -1289,11 +1234,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
* value really needs to be written to; if the value is
* already correct, we do nothing and avoid the lock.
*/
if (vcpu->kvm->mm->context.uses_cmm == 0) {
mmap_write_lock(vcpu->kvm->mm);
vcpu->kvm->mm->context.uses_cmm = 1;
mmap_write_unlock(vcpu->kvm->mm);
}
set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags);
/*
* If we are here, we are supposed to have CMMA enabled in
* the SIE block. Enabling CMMA works on a per-CPU basis,
@ -1307,20 +1248,22 @@ static int handle_essa(struct kvm_vcpu *vcpu)
/* Retry the ESSA instruction */
kvm_s390_retry_instr(vcpu);
} else {
mmap_read_lock(vcpu->kvm->mm);
i = __do_essa(vcpu, orc);
mmap_read_unlock(vcpu->kvm->mm);
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
i = __do_essa(vcpu, orc);
if (i < 0)
return i;
/* Account for the possible extra cbrl entry */
entries += i;
}
vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */
/* reset nceo */
vcpu->arch.sie_block->cbrlo &= PAGE_MASK;
cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
mmap_read_lock(gmap->mm);
for (i = 0; i < entries; ++i)
__gmap_zap(gmap, cbrlo[i]);
mmap_read_unlock(gmap->mm);
mmap_read_lock(vcpu->kvm->mm);
scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
_essa_clear_cbrl(vcpu, cbrlo, entries);
mmap_read_unlock(vcpu->kvm->mm);
return 0;
}

View file

@ -12,13 +12,16 @@
#include <linux/minmax.h>
#include <linux/pagemap.h>
#include <linux/sched/signal.h>
#include <asm/gmap.h>
#include <asm/uv.h>
#include <asm/mman.h>
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include "kvm-s390.h"
#include "dat.h"
#include "gaccess.h"
#include "gmap.h"
#include "faultin.h"
bool kvm_s390_pv_is_protected(struct kvm *kvm)
{
@ -34,6 +37,85 @@ bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
/**
* should_export_before_import() - Determine whether an export is needed
* before an import-like operation.
* @uvcb: The Ultravisor control block of the UVC to be performed.
* @mm: The mm of the process.
*
* Returns whether an export is needed before every import-like operation.
* This is needed for shared pages, which don't trigger a secure storage
* exception when accessed from a different guest.
*
* Although considered as one, the Unpin Page UVC is not an actual import,
* so it is not affected.
*
* No export is needed also when there is only one protected VM, because the
* page cannot belong to the wrong VM in that case (there is no "other VM"
* it can belong to).
*
* Return: %true if an export is needed before every import, otherwise %false.
*/
static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm)
{
/*
* The misc feature indicates, among other things, that importing a
* shared page from a different protected VM will automatically also
* transfer its ownership.
*/
if (uv_has_feature(BIT_UV_FEAT_MISC))
return false;
if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED)
return false;
return atomic_read(&mm->context.protected_count) > 1;
}
struct pv_make_secure {
void *uvcb;
struct folio *folio;
int rc;
bool needs_export;
};
static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio)
{
struct pv_make_secure *priv = f->priv;
int rc;
if (priv->needs_export)
uv_convert_from_secure(folio_to_phys(folio));
if (folio_test_hugetlb(folio))
return -EFAULT;
if (folio_test_large(folio))
return -E2BIG;
if (!f->page)
folio_get(folio);
rc = __make_folio_secure(folio, priv->uvcb);
if (!f->page)
folio_put(folio);
return rc;
}
static void _kvm_s390_pv_make_secure(struct guest_fault *f)
{
struct pv_make_secure *priv = f->priv;
struct folio *folio;
folio = pfn_folio(f->pfn);
priv->rc = -EAGAIN;
if (folio_trylock(folio)) {
priv->rc = __kvm_s390_pv_make_secure(f, folio);
if (priv->rc == -E2BIG || priv->rc == -EBUSY) {
priv->folio = folio;
folio_get(folio);
}
folio_unlock(folio);
}
}
/**
* kvm_s390_pv_make_secure() - make one guest page secure
* @kvm: the guest
@ -45,14 +127,34 @@ EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected);
*/
int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb)
{
unsigned long vmaddr;
struct pv_make_secure priv = { .uvcb = uvcb };
struct guest_fault f = {
.write_attempt = true,
.gfn = gpa_to_gfn(gaddr),
.callback = _kvm_s390_pv_make_secure,
.priv = &priv,
};
int rc;
lockdep_assert_held(&kvm->srcu);
vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr));
if (kvm_is_error_hva(vmaddr))
return -EFAULT;
return make_hva_secure(kvm->mm, vmaddr, uvcb);
priv.needs_export = should_export_before_import(uvcb, kvm->mm);
scoped_guard(mutex, &kvm->arch.pv.import_lock) {
rc = kvm_s390_faultin_gfn(NULL, kvm, &f);
if (!rc) {
rc = priv.rc;
if (priv.folio) {
rc = s390_wiggle_split_folio(kvm->mm, priv.folio);
if (!rc)
rc = -EAGAIN;
}
}
}
if (priv.folio)
folio_put(priv.folio);
return rc;
}
int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr)
@ -299,35 +401,6 @@ done_fast:
return 0;
}
/**
* kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory.
* @kvm: the VM whose memory is to be cleared.
*
* Destroy the first 2GB of guest memory, to avoid prefix issues after reboot.
* The CPUs of the protected VM need to be destroyed beforehand.
*/
static void kvm_s390_destroy_lower_2g(struct kvm *kvm)
{
const unsigned long pages_2g = SZ_2G / PAGE_SIZE;
struct kvm_memory_slot *slot;
unsigned long len;
int srcu_idx;
srcu_idx = srcu_read_lock(&kvm->srcu);
/* Take the memslot containing guest absolute address 0 */
slot = gfn_to_memslot(kvm, 0);
/* Clear all slots or parts thereof that are below 2GB */
while (slot && slot->base_gfn < pages_2g) {
len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE;
s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len);
/* Take the next memslot */
slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages);
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
{
struct uv_cb_destroy_fast uvcb = {
@ -342,7 +415,6 @@ static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc)
*rc = uvcb.header.rc;
if (rrc)
*rrc = uvcb.header.rrc;
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x",
uvcb.header.rc, uvcb.header.rrc);
WARN_ONCE(cc && uvcb.header.rc != 0x104,
@ -391,7 +463,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* Guest with segment type ASCE, refuse to destroy asynchronously */
if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT)
return -EINVAL;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
@ -404,8 +476,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
priv->stor_var = kvm->arch.pv.stor_var;
priv->stor_base = kvm->arch.pv.stor_base;
priv->handle = kvm_s390_pv_get_handle(kvm);
priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table;
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce);
if (s390_replace_asce(kvm->arch.gmap))
res = -ENOMEM;
}
@ -415,7 +486,7 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
return res;
}
kvm_s390_destroy_lower_2g(kvm);
gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false);
kvm_s390_clear_pv_state(kvm);
kvm->arch.pv.set_aside = priv;
@ -449,7 +520,6 @@ int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm),
UVC_CMD_DESTROY_SEC_CONF, rc, rrc);
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0);
if (!cc) {
atomic_dec(&kvm->mm->context.protected_count);
kvm_s390_pv_dealloc_vm(kvm);
@ -532,7 +602,7 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
* cleanup has been performed.
*/
if (need_zap && mmget_not_zero(kvm->mm)) {
s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE);
gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false);
mmput(kvm->mm);
}
@ -570,7 +640,7 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
return -EINVAL;
/* When a fatal signal is received, stop immediately */
if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true))
goto done;
if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
ret = -EIO;
@ -609,6 +679,7 @@ static void kvm_s390_pv_mmu_notifier_release(struct mmu_notifier *subscription,
r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy);
if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm))
kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy);
set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags);
}
static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = {
@ -642,7 +713,7 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
/* Inputs */
uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */
uvcb.guest_stor_len = kvm->arch.pv.guest_len;
uvcb.guest_asce = kvm->arch.gmap->asce;
uvcb.guest_asce = kvm->arch.gmap->asce.val;
uvcb.guest_sca = virt_to_phys(kvm->arch.sca);
uvcb.conf_base_stor_origin =
virt_to_phys((void *)kvm->arch.pv.stor_base);
@ -650,6 +721,9 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
uvcb.flags.ap_allow_instr = kvm->arch.model.uv_feat_guest.ap;
uvcb.flags.ap_instr_intr = kvm->arch.model.uv_feat_guest.ap_intr;
clear_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &kvm->arch.gmap->flags);
gmap_split_huge_pages(kvm->arch.gmap);
cc = uv_call_sched(0, (u64)&uvcb);
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
@ -669,7 +743,6 @@ int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
}
return -EIO;
}
kvm->arch.gmap->guest_handle = uvcb.guest_handle;
return 0;
}
@ -704,26 +777,14 @@ static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak,
.tweak[1] = offset,
};
int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb);
unsigned long vmaddr;
bool unlocked;
*rc = uvcb.header.rc;
*rrc = uvcb.header.rrc;
if (ret == -ENXIO) {
mmap_read_lock(kvm->mm);
vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr));
if (kvm_is_error_hva(vmaddr)) {
ret = -EFAULT;
} else {
ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked);
if (!ret)
ret = __gmap_link(kvm->arch.gmap, addr, vmaddr);
}
mmap_read_unlock(kvm->mm);
ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true);
if (!ret)
return -EAGAIN;
return ret;
}
if (ret && ret != -EAGAIN)

View file

@ -15,7 +15,6 @@
#include <linux/io.h>
#include <linux/mman.h>
#include <asm/gmap.h>
#include <asm/mmu_context.h>
#include <asm/sclp.h>
#include <asm/nmi.h>
@ -23,6 +22,7 @@
#include <asm/facility.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include "gmap.h"
enum vsie_page_flags {
VSIE_PAGE_IN_USE = 0,
@ -41,8 +41,11 @@ struct vsie_page {
* are reused conditionally, should be accessed via READ_ONCE.
*/
struct kvm_s390_sie_block *scb_o; /* 0x0218 */
/* the shadow gmap in use by the vsie_page */
struct gmap *gmap; /* 0x0220 */
/*
* Flags: must be set/cleared atomically after the vsie page can be
* looked up by other CPUs.
*/
unsigned long flags; /* 0x0220 */
/* address of the last reported fault to guest2 */
unsigned long fault_addr; /* 0x0228 */
/* calculated guest addresses of satellite control blocks */
@ -57,33 +60,14 @@ struct vsie_page {
* radix tree.
*/
gpa_t scb_gpa; /* 0x0258 */
/*
* Flags: must be set/cleared atomically after the vsie page can be
* looked up by other CPUs.
*/
unsigned long flags; /* 0x0260 */
__u8 reserved[0x0700 - 0x0268]; /* 0x0268 */
/* the shadow gmap in use by the vsie_page */
struct gmap_cache gmap_cache; /* 0x0260 */
__u8 reserved[0x0700 - 0x0278]; /* 0x0278 */
struct kvm_s390_crypto_cb crycb; /* 0x0700 */
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
/**
* gmap_shadow_valid() - check if a shadow guest address space matches the
* given properties and is still valid
* @sg: pointer to the shadow guest address space structure
* @asce: ASCE for which the shadow table is requested
* @edat_level: edat level to be used for the shadow translation
*
* Returns 1 if the gmap shadow is still valid and matches the given
* properties, the caller can continue using it. Returns 0 otherwise; the
* caller has to request a new shadow gmap in this case.
*/
int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
{
if (sg->removed)
return 0;
return sg->orig_asce == asce && sg->edat_level == edat_level;
}
static_assert(sizeof(struct vsie_page) == PAGE_SIZE);
/* trigger a validity icpt for the given scb */
static int set_validity_icpt(struct kvm_s390_sie_block *scb,
@ -612,26 +596,17 @@ out:
return rc;
}
void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end)
void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end)
{
struct kvm *kvm = gmap->private;
struct vsie_page *cur;
struct vsie_page *cur, *next;
unsigned long prefix;
int i;
if (!gmap_is_shadow(gmap))
return;
KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm);
/*
* Only new shadow blocks are added to the list during runtime,
* therefore we can safely reference them all the time.
*/
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
cur = READ_ONCE(kvm->arch.vsie.pages[i]);
if (!cur)
continue;
if (READ_ONCE(cur->gmap) != gmap)
continue;
list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) {
prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
/* with mso/msl, the prefix lies at an offset */
prefix += cur->scb_s.mso;
@ -652,7 +627,7 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
* - -EAGAIN if the caller can retry immediately
* - -ENOMEM if out of memory
*/
static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
@ -667,10 +642,9 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* with mso/msl, the prefix lies at offset *mso* */
prefix += scb_s->mso;
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix, NULL);
rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true);
if (!rc && (scb_s->ecb & ECB_TE))
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
prefix + PAGE_SIZE, NULL);
rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true);
/*
* We don't have to mprotect, we will be called for all unshadows.
* SIE will detect if protection applies and trigger a validity.
@ -951,8 +925,9 @@ static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
* - > 0 if control has to be given to guest 2
* - < 0 if an error occurred
*/
static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
bool wr = kvm_s390_cur_gmap_fault_is_write();
int rc;
if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION)
@ -960,12 +935,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return inject_fault(vcpu, PGM_PROTECTION,
current->thread.gmap_teid.addr * PAGE_SIZE, 1);
rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
current->thread.gmap_teid.addr * PAGE_SIZE, NULL);
rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr);
if (rc > 0) {
rc = inject_fault(vcpu, rc,
current->thread.gmap_teid.addr * PAGE_SIZE,
kvm_s390_cur_gmap_fault_is_write());
current->thread.gmap_teid.addr * PAGE_SIZE, wr);
if (rc >= 0)
vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE;
}
@ -978,12 +951,10 @@ static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
*
* Will ignore any errors. The next SIE fault will do proper fault handling.
*/
static void handle_last_fault(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
if (vsie_page->fault_addr)
kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
vsie_page->fault_addr, NULL);
gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true);
vsie_page->fault_addr = 0;
}
@ -1065,11 +1036,12 @@ static u64 vsie_get_register(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
}
}
static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
unsigned long pei_dest, pei_src, src, dest, mask, prefix;
unsigned long src, dest, mask, prefix;
u64 *pei_block = &vsie_page->scb_o->mcic;
union mvpg_pei pei_dest, pei_src;
int edat, rc_dest, rc_src;
union ctlreg0 cr0;
@ -1083,8 +1055,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask;
src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso;
rc_dest = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, dest, &pei_dest);
rc_src = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, src, &pei_src);
rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true);
rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false);
/*
* Either everything went well, or something non-critical went wrong
* e.g. because of a race. In either case, simply retry.
@ -1119,8 +1091,8 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
rc_src = rc_src != PGM_PAGE_TRANSLATION ? rc_src : 0;
}
if (!rc_dest && !rc_src) {
pei_block[0] = pei_dest;
pei_block[1] = pei_src;
pei_block[0] = pei_dest.val;
pei_block[1] = pei_src.val;
return 1;
}
@ -1144,7 +1116,7 @@ static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
* - > 0 if control has to be given to guest 2
* - < 0 if an error occurred
*/
static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg)
__releases(vcpu->kvm->srcu)
__acquires(vcpu->kvm->srcu)
{
@ -1153,7 +1125,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
int guest_bp_isolation;
int rc = 0;
handle_last_fault(vcpu, vsie_page);
handle_last_fault(vcpu, vsie_page, sg);
kvm_vcpu_srcu_read_unlock(vcpu);
@ -1191,7 +1163,7 @@ xfer_to_guest_mode_check:
goto xfer_to_guest_mode_check;
}
guest_timing_enter_irqoff();
rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, vsie_page->gmap->asce);
rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val);
guest_timing_exit_irqoff();
local_irq_enable();
}
@ -1215,7 +1187,7 @@ skip_sie:
if (rc > 0)
rc = 0; /* we could still have an icpt */
else if (current->thread.gmap_int_code)
return handle_fault(vcpu, vsie_page);
return handle_fault(vcpu, vsie_page, sg);
switch (scb_s->icptcode) {
case ICPT_INST:
@ -1233,7 +1205,7 @@ skip_sie:
break;
case ICPT_PARTEXEC:
if (scb_s->ipa == 0xb254)
rc = vsie_handle_mvpg(vcpu, vsie_page);
rc = vsie_handle_mvpg(vcpu, vsie_page, sg);
break;
}
return rc;
@ -1241,43 +1213,67 @@ skip_sie:
static void release_gmap_shadow(struct vsie_page *vsie_page)
{
if (vsie_page->gmap)
gmap_put(vsie_page->gmap);
WRITE_ONCE(vsie_page->gmap, NULL);
struct gmap *gmap = vsie_page->gmap_cache.gmap;
lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock);
list_del(&vsie_page->gmap_cache.list);
vsie_page->gmap_cache.gmap = NULL;
prefix_unmapped(vsie_page);
if (list_empty(&gmap->scb_users)) {
gmap_remove_child(gmap);
gmap_put(gmap);
}
}
static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
struct vsie_page *vsie_page)
static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
unsigned long asce;
union ctlreg0 cr0;
struct gmap *gmap;
union asce asce;
int edat;
asce = vcpu->arch.sie_block->gcr[1];
asce.val = vcpu->arch.sie_block->gcr[1];
cr0.val = vcpu->arch.sie_block->gcr[0];
edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
edat += edat && test_kvm_facility(vcpu->kvm, 78);
/*
* ASCE or EDAT could have changed since last icpt, or the gmap
* we're holding has been unshadowed. If the gmap is still valid,
* we can safely reuse it.
*/
if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) {
vcpu->kvm->stat.gmap_shadow_reuse++;
return 0;
scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
gmap = vsie_page->gmap_cache.gmap;
if (gmap) {
/*
* ASCE or EDAT could have changed since last icpt, or the gmap
* we're holding has been unshadowed. If the gmap is still valid,
* we can safely reuse it.
*/
if (gmap_is_shadow_valid(gmap, asce, edat)) {
vcpu->kvm->stat.gmap_shadow_reuse++;
gmap_get(gmap);
return gmap;
}
/* release the old shadow and mark the prefix as unmapped */
release_gmap_shadow(vsie_page);
}
}
/* release the old shadow - if any, and mark the prefix as unmapped */
release_gmap_shadow(vsie_page);
gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
again:
gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat);
if (IS_ERR(gmap))
return PTR_ERR(gmap);
vcpu->kvm->stat.gmap_shadow_create++;
WRITE_ONCE(vsie_page->gmap, gmap);
return 0;
return gmap;
scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) {
/* unlikely race condition, remove the previous shadow */
if (vsie_page->gmap_cache.gmap)
release_gmap_shadow(vsie_page);
if (!gmap->parent) {
gmap_put(gmap);
goto again;
}
vcpu->kvm->stat.gmap_shadow_create++;
list_add(&vsie_page->gmap_cache.list, &gmap->scb_users);
vsie_page->gmap_cache.gmap = gmap;
prefix_unmapped(vsie_page);
}
return gmap;
}
/*
@ -1330,15 +1326,20 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
struct gmap *sg;
int rc = 0;
while (1) {
rc = acquire_gmap_shadow(vcpu, vsie_page);
sg = acquire_gmap_shadow(vcpu, vsie_page);
if (IS_ERR(sg)) {
rc = PTR_ERR(sg);
sg = NULL;
}
if (!rc)
rc = map_prefix(vcpu, vsie_page);
rc = map_prefix(vcpu, vsie_page, sg);
if (!rc) {
update_intervention_requests(vsie_page);
rc = do_vsie_run(vcpu, vsie_page);
rc = do_vsie_run(vcpu, vsie_page, sg);
}
atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
@ -1361,6 +1362,9 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
kvm_s390_rewind_psw(vcpu, 4);
break;
}
if (sg)
sg = gmap_put(sg);
cond_resched();
}
if (rc == -EFAULT) {
@ -1457,8 +1461,7 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
vsie_page->scb_gpa = ULONG_MAX;
/* Double use of the same address or allocation failure. */
if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
vsie_page)) {
if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) {
put_vsie_page(vsie_page);
mutex_unlock(&kvm->arch.vsie.mutex);
return NULL;
@ -1467,7 +1470,12 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
mutex_unlock(&kvm->arch.vsie.mutex);
memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
release_gmap_shadow(vsie_page);
if (vsie_page->gmap_cache.gmap) {
scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
if (vsie_page->gmap_cache.gmap)
release_gmap_shadow(vsie_page);
}
prefix_unmapped(vsie_page);
vsie_page->fault_addr = 0;
vsie_page->scb_s.ihcpu = 0xffffU;
return vsie_page;
@ -1498,11 +1506,13 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
}
vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
if (IS_ERR(vsie_page))
if (IS_ERR(vsie_page)) {
return PTR_ERR(vsie_page);
else if (!vsie_page)
} else if (!vsie_page) {
/* double use of sie control block - simply do nothing */
kvm_s390_rewind_psw(vcpu, 4);
return 0;
}
rc = pin_scb(vcpu, vsie_page, scb_addr);
if (rc)
@ -1543,8 +1553,10 @@ void kvm_s390_vsie_destroy(struct kvm *kvm)
mutex_lock(&kvm->arch.vsie.mutex);
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
vsie_page = kvm->arch.vsie.pages[i];
scoped_guard(spinlock, &kvm->arch.gmap->children_lock)
if (vsie_page->gmap_cache.gmap)
release_gmap_shadow(vsie_page);
kvm->arch.vsie.pages[i] = NULL;
release_gmap_shadow(vsie_page);
/* free the radix tree entry */
if (vsie_page->scb_gpa != ULONG_MAX)
radix_tree_delete(&kvm->arch.vsie.addr_to_page,

View file

@ -34,136 +34,19 @@ void debug_user_asce(int exit)
}
#endif /*CONFIG_DEBUG_ENTRY */
union oac {
unsigned int val;
struct {
struct {
unsigned short key : 4;
unsigned short : 4;
unsigned short as : 2;
unsigned short : 4;
unsigned short k : 1;
unsigned short a : 1;
} oac1;
struct {
unsigned short key : 4;
unsigned short : 4;
unsigned short as : 2;
unsigned short : 4;
unsigned short k : 1;
unsigned short a : 1;
} oac2;
};
};
static uaccess_kmsan_or_inline __must_check unsigned long
raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key)
{
unsigned long osize;
union oac spec = {
.oac2.key = key,
.oac2.as = PSW_BITS_AS_SECONDARY,
.oac2.k = 1,
.oac2.a = 1,
};
int cc;
while (1) {
osize = size;
asm_inline volatile(
" lr %%r0,%[spec]\n"
"0: mvcos %[to],%[from],%[size]\n"
"1: nopr %%r7\n"
CC_IPM(cc)
EX_TABLE_UA_MVCOS_FROM(0b, 0b)
EX_TABLE_UA_MVCOS_FROM(1b, 0b)
: CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to)
: [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from)
: CC_CLOBBER_LIST("memory", "0"));
if (CC_TRANSFORM(cc) == 0)
return osize - size;
size -= 4096;
to += 4096;
from += 4096;
}
}
unsigned long _copy_from_user_key(void *to, const void __user *from,
unsigned long n, unsigned long key)
{
unsigned long res = n;
might_fault();
if (!should_fail_usercopy()) {
instrument_copy_from_user_before(to, from, n);
res = raw_copy_from_user_key(to, from, n, key);
instrument_copy_from_user_after(to, from, n, res);
}
if (unlikely(res))
memset(to + (n - res), 0, res);
return res;
}
EXPORT_SYMBOL(_copy_from_user_key);
static uaccess_kmsan_or_inline __must_check unsigned long
raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key)
{
unsigned long osize;
union oac spec = {
.oac1.key = key,
.oac1.as = PSW_BITS_AS_SECONDARY,
.oac1.k = 1,
.oac1.a = 1,
};
int cc;
while (1) {
osize = size;
asm_inline volatile(
" lr %%r0,%[spec]\n"
"0: mvcos %[to],%[from],%[size]\n"
"1: nopr %%r7\n"
CC_IPM(cc)
EX_TABLE_UA_MVCOS_TO(0b, 0b)
EX_TABLE_UA_MVCOS_TO(1b, 0b)
: CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to)
: [spec] "d" (spec.val), [from] "Q" (*(const char *)from)
: CC_CLOBBER_LIST("memory", "0"));
if (CC_TRANSFORM(cc) == 0)
return osize - size;
size -= 4096;
to += 4096;
from += 4096;
}
}
unsigned long _copy_to_user_key(void __user *to, const void *from,
unsigned long n, unsigned long key)
{
might_fault();
if (should_fail_usercopy())
return n;
instrument_copy_to_user(to, from, n);
return raw_copy_to_user_key(to, from, n, key);
}
EXPORT_SYMBOL(_copy_to_user_key);
#define CMPXCHG_USER_KEY_MAX_LOOPS 128
static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval,
unsigned int old, unsigned int new,
unsigned int mask, unsigned long key)
static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval,
unsigned int old, unsigned int new,
unsigned int mask, unsigned long key)
{
unsigned long count;
unsigned int prev;
bool sacf_flag;
int rc = 0;
skey_regions_initialize();
sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
" sacf 256\n"
" llill %[count],%[max_loops]\n"
"0: l %[prev],%[address]\n"
"1: nr %[prev],%[mask]\n"
@ -178,8 +61,7 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
" nr %[tmp],%[mask]\n"
" jnz 5f\n"
" brct %[count],2b\n"
"5: sacf 768\n"
" spka %[default_key]\n"
"5: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev])
@ -197,16 +79,16 @@ static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsig
[default_key] "J" (PAGE_DEFAULT_KEY),
[max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS)
: "memory", "cc");
disable_sacf_uaccess(sacf_flag);
*uval = prev;
if (!count)
rc = -EAGAIN;
return rc;
}
int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
unsigned char old, unsigned char new, unsigned long key)
int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old,
unsigned char new, unsigned long key)
{
unsigned long address = (unsigned long)addr;
unsigned int prev, shift, mask, _old, _new;
int rc;
@ -215,15 +97,16 @@ int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval,
_old = (unsigned int)old << shift;
_new = (unsigned int)new << shift;
mask = ~(0xff << shift);
rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
*uval = prev >> shift;
return rc;
}
EXPORT_SYMBOL(__cmpxchg_user_key1);
EXPORT_SYMBOL(__cmpxchg_key1);
int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
unsigned short old, unsigned short new, unsigned long key)
int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old,
unsigned short new, unsigned long key)
{
unsigned long address = (unsigned long)addr;
unsigned int prev, shift, mask, _old, _new;
int rc;
@ -232,27 +115,23 @@ int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval,
_old = (unsigned int)old << shift;
_new = (unsigned int)new << shift;
mask = ~(0xffff << shift);
rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key);
rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key);
*uval = prev >> shift;
return rc;
}
EXPORT_SYMBOL(__cmpxchg_user_key2);
EXPORT_SYMBOL(__cmpxchg_key2);
int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
unsigned int old, unsigned int new, unsigned long key)
int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old,
unsigned int new, unsigned long key)
{
unsigned int prev = old;
bool sacf_flag;
int rc = 0;
skey_regions_initialize();
sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
" sacf 256\n"
"0: cs %[prev],%[new],%[address]\n"
"1: sacf 768\n"
" spka %[default_key]\n"
"1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@ -264,27 +143,22 @@ int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
EXPORT_SYMBOL(__cmpxchg_user_key4);
EXPORT_SYMBOL(__cmpxchg_key4);
int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
unsigned long old, unsigned long new, unsigned long key)
int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old,
unsigned long new, unsigned long key)
{
unsigned long prev = old;
bool sacf_flag;
int rc = 0;
skey_regions_initialize();
sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
" sacf 256\n"
"0: csg %[prev],%[new],%[address]\n"
"1: sacf 768\n"
" spka %[default_key]\n"
"1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev])
@ -296,27 +170,22 @@ int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
EXPORT_SYMBOL(__cmpxchg_user_key8);
EXPORT_SYMBOL(__cmpxchg_key8);
int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
__uint128_t old, __uint128_t new, unsigned long key)
int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old,
__uint128_t new, unsigned long key)
{
__uint128_t prev = old;
bool sacf_flag;
int rc = 0;
skey_regions_initialize();
sacf_flag = enable_sacf_uaccess();
asm_inline volatile(
"20: spka 0(%[key])\n"
" sacf 256\n"
"0: cdsg %[prev],%[new],%[address]\n"
"1: sacf 768\n"
" spka %[default_key]\n"
"1: spka %[default_key]\n"
"21:\n"
EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev])
EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev])
@ -328,8 +197,7 @@ int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval,
[key] "a" (key << 4),
[default_key] "J" (PAGE_DEFAULT_KEY)
: "memory", "cc");
disable_sacf_uaccess(sacf_flag);
*uval = prev;
return rc;
}
EXPORT_SYMBOL(__cmpxchg_user_key16);
EXPORT_SYMBOL(__cmpxchg_key16);

View file

@ -10,7 +10,6 @@ obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
obj-$(CONFIG_PFAULT) += pfault.o
obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o

View file

@ -403,7 +403,7 @@ void do_dat_exception(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_dat_exception);
#if IS_ENABLED(CONFIG_PGSTE)
#if IS_ENABLED(CONFIG_KVM)
void do_secure_storage_access(struct pt_regs *regs)
{
@ -470,4 +470,4 @@ void do_secure_storage_access(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_secure_storage_access);
#endif /* CONFIG_PGSTE */
#endif /* CONFIG_KVM */

File diff suppressed because it is too large Load diff

View file

@ -15,7 +15,6 @@
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>
#include <asm/pgtable.h>
/**
* ptep_zap_softleaf_entry() - discard a software leaf entry.
@ -47,9 +46,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
struct vm_area_struct *vma;
unsigned long pgstev;
spinlock_t *ptl;
pgste_t pgste;
pte_t *ptep;
mmap_assert_locked(mm);
@ -64,18 +61,8 @@ void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
if (unlikely(!ptep))
return;
if (pte_swap(*ptep)) {
preempt_disable();
pgste = pgste_get_lock(ptep);
pgstev = pgste_val(pgste);
if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
(pgstev & _PGSTE_GPS_ZERO)) {
ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
pte_clear(mm, vmaddr, ptep);
}
pgste_set_unlock(ptep, pgste);
preempt_enable();
ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
pte_clear(mm, vmaddr, ptep);
}
pte_unmap_unlock(ptep, ptl);
}
@ -108,6 +95,85 @@ void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned lo
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);
/**
 * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
 * @mm: the mm
 * @vmaddr: the userspace address whose pte is to be marked
 *
 * Mark the pte corresponding the given address as unused. This will cause
 * core mm code to just drop this page instead of swapping it.
 *
 * This function needs to be called with interrupts disabled (for example
 * while holding a spinlock), or while holding the mmap lock. Normally this
 * function is called as a result of an unmap operation, and thus KVM common
 * code will already hold kvm->mmu_lock in write mode.
 *
 * Context: Needs to be called while holding the mmap lock or with interrupts
 * disabled.
 */
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmdp, pmd, pmdval;
	pud_t *pudp, pud;
	p4d_t *p4dp, p4d;
	pgd_t *pgdp, pgd;
	spinlock_t *ptl; /* Lock for the host (userspace) page table */
	pte_t *ptep;

	/* Walk down to the pte level; bail out if any level is not mapped. */
	pgdp = pgd_offset(mm, vmaddr);
	pgd = pgdp_get(pgdp);
	if (pgd_none(pgd) || !pgd_present(pgd))
		return;
	p4dp = p4d_offset(pgdp, vmaddr);
	p4d = p4dp_get(p4dp);
	if (p4d_none(p4d) || !p4d_present(p4d))
		return;
	pudp = pud_offset(p4dp, vmaddr);
	pud = pudp_get(pudp);
	/* Large leaf entries carry no pte to mark; nothing to do for them. */
	if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
		return;
	pmdp = pmd_offset(pudp, vmaddr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
		return;
	ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
	if (!ptep)
		return;
	/*
	 * Several paths exists that takes the ptl lock and then call the
	 * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
	 * takes the mmu_lock in write mode first, and then potentially
	 * calls this function, which takes the ptl lock. This can lead to a
	 * deadlock.
	 * The unused page mechanism is only an optimization, if the
	 * _PAGE_UNUSED bit is not set, the unused page is swapped as normal
	 * instead of being discarded.
	 * If the lock is contended the bit is not set and the deadlock is
	 * avoided.
	 */
	if (spin_trylock(ptl)) {
		/*
		 * Make sure the pte we are touching is still the correct
		 * one. In theory this check should not be needed, but
		 * better safe than sorry.
		 * Disabling interrupts or holding the mmap lock is enough to
		 * guarantee that no concurrent updates to the page tables
		 * are possible.
		 */
		if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
			__atomic64_or(_PAGE_UNUSED, (long *)ptep);
		spin_unlock(ptl);
	}
	pte_unmap(ptep);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);
static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{

View file

@ -135,29 +135,6 @@ static inline pte_t __rste_to_pte(unsigned long rste)
return __pte(pteval);
}
/*
 * Initialize the storage keys of a huge mapping, once per folio.
 * Nothing to do if the mm does not use storage keys or the entry is
 * invalid. PG_arch_1 records that the keys were already initialized.
 */
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
	struct folio *folio;
	unsigned long size, paddr;

	if (!mm_uses_skeys(mm) ||
	    rste & _SEGMENT_ENTRY_INVALID)
		return;
	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		/* region-third entry: PUD-sized mapping */
		folio = page_folio(pud_page(__pud(rste)));
		size = PUD_SIZE;
		paddr = rste & PUD_MASK;
	} else {
		/* segment entry: PMD-sized mapping */
		folio = page_folio(pmd_page(__pmd(rste)));
		size = PMD_SIZE;
		paddr = rste & PMD_MASK;
	}
	/* only the first caller for this folio initializes the key range */
	if (!test_and_set_bit(PG_arch_1, &folio->flags.f))
		__storage_key_init_range(paddr, paddr + size);
}
void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@ -173,7 +150,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
} else if (likely(pte_present(pte)))
rste |= _SEGMENT_ENTRY_LARGE;
clear_huge_pte_skeys(mm, rste);
set_pte(ptep, __pte(rste));
}

View file

@ -13,6 +13,7 @@
#include <asm/page.h>
int __bootdata_preserved(cmma_flag);
EXPORT_SYMBOL(cmma_flag);
void arch_free_page(struct page *page, int order)
{

View file

@ -16,13 +16,6 @@
#include <asm/asm.h>
#include <asm/set_memory.h>
/*
 * Set the storage key for the page frame at @addr (opcode 0xb22b, SSKE)
 * and return the updated address operand, i.e. the next frame to process.
 * NOTE(review): the ",1,0" operands presumably select the multiple-block
 * variant of the instruction — confirm against the architecture manual.
 */
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
	asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],1,0"
		     : [addr] "+a" (addr) : [skey] "d" (skey));
	return addr;
}
void __storage_key_init_range(unsigned long start, unsigned long end)
{
unsigned long boundary, size;

View file

@ -114,30 +114,6 @@ err_p4d:
return -ENOMEM;
}
#ifdef CONFIG_PGSTE
/*
 * Allocate a page table with PGSTEs: the lower half holds the ptes
 * (initialized to _PAGE_INVALID), the upper half the zero-initialized
 * PGSTEs. Returns NULL on allocation failure.
 */
struct ptdesc *page_table_alloc_pgste_noprof(struct mm_struct *mm)
{
	struct ptdesc *ptdesc;
	u64 *table;

	ptdesc = pagetable_alloc_noprof(GFP_KERNEL_ACCOUNT, 0);
	if (ptdesc) {
		table = (u64 *)ptdesc_address(ptdesc);
		__arch_set_page_dat(table, 1);
		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	}
	return ptdesc;
}
/* Free a page table allocated with page_table_alloc_pgste_noprof(). */
void page_table_free_pgste(struct ptdesc *ptdesc)
{
	pagetable_free(ptdesc);
}
#endif /* CONFIG_PGSTE */
unsigned long *page_table_alloc_noprof(struct mm_struct *mm)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT;

View file

@ -24,7 +24,6 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgtable.h>
#include <asm/machine.h>
pgprot_t pgprot_writecombine(pgprot_t prot)
@ -116,149 +115,14 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
return old;
}
/*
 * Read the PGSTE stored PTRS_PER_PTE entries after the pte (the PGSTEs
 * occupy the upper half of a PGSTE-enabled page table). Returns 0 when
 * the kernel is built without PGSTE support.
 */
static inline pgste_t pgste_get(pte_t *ptep)
{
	unsigned long pgste = 0;

#ifdef CONFIG_PGSTE
	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
#endif
	return __pgste(pgste);
}
/* Write the PGSTE stored PTRS_PER_PTE entries after the pte. */
static inline void pgste_set(pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
#endif
}
/*
 * Transfer the state of the real storage key (changed/referenced bits
 * plus access key and fetch protection) into the PGSTE. No-op if the mm
 * does not use storage keys or the pte is invalid.
 */
static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
				       struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address, bits, skey;

	if (!mm_uses_skeys(mm) || pte_val(pte) & _PAGE_INVALID)
		return pgste;
	address = pte_val(pte) & PAGE_MASK;
	skey = (unsigned long) page_get_storage_key(address);
	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
	/* Transfer page changed & referenced bit to guest bits in pgste */
	pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */
	/* Copy page access key and fetch protection bit to pgste */
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
	return pgste;
}
/*
 * Program the real storage key for a pte that is about to become valid,
 * using the key state cached in the PGSTE. No-op if the mm does not use
 * storage keys or @entry is invalid.
 */
static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
				 struct mm_struct *mm)
{
#ifdef CONFIG_PGSTE
	unsigned long address;
	unsigned long nkey;

	if (!mm_uses_skeys(mm) || pte_val(entry) & _PAGE_INVALID)
		return;
	/* the current pte must still be invalid, i.e. the page was unmapped */
	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
	address = pte_val(entry) & PAGE_MASK;
	/*
	 * Set page access key and fetch protection bit from pgste.
	 * The guest C/R information is still in the PGSTE, set real
	 * key C/R to 0.
	 */
	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	page_set_storage_key(address, nkey, 0);
#endif
}
/*
 * Install @entry while maintaining user-dirty tracking in the PGSTE:
 * a present, writable and non-protected pte sets PGSTE_UC_BIT.
 */
static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
{
#ifdef CONFIG_PGSTE
	if ((pte_val(entry) & _PAGE_PRESENT) &&
	    (pte_val(entry) & _PAGE_WRITE) &&
	    !(pte_val(entry) & _PAGE_INVALID)) {
		if (!machine_has_esop()) {
			/*
			 * Without enhanced suppression-on-protection force
			 * the dirty bit on for all writable ptes.
			 */
			entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
			entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
		}
		if (!(pte_val(entry) & _PAGE_PROTECT))
			/* This pte allows write access, set user-dirty */
			pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
	}
#endif
	set_pte(ptep, entry);
	return pgste;
}
/*
 * Deliver pending notifications recorded in the PGSTE (PGSTE_IN_BIT,
 * PGSTE_VSIE_BIT) via ptep_notify() and clear the notification bits.
 */
static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
				       unsigned long addr,
				       pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
	unsigned long bits;

	bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
	if (bits) {
		pgste = __pgste(pgste_val(pgste) ^ bits);
		ptep_notify(mm, addr, ptep, bits);
	}
#endif
	return pgste;
}
/*
 * Begin a pte exchange: if the mm has PGSTEs, lock the PGSTE and run any
 * pending notifications. Returns the (locked) PGSTE, or 0 otherwise.
 */
static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
				      unsigned long addr, pte_t *ptep)
{
	pgste_t pgste = __pgste(0);

	if (mm_has_pgste(mm)) {
		pgste = pgste_get_lock(ptep);
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
	}
	return pgste;
}
/*
 * Finish a pte exchange started with ptep_xchg_start(): propagate key
 * state between pte and PGSTE, install @new and unlock the PGSTE.
 * Returns @old, with _PAGE_UNUSED set if the guest marked the page unused.
 */
static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
				     unsigned long addr, pte_t *ptep,
				     pgste_t pgste, pte_t old, pte_t new)
{
	if (mm_has_pgste(mm)) {
		if (pte_val(old) & _PAGE_INVALID)
			pgste_set_key(ptep, pgste, new, mm); /* page becomes valid */
		if (pte_val(new) & _PAGE_INVALID) {
			/* page becomes invalid: save key state, flag unused pages */
			pgste = pgste_update_all(old, pgste, mm);
			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
			    _PGSTE_GPS_USAGE_UNUSED)
				old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
		}
		pgste = pgste_set_pte(ptep, pgste, new);
		pgste_set_unlock(ptep, pgste);
	} else {
		set_pte(ptep, new);
	}
	return old;
}
pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t new)
{
pgste_t pgste;
pte_t old;
int nodat;
preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
old = ptep_flush_direct(mm, addr, ptep, nodat);
old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
old = ptep_flush_direct(mm, addr, ptep, 1);
set_pte(ptep, new);
preempt_enable();
return old;
}
@ -292,15 +156,11 @@ EXPORT_SYMBOL(ptep_reset_dat_prot);
pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t new)
{
pgste_t pgste;
pte_t old;
int nodat;
preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
old = ptep_flush_lazy(mm, addr, ptep, nodat);
old = ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
old = ptep_flush_lazy(mm, addr, ptep, 1);
set_pte(ptep, new);
preempt_enable();
return old;
}
@ -309,47 +169,22 @@ EXPORT_SYMBOL(ptep_xchg_lazy);
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pgste_t pgste;
pte_t old;
int nodat;
struct mm_struct *mm = vma->vm_mm;
pgste = ptep_xchg_start(mm, addr, ptep);
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
old = ptep_flush_lazy(mm, addr, ptep, nodat);
if (mm_has_pgste(mm)) {
pgste = pgste_update_all(old, pgste, mm);
pgste_set(ptep, pgste);
}
return old;
return ptep_flush_lazy(vma->vm_mm, addr, ptep, 1);
}
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t old_pte, pte_t pte)
{
pgste_t pgste;
struct mm_struct *mm = vma->vm_mm;
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else {
set_pte(ptep, pte);
}
set_pte(ptep, pte);
}
static inline void pmdp_idte_local(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
if (machine_has_tlb_guest())
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_LOCAL);
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE, mm->context.asce, IDTE_LOCAL);
else
__pmdp_idte(addr, pmdp, 0, 0, IDTE_LOCAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_local(mm, addr);
}
static inline void pmdp_idte_global(struct mm_struct *mm,
@ -358,12 +193,8 @@ static inline void pmdp_idte_global(struct mm_struct *mm,
if (machine_has_tlb_guest()) {
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_GLOBAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
} else {
__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
}
}
@ -398,8 +229,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
cpumask_of(smp_processor_id()))) {
set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
} else {
pmdp_idte_global(mm, addr, pmdp);
}
@ -407,40 +236,6 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
return old;
}
#ifdef CONFIG_PGSTE
/*
 * Walk the page tables of @mm and return the pmd for @addr in *@pmdp.
 * Returns 0 on success, -ENOENT if an intermediate table is not present,
 * -EFAULT if there is no VMA or the mapping uses an unsupported PUD leaf.
 */
static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
	struct vm_area_struct *vma;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	/* We need a valid VMA, otherwise this is clearly a fault. */
	vma = vma_lookup(mm, addr);
	if (!vma)
		return -EFAULT;
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		return -ENOENT;
	p4d = p4d_offset(pgd, addr);
	if (!p4d_present(*p4d))
		return -ENOENT;
	pud = pud_offset(p4d, addr);
	if (!pud_present(*pud))
		return -ENOENT;
	/* Large PUDs are not supported yet. */
	if (pud_leaf(*pud))
		return -EFAULT;
	*pmdp = pmd_offset(pud, addr);
	return 0;
}
#endif
pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t new)
{
@ -558,598 +353,3 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_PGSTE
/*
 * Install a pte while keeping the PGSTE consistent: drop the
 * logical-zero state and program the storage key cached in the PGSTE.
 */
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry)
{
	pgste_t pgste;

	/* the mm_has_pgste() check is done in set_pte_at() */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	/* the page is no longer logically zero once it gets mapped */
	pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
	pgste_set_key(ptep, pgste, entry, mm);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
/*
 * Arm invalidation notification for this pte: set PGSTE_IN_BIT so the
 * next invalidation triggers ptep_notify() (see pgste_pte_notify()).
 */
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pgste_t pgste;

	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
/**
 * ptep_force_prot - change access rights of a locked pte
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the guest address space
 * @ptep: pointer to the page table entry
 * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
 * @bit: pgste bit to set (e.g. for notification)
 *
 * Returns 0 if the access rights were changed and -EAGAIN if the current
 * and requested access rights are incompatible.
 */
int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
		    pte_t *ptep, int prot, unsigned long bit)
{
	pte_t entry;
	pgste_t pgste;
	int pte_i, pte_p, nodat;

	pgste = pgste_get_lock(ptep);
	entry = *ptep;
	/* Check pte entry after all locks have been acquired */
	pte_i = pte_val(entry) & _PAGE_INVALID;
	pte_p = pte_val(entry) & _PAGE_PROTECT;
	/* asking for more rights than the pte currently allows -> -EAGAIN */
	if ((pte_i && (prot != PROT_NONE)) ||
	    (pte_p && (prot & PROT_WRITE))) {
		pgste_set_unlock(ptep, pgste);
		return -EAGAIN;
	}
	/* Change access rights and set pgste bit */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	if (prot == PROT_NONE && !pte_i) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		/* save changed/referenced key state before invalidating */
		pgste = pgste_update_all(entry, pgste, mm);
		entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
	}
	if (prot == PROT_READ && !pte_p) {
		ptep_flush_direct(mm, addr, ptep, nodat);
		entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
		entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
	}
	pgste = set_pgste_bit(pgste, bit);
	pgste = pgste_set_pte(ptep, pgste, entry);
	pgste_set_unlock(ptep, pgste);
	return 0;
}
/*
 * Create a shadow pte @tptep from source pte @sptep for vSIE: the shadow
 * takes the page frame of the source and the protection of @pte.
 * Returns 1 if a shadow pte was created, 0 if already shadowed, and
 * -EAGAIN if the source pte is invalid or lacks the required rights.
 */
int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
		    pte_t *sptep, pte_t *tptep, pte_t pte)
{
	pgste_t spgste, tpgste;
	pte_t spte, tpte;
	int rc = -EAGAIN;

	if (!(pte_val(*tptep) & _PAGE_INVALID))
		return 0; /* already shadowed */
	spgste = pgste_get_lock(sptep);
	spte = *sptep;
	if (!(pte_val(spte) & _PAGE_INVALID) &&
	    !((pte_val(spte) & _PAGE_PROTECT) &&
	      !(pte_val(pte) & _PAGE_PROTECT))) {
		/* mark the source so its invalidation notifies the vSIE layer */
		spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
		tpgste = pgste_get_lock(tptep);
		tpte = __pte((pte_val(spte) & PAGE_MASK) |
			     (pte_val(pte) & _PAGE_PROTECT));
		/* don't touch the storage key - it belongs to parent pgste */
		tpgste = pgste_set_pte(tptep, tpgste, tpte);
		pgste_set_unlock(tptep, tpgste);
		rc = 1;
	}
	pgste_set_unlock(sptep, spgste);
	return rc;
}
/* Invalidate a shadow (vSIE) pte; the caller handles notification. */
void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
{
	pgste_t pgste;
	int nodat;

	pgste = pgste_get_lock(ptep);
	/* notifier is called by the caller */
	nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
	ptep_flush_direct(mm, saddr, ptep, nodat);
	/* don't touch the storage key - it belongs to parent pgste */
	pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
	pgste_set_unlock(ptep, pgste);
}
/*
 * Release the resources behind a software leaf (swap or migration)
 * entry and adjust the matching mm counters.
 */
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
	if (softleaf_is_swap(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (softleaf_is_migration(entry)) {
		struct folio *folio = softleaf_to_folio(entry);

		dec_mm_counter(mm, mm_counter(folio));
	}
	free_swap_and_cache(entry);
}
/*
 * Discard a page the guest declared unused or logically zero instead of
 * keeping its swap state. With @reset, also clear the guest usage and
 * NODAT state in the PGSTE.
 */
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, int reset)
{
	unsigned long pgstev;
	pgste_t pgste;
	pte_t pte;

	/* Zap unused and logically-zero pages */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	pte = *ptep;
	/* only swapped-out ptes in unused/zero guest state may be dropped */
	if (!reset && pte_swap(pte) &&
	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
	     (pgstev & _PGSTE_GPS_ZERO))) {
		ptep_zap_softleaf_entry(mm, softleaf_from_pte(pte));
		pte_clear(mm, addr, ptep);
	}
	if (reset)
		pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
/* Reset the storage key state of the page behind @ptep to the default. */
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	unsigned long ptev;
	pgste_t pgste;

	/* Clear storage key ACC and F, but set R/C */
	preempt_disable();
	pgste = pgste_get_lock(ptep);
	pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
	ptev = pte_val(*ptep);
	/* only valid, writable pages carry a real key that needs resetting */
	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
	pgste_set_unlock(ptep, pgste);
	preempt_enable();
}
/*
 * Test and reset if a guest page is dirty
 *
 * Checks and clears the PGSTE user-dirty (UC) bit. If the page was dirty
 * and the pte is present, the mapping is re-protected (or invalidated
 * without ESOP) so that further writes are caught again.
 * Returns true if the page was dirty.
 */
bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
			    pte_t *ptep)
{
	pgste_t pgste;
	pte_t pte;
	bool dirty;
	int nodat;

	pgste = pgste_get_lock(ptep);
	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
	pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
	pte = *ptep;
	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
		nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
		ptep_ipte_global(mm, addr, ptep, nodat);
		/* with ESOP a protected pte suffices to catch the next write;
		 * without it the pte must be fully invalidated */
		if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
			pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
		else
			pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
		set_pte(ptep, pte);
	}
	pgste_set_unlock(ptep, pgste);
	return dirty;
}
EXPORT_SYMBOL_GPL(ptep_test_and_clear_uc);
/*
 * Set the guest storage key for the page at @addr in @mm.
 * @nq: do not use the quiescing variant for pte-mapped pages.
 * Returns 0 on success, -EFAULT if no usable mapping exists and a
 * non-zero key was requested.
 */
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char key, bool nq)
{
	unsigned long keyul, paddr;
	spinlock_t *ptl;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * we can ignore attempts to set the key to 0, because it already is 0.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return key ? -EFAULT : 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return key ? -EFAULT : 0;
	}
	if (pmd_leaf(*pmdp)) {
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		/*
		 * Huge pmds need quiescing operations, they are
		 * always mapped.
		 */
		page_set_storage_key(paddr, key, 1);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again; /* pte table disappeared, redo the pmd walk */
	new = old = pgste_get_lock(ptep);
	new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
			      PGSTE_ACC_BITS | PGSTE_FP_BIT);
	keyul = (unsigned long) key;
	new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
	new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long bits, skey;

		paddr = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(paddr);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(paddr, skey, !nq);
		/* Merge host changed & referenced into pgste */
		new = set_pgste_bit(new, bits << 52);
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		new = set_pgste_bit(new, PGSTE_UC_BIT);
	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
/*
 * Conditionally set a guest storage key (handling csske).
 * oldkey will be updated when either mr or mc is set and a pointer is given.
 *
 * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest
 * storage key was updated and -EFAULT on access errors.
 */
int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			       unsigned char key, unsigned char *oldkey,
			       bool nq, bool mr, bool mc)
{
	unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
	int rc;

	/* we can drop the pgste lock between getting and setting the key */
	if (mr | mc) {
		rc = get_guest_storage_key(current->mm, addr, &tmp);
		if (rc)
			return rc;
		if (oldkey)
			*oldkey = tmp;
		/* widen the comparison mask for bits that are not bypassed */
		if (!mr)
			mask |= _PAGE_REFERENCED;
		if (!mc)
			mask |= _PAGE_CHANGED;
		/* nothing to do if no compared key bits would change */
		if (!((tmp ^ key) & mask))
			return 0;
	}
	rc = set_guest_storage_key(current->mm, addr, key, nq);
	return rc < 0 ? rc : 1;
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
/*
 * Reset a guest reference bit (rrbe), returning the reference and changed bit.
 *
 * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
 */
int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
	spinlock_t *ptl;
	unsigned long paddr;
	pgste_t old, new;
	pmd_t *pmdp;
	pte_t *ptep;
	int cc = 0;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0 and there is nothing for us to do.
	 */
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}
	if (pmd_leaf(*pmdp)) {
		/* huge mapping: operate on the real key of the frame */
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		cc = page_reset_referenced(paddr);
		spin_unlock(ptl);
		return cc;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again; /* pte table disappeared, redo the pmd walk */
	new = old = pgste_get_lock(ptep);
	/* Reset guest reference bit only */
	new = clear_pgste_bit(new, PGSTE_GR_BIT);
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		paddr = pte_val(*ptep) & PAGE_MASK;
		cc = page_reset_referenced(paddr);
		/* Merge real referenced bit into host-set */
		new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
	}
	/* Reflect guest's logical view, not physical */
	cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
	/* Changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
		new = set_pgste_bit(new, PGSTE_UC_BIT);
	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);
/*
 * Retrieve the guest view of the storage key for @addr into *@key.
 * Returns 0 on success, -EFAULT if the address has no usable mapping.
 */
int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned char *key)
{
	unsigned long paddr;
	spinlock_t *ptl;
	pgste_t pgste;
	pmd_t *pmdp;
	pte_t *ptep;

	/*
	 * If we don't have a PTE table and if there is no huge page mapped,
	 * the storage key is 0.
	 */
	*key = 0;
	switch (pmd_lookup(mm, addr, &pmdp)) {
	case -ENOENT:
		return 0;
	case 0:
		break;
	default:
		return -EFAULT;
	}
again:
	ptl = pmd_lock(mm, pmdp);
	if (!pmd_present(*pmdp)) {
		spin_unlock(ptl);
		return 0;
	}
	if (pmd_leaf(*pmdp)) {
		/* huge mapping: the real key is authoritative */
		paddr = pmd_val(*pmdp) & HPAGE_MASK;
		paddr |= addr & ~HPAGE_MASK;
		*key = page_get_storage_key(paddr);
		spin_unlock(ptl);
		return 0;
	}
	spin_unlock(ptl);

	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!ptep)
		goto again; /* pte table disappeared, redo the pmd walk */
	pgste = pgste_get_lock(ptep);
	/* start with the key state cached in the PGSTE ... */
	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
	paddr = pte_val(*ptep) & PAGE_MASK;
	/* ... but prefer the real key if the page is mapped */
	if (!(pte_val(*ptep) & _PAGE_INVALID))
		*key = page_get_storage_key(paddr);
	/* Reflect guest's logical view, not physical */
	*key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
/**
 * pgste_perform_essa - perform ESSA actions on the PGSTE.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @orc: the specific action to perform, see the ESSA_SET_* macros.
 * @oldpte: the PTE will be saved there if the pointer is not NULL.
 * @oldpgste: the old PGSTE will be saved there if the pointer is not NULL.
 *
 * Return: 1 if the page is to be added to the CBRL, otherwise 0,
 * or < 0 in case of error. -EINVAL is returned for invalid values
 * of orc, -EFAULT for invalid addresses.
 */
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
		       unsigned long *oldpte, unsigned long *oldpgste)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;
	int res = 0;

	WARN_ON_ONCE(orc > ESSA_MAX);
	if (unlikely(orc > ESSA_MAX))
		return -EINVAL;
	/* hugetlb mappings have no PGSTEs and cannot be handled here */
	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	if (oldpte)
		*oldpte = pte_val(*ptep);
	if (oldpgste)
		*oldpgste = pgstev;

	switch (orc) {
	case ESSA_GET_STATE:
		break;
	case ESSA_SET_STABLE:
		pgstev &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		break;
	case ESSA_SET_UNUSED:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_UNUSED;
		/* an already invalid (unmapped) unused page can be discarded */
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
		if (pte_val(*ptep) & _PAGE_INVALID)
			res = 1;
		break;
	case ESSA_SET_POT_VOLATILE:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev |= _PGSTE_GPS_USAGE_POT_VOLATILE;
			break;
		}
		if (pgstev & _PGSTE_GPS_ZERO) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			break;
		}
		if (!(pgstev & PGSTE_GC_BIT)) {
			pgstev |= _PGSTE_GPS_USAGE_VOLATILE;
			res = 1;
			break;
		}
		break;
	case ESSA_SET_STABLE_RESIDENT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE;
		/*
		 * Since the resident state can go away any time after this
		 * call, we will not make this page resident. We can revisit
		 * this decision if a guest will ever start using this.
		 */
		break;
	case ESSA_SET_STABLE_IF_RESIDENT:
		if (!(pte_val(*ptep) & _PAGE_INVALID)) {
			pgstev &= ~_PGSTE_GPS_USAGE_MASK;
			pgstev |= _PGSTE_GPS_USAGE_STABLE;
		}
		break;
	case ESSA_SET_STABLE_NODAT:
		pgstev &= ~_PGSTE_GPS_USAGE_MASK;
		pgstev |= _PGSTE_GPS_USAGE_STABLE | _PGSTE_GPS_NODAT;
		break;
	default:
		/* we should never get here! */
		break;
	}
	/* If we are discarding a page, set it to logical zero */
	if (res)
		pgstev |= _PGSTE_GPS_ZERO;

	pgste = __pgste(pgstev);
	pgste_set_unlock(ptep, pgste);
	pte_unmap_unlock(ptep, ptl);
	return res;
}
EXPORT_SYMBOL(pgste_perform_essa);
/**
 * set_pgste_bits - set specific PGSTE bits.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @bits: a bitmask representing the bits that will be touched
 * @value: the values of the bits to be written. Only the bits in the mask
 *	   will be written.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
		   unsigned long bits, unsigned long value)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pgste_t new;
	pte_t *ptep;

	/* hugetlb mappings have no PGSTEs */
	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	new = pgste_get_lock(ptep);
	/* clear the masked bits, then or-in the requested values */
	new = clear_pgste_bit(new, bits);
	new = set_pgste_bit(new, value & bits);
	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(set_pgste_bits);
/**
 * get_pgste - get the current PGSTE for the given address.
 * @mm: the memory context. It must have PGSTEs, no check is performed here!
 * @hva: the host virtual address of the page whose PGSTE is to be processed
 * @pgstep: will be written with the current PGSTE for the given address.
 *
 * Return: 0 on success, < 0 in case of error.
 */
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
	struct vm_area_struct *vma;
	spinlock_t *ptl;
	pte_t *ptep;

	/* hugetlb mappings have no PGSTEs */
	vma = vma_lookup(mm, hva);
	if (!vma || is_vm_hugetlb_page(vma))
		return -EFAULT;
	ptep = get_locked_pte(mm, hva, &ptl);
	if (unlikely(!ptep))
		return -EFAULT;
	*pgstep = pgste_val(pgste_get(ptep));
	pte_unmap_unlock(ptep, ptl);
	return 0;
}
EXPORT_SYMBOL(get_pgste);
#endif

View file

@ -645,7 +645,9 @@ static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *mem
struct kvm_s390_adapter_int {
u64 ind_addr;
u64 ind_gaddr;
u64 summary_addr;
u64 summary_gaddr;
u64 ind_offset;
u32 summary_offset;
u32 adapter_id;

View file

@ -984,6 +984,7 @@ struct kvm_enable_cap {
#define KVM_CAP_GUEST_MEMFD_FLAGS 244
#define KVM_CAP_ARM_SEA_TO_USER 245
#define KVM_CAP_S390_USER_OPEREXEC 246
#define KVM_CAP_S390_KEYOP 247
struct kvm_irq_routing_irqchip {
__u32 irqchip;
@ -1229,6 +1230,16 @@ struct kvm_vfio_spapr_tce {
__s32 tablefd;
};
#define KVM_S390_KEYOP_ISKE 0x01
#define KVM_S390_KEYOP_RRBE 0x02
#define KVM_S390_KEYOP_SSKE 0x03
struct kvm_s390_keyop {
__u64 guest_addr;
__u8 key;
__u8 operation;
__u8 pad[6];
};
/*
* KVM_CREATE_VCPU receives as a parameter the vcpu slot, and returns
* a vcpu fd.
@ -1248,6 +1259,7 @@ struct kvm_vfio_spapr_tce {
#define KVM_S390_UCAS_MAP _IOW(KVMIO, 0x50, struct kvm_s390_ucas_mapping)
#define KVM_S390_UCAS_UNMAP _IOW(KVMIO, 0x51, struct kvm_s390_ucas_mapping)
#define KVM_S390_VCPU_FAULT _IOW(KVMIO, 0x52, unsigned long)
#define KVM_S390_KEYOP _IOWR(KVMIO, 0x53, struct kvm_s390_keyop)
/* Device model IOC */
#define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60)

View file

@ -343,15 +343,6 @@ int hugepage_madvise(struct vm_area_struct *vma,
{
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
/*
* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
* can't handle this properly after s390_enable_sie, so we simply
* ignore the madvise to prevent qemu from causing a SIGSEGV.
*/
if (mm_has_pgste(vma->vm_mm))
return 0;
#endif
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*

View file

@ -203,6 +203,7 @@ TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test
TEST_GEN_PROGS_s390 += s390/shared_zeropage_test
TEST_GEN_PROGS_s390 += s390/ucontrol_test
TEST_GEN_PROGS_s390 += s390/user_operexec
TEST_GEN_PROGS_s390 += s390/keyop
TEST_GEN_PROGS_s390 += rseq_test
TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON)

View file

@ -0,0 +1,299 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Test for s390x KVM_S390_KEYOP
*
* Copyright IBM Corp. 2026
*
* Authors:
* Claudio Imbrenda <imbrenda@linux.ibm.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/bits.h>
#include "test_util.h"
#include "kvm_util.h"
#include "kselftest.h"
#include "processor.h"
#define BUF_PAGES 128UL
#define GUEST_PAGES 256UL
#define BUF_START_GFN (GUEST_PAGES - BUF_PAGES)
#define BUF_START_ADDR (BUF_START_GFN << PAGE_SHIFT)
#define KEY_BITS_ACC 0xf0
#define KEY_BIT_F 0x08
#define KEY_BIT_R 0x04
#define KEY_BIT_C 0x02
#define KEY_BITS_RC (KEY_BIT_R | KEY_BIT_C)
#define KEY_BITS_ALL (KEY_BITS_ACC | KEY_BIT_F | KEY_BITS_RC)
static unsigned char tmp[BUF_PAGES];
static unsigned char old[BUF_PAGES];
static unsigned char expected[BUF_PAGES];
static int _get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
{
struct kvm_s390_skeys skeys_ioctl = {
.start_gfn = BUF_START_GFN,
.count = BUF_PAGES,
.skeydata_addr = (unsigned long)skeys,
};
return __vm_ioctl(vcpu->vm, KVM_S390_GET_SKEYS, &skeys_ioctl);
}
/* Like _get_skeys(), but assert that the ioctl succeeded. */
static void get_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
{
	int rc = _get_skeys(vcpu, skeys);

	TEST_ASSERT(!rc, "Failed to get storage keys, r=%d", rc);
}
static void set_skeys(struct kvm_vcpu *vcpu, unsigned char skeys[])
{
struct kvm_s390_skeys skeys_ioctl = {
.start_gfn = BUF_START_GFN,
.count = BUF_PAGES,
.skeydata_addr = (unsigned long)skeys,
};
int r;
r = __vm_ioctl(vcpu->vm, KVM_S390_SET_SKEYS, &skeys_ioctl);
TEST_ASSERT(!r, "Failed to set storage keys, r=%d", r);
}
/*
 * Perform one KVM_S390_KEYOP operation @op with key @skey on the page at
 * index @page_idx within the test buffer. Returns the previous key reported
 * by the ioctl.
 */
static int do_keyop(struct kvm_vcpu *vcpu, int op, unsigned long page_idx, unsigned char skey)
{
	struct kvm_s390_keyop keyop = {
		.guest_addr = BUF_START_ADDR + page_idx * PAGE_SIZE,
		.key = skey,
		.operation = op,
	};
	int rc = __vm_ioctl(vcpu->vm, KVM_S390_KEYOP, &keyop);

	TEST_ASSERT(!rc, "Failed to perform keyop, r=%d", rc);
	/* Bit 0 of a storage key is unassigned and must always read as zero. */
	TEST_ASSERT((keyop.key & 1) == 0,
		    "Last bit of key is 1, should be 0! page %lu, new key=%#x, old key=%#x",
		    page_idx, skey, keyop.key);
	return keyop.key;
}
/*
 * Fault in every page of the test buffer, but only when the requested
 * fault-in location @where matches the current spot @cur_loc in the test
 * sequence; otherwise this is a no-op.
 */
static void fault_in_buffer(struct kvm_vcpu *vcpu, int where, int cur_loc)
{
	unsigned long page;
	int rc;

	if (where != cur_loc)
		return;
	for (page = 0; page < BUF_PAGES; page++) {
		rc = ioctl(vcpu->fd, KVM_S390_VCPU_FAULT, BUF_START_ADDR + page * PAGE_SIZE);
		TEST_ASSERT(!rc, "Faulting in buffer page %lu, r=%d", page, rc);
	}
}
/* Fill @skeys with a distinct sequential key per page (bit 0 always clear). */
static inline void set_pattern(unsigned char skeys[])
{
	int n;

	for (n = 0; n < BUF_PAGES; n++)
		skeys[n] = n << 1;
}
/* Print the given storage key array to stderr, 32 keys per row, for debugging. */
static void dump_sk(const unsigned char skeys[], const char *descr)
{
	int row, col;

	fprintf(stderr, "# %s:\n", descr);
	for (row = 0; row < BUF_PAGES; row += 32) {
		fprintf(stderr, "# %3d: ", row);
		for (col = 0; col < 32; col++)
			fprintf(stderr, "%02x ", skeys[row + col]);
		fprintf(stderr, "\n");
	}
}
/*
 * Compare two key arrays; on a mismatch, dump both arrays once and fail the
 * test, reporting the first differing page. Parameters are named so they do
 * not shadow the file-scope "expected" array.
 */
static inline void compare(const unsigned char got[], const unsigned char want[],
			   const char *descr, int fault_in_loc)
{
	int idx;

	for (idx = 0; idx < BUF_PAGES; idx++) {
		if (want[idx] != got[idx]) {
			dump_sk(want, "Expected");
			dump_sk(got, "Got");
		}
		TEST_ASSERT(want[idx] == got[idx],
			    "%s! fault-in location %d, page %d, expected %#x, got %#x",
			    descr, fault_in_loc, idx, want[idx], got[idx]);
	}
}
/* Reset all three file-scope key arrays to zero. */
static inline void clear_all(void)
{
	memset(tmp, 0, sizeof(tmp));
	memset(old, 0, sizeof(old));
	memset(expected, 0, sizeof(expected));
}
/*
 * Basic sanity test: keys set via KVM_S390_SET_SKEYS must read back
 * identically via KVM_S390_GET_SKEYS, both for all-zero keys and for a
 * distinct per-page pattern. The "expected" array is zeroed by the caller
 * (run_test()) before this function is invoked.
 */
static void test_init(struct kvm_vcpu *vcpu, int fault_in)
{
	/* Set all storage keys to zero */
	fault_in_buffer(vcpu, fault_in, 1);
	set_skeys(vcpu, expected);
	fault_in_buffer(vcpu, fault_in, 2);
	get_skeys(vcpu, tmp);
	compare(tmp, expected, "Setting keys not zero", fault_in);

	/* Set storage keys to a sequential pattern */
	fault_in_buffer(vcpu, fault_in, 3);
	set_pattern(expected);
	set_skeys(vcpu, expected);
	fault_in_buffer(vcpu, fault_in, 4);
	get_skeys(vcpu, tmp);
	compare(tmp, expected, "Setting storage keys failed", fault_in);
}
/*
 * Test KVM_S390_KEYOP_RRBE: the ioctl must return the previous R and C bits
 * of each page and clear only the R bit, leaving all other key bits intact.
 */
static void test_rrbe(struct kvm_vcpu *vcpu, int fault_in)
{
	unsigned char k;
	int i;

	/* Set storage keys to a sequential pattern */
	fault_in_buffer(vcpu, fault_in, 1);
	set_pattern(expected);
	set_skeys(vcpu, expected);

	/* Call the RRBE KEYOP ioctl on each page and verify the result */
	fault_in_buffer(vcpu, fault_in, 2);
	for (i = 0; i < BUF_PAGES; i++) {
		/* Pass a bogus key (0xff); the final compare checks it is not stored. */
		k = do_keyop(vcpu, KVM_S390_KEYOP_RRBE, i, 0xff);
		TEST_ASSERT((expected[i] & KEY_BITS_RC) == k,
			    "Old R or C value mismatch! expected: %#x, got %#x",
			    expected[i] & KEY_BITS_RC, k);
		/* Optionally fault in halfway through, with the buffer partially processed. */
		if (i == BUF_PAGES / 2)
			fault_in_buffer(vcpu, fault_in, 3);
	}
	/* RRBE should have cleared the R bit (and nothing else) on every page. */
	for (i = 0; i < BUF_PAGES; i++)
		expected[i] &= ~KEY_BIT_R;

	/* Verify that only the R bit has been cleared */
	fault_in_buffer(vcpu, fault_in, 4);
	get_skeys(vcpu, tmp);
	compare(tmp, expected, "New value mismatch", fault_in);
}
/*
 * Test KVM_S390_KEYOP_ISKE: the ioctl must return the current storage key of
 * each page without modifying any key.
 */
static void test_iske(struct kvm_vcpu *vcpu, int fault_in)
{
	int i;

	/* Set storage keys to a sequential pattern */
	fault_in_buffer(vcpu, fault_in, 1);
	set_pattern(expected);
	set_skeys(vcpu, expected);

	/* Call the ISKE KEYOP ioctl on each page and verify the result */
	fault_in_buffer(vcpu, fault_in, 2);
	for (i = 0; i < BUF_PAGES; i++) {
		/* Pass a bogus key (0xff); the second compare checks it is not stored. */
		tmp[i] = do_keyop(vcpu, KVM_S390_KEYOP_ISKE, i, 0xff);
		/* Optionally fault in halfway through, with the buffer partially processed. */
		if (i == BUF_PAGES / 2)
			fault_in_buffer(vcpu, fault_in, 3);
	}
	compare(tmp, expected, "Old value mismatch", fault_in);

	/* Check storage keys have not changed */
	fault_in_buffer(vcpu, fault_in, 4);
	get_skeys(vcpu, tmp);
	compare(tmp, expected, "Storage keys values changed", fault_in);
}
/*
 * Test KVM_S390_KEYOP_SSKE: the ioctl must return the previous key of each
 * page and install the requested new key. Bit 0 of the requested key is
 * deliberately set to 1 here; the final compare (against "expected", which
 * has bit 0 clear) checks that it is not stored.
 */
static void test_sske(struct kvm_vcpu *vcpu, int fault_in)
{
	int i;

	/* Set storage keys to a sequential pattern */
	fault_in_buffer(vcpu, fault_in, 1);
	set_pattern(tmp);
	set_skeys(vcpu, tmp);

	/* Call the SSKE KEYOP ioctl on each page and verify the result */
	fault_in_buffer(vcpu, fault_in, 2);
	for (i = 0; i < BUF_PAGES; i++) {
		expected[i] = ~tmp[i] & KEY_BITS_ALL;
		/* Set the new storage keys to be the bit-inversion of the previous ones */
		old[i] = do_keyop(vcpu, KVM_S390_KEYOP_SSKE, i, expected[i] | 1);
		/* Optionally fault in halfway through, with the buffer partially processed. */
		if (i == BUF_PAGES / 2)
			fault_in_buffer(vcpu, fault_in, 3);
	}
	/* Every SSKE call must have reported the key installed by set_skeys(). */
	compare(old, tmp, "Old value mismatch", fault_in);

	/* Verify that the storage keys have been set correctly */
	fault_in_buffer(vcpu, fault_in, 4);
	get_skeys(vcpu, tmp);
	compare(tmp, expected, "New value mismatch", fault_in);
}
/* Table of sub-tests; each runs once for every fault-in location (0 = none). */
static struct testdef {
	const char *name;	/* human-readable name for the TAP output */
	void (*test)(struct kvm_vcpu *vcpu, int fault_in_location);
	int n_fault_in_locations;	/* number of fault-in variants to run */
} testplan[] = {
	{ "Initialization", test_init, 5 },
	{ "RRBE", test_rrbe, 5 },
	{ "ISKE", test_iske, 5 },
	{ "SSKE", test_sske, 5 },
};
/*
 * Run one sub-test at the given fault-in location, in a freshly created VM
 * with one vcpu and GUEST_PAGES pages of memory. Using a new VM per run
 * ensures each sub-test starts with storage keys disabled, which is verified
 * via the KVM_S390_GET_SKEYS_NONE return value before the sub-test runs.
 */
static void run_test(void (*the_test)(struct kvm_vcpu *, int), int fault_in_location)
{
	struct kvm_vcpu *vcpu;
	struct kvm_vm *vm;
	int r;

	vm = vm_create_barebones();
	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, GUEST_PAGES, 0);
	vcpu = __vm_vcpu_add(vm, 0);

	r = _get_skeys(vcpu, tmp);
	TEST_ASSERT(r == KVM_S390_GET_SKEYS_NONE,
		    "Storage keys are not disabled initially, r=%d", r);

	/* Start every sub-test from zeroed scratch arrays. */
	clear_all();
	the_test(vcpu, fault_in_location);
	kvm_vm_free(vm);
}
/*
 * Entry point: require the KEYOP and UCONTROL capabilities, then run every
 * sub-test once per fault-in location, reporting one TAP result per run.
 */
int main(int argc, char *argv[])
{
	int total_tests = 0;
	int idx, loc;

	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_KEYOP));
	TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL));

	ksft_print_header();
	/* The TAP plan is the sum of all fault-in variants of all sub-tests. */
	for (idx = 0; idx < ARRAY_SIZE(testplan); idx++)
		total_tests += testplan[idx].n_fault_in_locations;
	ksft_set_plan(total_tests);

	for (idx = 0; idx < ARRAY_SIZE(testplan); idx++) {
		for (loc = 0; loc < testplan[idx].n_fault_in_locations; loc++) {
			run_test(testplan[idx].test, loc);
			ksft_test_result_pass("%s (fault-in location %d)\n",
					      testplan[idx].name, loc);
		}
	}
	ksft_finished(); /* Print results and exit() accordingly */
}