Merge branches 'fixes', 'arm/smmu/updates', 'intel/vt-d', 'amd/amd-vi' and 'core' into next

Joerg Roedel 2026-02-06 11:10:40 +01:00
49 changed files with 2000 additions and 502 deletions


@ -2675,6 +2675,15 @@ Kernel parameters
1 - Bypass the IOMMU for DMA.
unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH.
iommu.debug_pagealloc=
[KNL,EARLY] When CONFIG_IOMMU_DEBUG_PAGEALLOC is set, this
parameter enables the feature at boot time. By default, it
is disabled and the system behaves the same way as a kernel
built without CONFIG_IOMMU_DEBUG_PAGEALLOC.
Format: { "0" | "1" }
0 - Sanitizer disabled.
1 - Sanitizer enabled, expect runtime overhead.
io7= [HW] IO7 for Marvel-based Alpha systems
See comment before marvel_specify_io7 in
arch/alpha/kernel/core_marvel.c.


@ -13251,6 +13251,7 @@ F: drivers/iommu/
F: include/linux/iommu.h
F: include/linux/iova.h
F: include/linux/of_iommu.h
F: rust/kernel/iommu/
IOMMUFD
M: Jason Gunthorpe <jgg@nvidia.com>


@ -384,6 +384,25 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
config IOMMU_DEBUG_PAGEALLOC
bool "Debug IOMMU mappings against page allocations"
depends on DEBUG_PAGEALLOC && IOMMU_API && PAGE_EXTENSION
help
This enables a consistency check between the kernel page allocator and
the IOMMU subsystem. It verifies that pages being allocated or freed
are not currently mapped in any IOMMU domain.
This helps detect DMA use-after-free bugs where a driver frees a page
but forgets to unmap it from the IOMMU, potentially allowing a device
to overwrite memory that the kernel has repurposed.
These checks are best-effort and may not detect all problems.
Due to performance overhead, this feature is disabled by default.
You must pass "iommu.debug_pagealloc=1" on the kernel command
line to activate the runtime checks.
If unsure, say N.
endif # IOMMU_SUPPORT
source "drivers/iommu/generic_pt/Kconfig"


@ -36,3 +36,4 @@ obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
obj-$(CONFIG_APPLE_DART) += apple-dart.o
obj-$(CONFIG_IOMMU_DEBUG_PAGEALLOC) += iommu-debug-pagealloc.o


@ -30,6 +30,16 @@ config AMD_IOMMU
your BIOS for an option to enable it or if you have an IVRS ACPI
table.
config AMD_IOMMU_IOMMUFD
bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)"
depends on IOMMUFD
depends on AMD_IOMMU
help
Support for IOMMUFD features intended for virtual machines
with accelerated virtual IOMMUs.
Say Y here if you are doing development and testing on this feature.
config AMD_IOMMU_DEBUGFS
bool "Enable AMD IOMMU internals in DebugFS"
depends on AMD_IOMMU && IOMMU_DEBUGFS


@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y += iommu.o init.o quirks.o ppr.o pasid.o
obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o nested.o
obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o


@ -190,4 +190,37 @@ void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
struct dev_table_entry *get_dev_table(struct amd_iommu *iommu);
struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid);
void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
struct protection_domain *domain, u16 domid,
struct pt_iommu_amdv1_hw_info *pt_info,
struct dev_table_entry *new);
void amd_iommu_update_dte(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data,
struct dev_table_entry *new);
static inline void
amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *new)
{
struct dev_table_entry *initial_dte;
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev);
/* All existing DTEs must have the V bit set */
new->data128[0] = DTE_FLAG_V;
new->data128[1] = 0;
/*
* Restore cached persistent DTE bits, which can be set by information
* in IVRS table. See set_dev_entry_from_acpi().
*/
initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
if (initial_dte) {
new->data128[0] |= initial_dte->data128[0];
new->data128[1] |= initial_dte->data128[1];
}
}
/* NESTED */
struct iommu_domain *
amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
const struct iommu_user_data *user_data);
#endif /* AMD_IOMMU_H */


@ -17,9 +17,12 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/iommufd.h>
#include <linux/irqreturn.h>
#include <linux/generic_pt/iommu.h>
#include <uapi/linux/iommufd.h>
/*
* Maximum number of IOMMUs supported
*/
@ -108,6 +111,7 @@
/* Extended Feature 2 Bits */
#define FEATURE_SEVSNPIO_SUP BIT_ULL(1)
#define FEATURE_GCR3TRPMODE BIT_ULL(3)
#define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5)
#define FEATURE_SNPAVICSUP_GAM(x) \
(FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1)
@ -186,6 +190,7 @@
#define CONTROL_EPH_EN 45
#define CONTROL_XT_EN 50
#define CONTROL_INTCAPXT_EN 51
#define CONTROL_GCR3TRPMODE 58
#define CONTROL_IRTCACHEDIS 59
#define CONTROL_SNPAVIC_EN 61
@ -350,6 +355,9 @@
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_MODE_MASK GENMASK_ULL(11, 9)
#define DTE_HOST_TRP GENMASK_ULL(51, 12)
#define DTE_FLAG_PPR BIT_ULL(52)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX GENMASK_ULL(57, 56)
@ -358,7 +366,7 @@
#define DTE_FLAG_IOTLB BIT_ULL(32)
#define DTE_FLAG_MASK (0x3ffULL << 32)
#define DEV_DOMID_MASK 0xffffULL
#define DTE_DOMID_MASK GENMASK_ULL(15, 0)
#define DTE_GCR3_14_12 GENMASK_ULL(60, 58)
#define DTE_GCR3_30_15 GENMASK_ULL(31, 16)
@ -493,6 +501,38 @@ struct pdom_iommu_info {
u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */
};
struct amd_iommu_viommu {
struct iommufd_viommu core;
struct protection_domain *parent; /* nest parent domain for this viommu */
struct list_head pdom_list; /* For protection_domain->viommu_list */
/*
* Per-vIOMMU guest domain ID to host domain ID mapping.
* Indexed by guest domain ID.
*/
struct xarray gdomid_array;
};
/*
* Contains guest domain ID mapping info,
* which is stored in the struct xarray gdomid_array.
*/
struct guest_domain_mapping_info {
refcount_t users;
u32 hdom_id; /* Host domain ID */
};
/*
* Nested domain is specifically used for nested translation
*/
struct nested_domain {
struct iommu_domain domain; /* generic domain handle used by iommu core code */
u16 gdom_id; /* domain ID from gDTE */
struct guest_domain_mapping_info *gdom_info;
struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */
struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belongs to */
};
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
@ -513,6 +553,12 @@ struct protection_domain {
struct mmu_notifier mn; /* mmu notifier for the SVA domain */
struct list_head dev_data_list; /* List of pdom_dev_data */
/*
* List of vIOMMUs that use this protection domain. It is used to
* look up host domain IDs when flushing this domain.
*/
struct list_head viommu_list;
};
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
@ -706,7 +752,7 @@ struct amd_iommu {
u32 flags;
volatile u64 *cmd_sem;
atomic64_t cmd_sem_val;
u64 cmd_sem_val;
/*
* Track physical address to directly use it in build_completion_wait()
* and avoid adding any special checks and handling for kdump.


@ -1122,6 +1122,14 @@ static void iommu_enable_gt(struct amd_iommu *iommu)
return;
iommu_feature_enable(iommu, CONTROL_GT_EN);
/*
* This feature needs to be enabled prior to a call
* to iommu_snp_enable(). Since this function is called
* in early_enable_iommu(), it is safe to enable here.
*/
if (check_feature2(FEATURE_GCR3TRPMODE))
iommu_feature_enable(iommu, CONTROL_GCR3TRPMODE);
}
/* sets a specific bit in the device table entry. */
@ -1179,7 +1187,7 @@ static bool __reuse_device_table(struct amd_iommu *iommu)
for (devid = 0; devid <= pci_seg->last_bdf; devid++) {
old_dev_tbl_entry = &pci_seg->old_dev_tbl_cpy[devid];
dte_v = FIELD_GET(DTE_FLAG_V, old_dev_tbl_entry->data[0]);
dom_id = FIELD_GET(DEV_DOMID_MASK, old_dev_tbl_entry->data[1]);
dom_id = FIELD_GET(DTE_DOMID_MASK, old_dev_tbl_entry->data[1]);
if (!dte_v || !dom_id)
continue;
@ -1877,7 +1885,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h,
iommu->pci_seg = pci_seg;
raw_spin_lock_init(&iommu->lock);
atomic64_set(&iommu->cmd_sem_val, 0);
iommu->cmd_sem_val = 0;
/* Add IOMMU to internal data structures */
list_add_tail(&iommu->list, &amd_iommu_list);


@ -43,6 +43,7 @@
#include <linux/generic_pt/iommu.h>
#include "amd_iommu.h"
#include "iommufd.h"
#include "../irq_remapping.h"
#include "../iommu-pages.h"
@ -75,6 +76,8 @@ static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data,
phys_addr_t top_paddr, unsigned int top_level);
static int device_flush_dte(struct iommu_dev_data *dev_data);
static void amd_iommu_change_top(struct pt_iommu *iommu_table,
phys_addr_t top_paddr, unsigned int top_level);
@ -85,6 +88,10 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain);
static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable);
static void clone_aliases(struct amd_iommu *iommu, struct device *dev);
static int iommu_completion_wait(struct amd_iommu *iommu);
/****************************************************************************
*
* Helper functions
@ -202,6 +209,16 @@ static void update_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_da
spin_unlock_irqrestore(&dev_data->dte_lock, flags);
}
void amd_iommu_update_dte(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data,
struct dev_table_entry *new)
{
update_dte256(iommu, dev_data, new);
clone_aliases(iommu, dev_data->dev);
device_flush_dte(dev_data);
iommu_completion_wait(iommu);
}
static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data,
struct dev_table_entry *dte)
{
@ -1185,7 +1202,12 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data)
{
int i = 0;
while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) {
/*
* cmd_sem holds a monotonically non-decreasing completion sequence
* number.
*/
while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 &&
i < LOOP_TIMEOUT) {
udelay(1);
i += 1;
}
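The rewritten loop condition is the kernel's usual wraparound-safe sequence comparison (the same idea as time_before()): subtract and test the sign of the difference rather than comparing the raw values. A small illustration, editor's sketch rather than part of the patch:

/*
 * (s64)(a - b) < 0 means "a is before b" even across u64 wraparound:
 *   a = 5,       b = 7  ->  5 - 7 = -2 as s64    -> true
 *   a = U64_MAX, b = 1  ->  wraps to -2 as s64   -> true
 * A plain 'a < b' gets the second case wrong.
 */
static inline bool seq_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;
}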
@ -1417,6 +1439,12 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
return iommu_queue_command_sync(iommu, cmd, true);
}
static u64 get_cmdsem_val(struct amd_iommu *iommu)
{
lockdep_assert_held(&iommu->lock);
return ++iommu->cmd_sem_val;
}
/*
* This function queues a completion wait command into the command
* buffer of an IOMMU
@ -1431,20 +1459,19 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
if (!iommu->need_sync)
return 0;
data = atomic64_inc_return(&iommu->cmd_sem_val);
build_completion_wait(&cmd, iommu, data);
raw_spin_lock_irqsave(&iommu->lock, flags);
data = get_cmdsem_val(iommu);
build_completion_wait(&cmd, iommu, data);
ret = __iommu_queue_command_sync(iommu, &cmd, false);
raw_spin_unlock_irqrestore(&iommu->lock, flags);
if (ret)
goto out_unlock;
return ret;
ret = wait_on_sem(iommu, data);
out_unlock:
raw_spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
}
@ -1522,6 +1549,32 @@ static void amd_iommu_flush_tlb_domid(struct amd_iommu *iommu, u32 dom_id)
iommu_completion_wait(iommu);
}
static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size)
{
int ret = 0;
struct amd_iommu_viommu *aviommu;
list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) {
unsigned long i;
struct guest_domain_mapping_info *gdom_info;
struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev,
struct amd_iommu, iommu);
xa_lock(&aviommu->gdomid_array);
xa_for_each(&aviommu->gdomid_array, i, gdom_info) {
struct iommu_cmd cmd;
pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__,
iommu->devid, gdom_info->hdom_id);
build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id,
IOMMU_NO_PASID, false);
ret |= iommu_queue_command(iommu, &cmd);
}
xa_unlock(&aviommu->gdomid_array);
}
return ret;
}
static void amd_iommu_flush_all(struct amd_iommu *iommu)
{
struct iommu_cmd cmd;
@ -1670,6 +1723,17 @@ static int domain_flush_pages_v1(struct protection_domain *pdom,
ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd);
}
/*
* A domain w/ v1 table can be a nest parent, which can have
* multiple nested domains. Each nested domain has a 1:1 mapping
* between gDomID and hDomID. Therefore, flush every hDomID
* associated with this nest parent domain.
*
* See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested()
*/
if (!list_empty(&pdom->viommu_list))
ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size);
return ret;
}
@ -2010,127 +2074,112 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
return ret;
}
static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr,
struct dev_table_entry *new)
{
/* All existing DTEs must have the V bit set */
new->data128[0] = DTE_FLAG_V;
new->data128[1] = 0;
}
/*
* Note:
* The old value for GCR3 table and GPT have been cleared from caller.
*/
static void set_dte_gcr3_table(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data,
struct dev_table_entry *target)
static void set_dte_gcr3_table(struct iommu_dev_data *dev_data,
struct dev_table_entry *new)
{
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
u64 gcr3;
u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
if (!gcr3_info->gcr3_tbl)
return;
new->data[0] |= DTE_FLAG_TV |
(dev_data->ppr ? DTE_FLAG_PPR : 0) |
(pdom_is_v2_pgtbl_mode(dev_data->domain) ? DTE_FLAG_GIOV : 0) |
DTE_FLAG_GV |
FIELD_PREP(DTE_GLX, gcr3_info->glx) |
FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) |
DTE_FLAG_IR | DTE_FLAG_IW;
pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
__func__, dev_data->devid, gcr3_info->glx,
(unsigned long long)gcr3_info->gcr3_tbl);
gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
target->data[0] |= DTE_FLAG_GV |
FIELD_PREP(DTE_GLX, gcr3_info->glx) |
FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12);
if (pdom_is_v2_pgtbl_mode(dev_data->domain))
target->data[0] |= DTE_FLAG_GIOV;
target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) |
FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) |
(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0) |
FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31);
/* Guest page table can only support 4 and 5 levels */
if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL);
else
target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL);
}
void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data,
struct protection_domain *domain, u16 domid,
struct pt_iommu_amdv1_hw_info *pt_info,
struct dev_table_entry *new)
{
u64 host_pt_root = __sme_set(pt_info->host_pt_root);
/* Note: Dirty tracking is used with the v1 table only for now */
new->data[0] |= DTE_FLAG_TV |
FIELD_PREP(DTE_MODE_MASK, pt_info->mode) |
(domain->dirty_tracking ? DTE_FLAG_HAD : 0) |
FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) |
DTE_FLAG_IR | DTE_FLAG_IW;
new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) |
(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);
}
static void set_dte_v1(struct iommu_dev_data *dev_data,
struct protection_domain *domain, u16 domid,
phys_addr_t top_paddr, unsigned int top_level,
struct dev_table_entry *new)
{
struct pt_iommu_amdv1_hw_info pt_info;
/*
* When updating the IO pagetable, the new top and level
* are provided as parameters. For other operations i.e.
* device attach, retrieve the current pagetable info
* via the IOMMU PT API.
*/
if (top_paddr) {
pt_info.host_pt_root = top_paddr;
pt_info.mode = top_level + 1;
} else {
WARN_ON(top_paddr || top_level);
pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info);
}
amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new);
}
static void set_dte_passthrough(struct iommu_dev_data *dev_data,
struct protection_domain *domain,
struct dev_table_entry *new)
{
new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW;
new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) |
(dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0);
}
static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data,
phys_addr_t top_paddr, unsigned int top_level)
{
u16 domid;
u32 old_domid;
struct dev_table_entry *initial_dte;
struct dev_table_entry new = {};
struct protection_domain *domain = dev_data->domain;
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
struct pt_iommu_amdv1_hw_info pt_info;
make_clear_dte(dev_data, dte, &new);
amd_iommu_make_clear_dte(dev_data, &new);
if (gcr3_info && gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
else {
domid = domain->id;
old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK;
if (gcr3_info->gcr3_tbl)
set_dte_gcr3_table(dev_data, &new);
else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY)
set_dte_passthrough(dev_data, domain, &new);
else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) &&
domain->pd_mode == PD_MODE_V1)
set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new);
else
WARN_ON(true);
if (domain->domain.type & __IOMMU_DOMAIN_PAGING) {
/*
* When updating the IO pagetable, the new top and level
* are provided as parameters. For other operations i.e.
* device attach, retrieve the current pagetable info
* via the IOMMU PT API.
*/
if (top_paddr) {
pt_info.host_pt_root = top_paddr;
pt_info.mode = top_level + 1;
} else {
WARN_ON(top_paddr || top_level);
pt_iommu_amdv1_hw_info(&domain->amdv1,
&pt_info);
}
new.data[0] |= __sme_set(pt_info.host_pt_root) |
(pt_info.mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
}
}
new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW;
/*
* When SNP is enabled, we can only support TV=1 with non-zero domain ID.
* This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in
* do_iommu_domain_alloc().
*/
WARN_ON(amd_iommu_snp_en && (domid == 0));
new.data[0] |= DTE_FLAG_TV;
if (dev_data->ppr)
new.data[0] |= 1ULL << DEV_ENTRY_PPR;
if (domain->dirty_tracking)
new.data[0] |= DTE_FLAG_HAD;
if (dev_data->ats_enabled)
new.data[1] |= DTE_FLAG_IOTLB;
old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK;
new.data[1] |= domid;
/*
* Restore cached persistent DTE bits, which can be set by information
* in IVRS table. See set_dev_entry_from_acpi().
*/
initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid);
if (initial_dte) {
new.data128[0] |= initial_dte->data128[0];
new.data128[1] |= initial_dte->data128[1];
}
set_dte_gcr3_table(iommu, dev_data, &new);
update_dte256(iommu, dev_data, &new);
amd_iommu_update_dte(iommu, dev_data, &new);
/*
* A kdump kernel might be replacing a domain ID that was copied from
@ -2148,10 +2197,9 @@ static void set_dte_entry(struct amd_iommu *iommu,
static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data)
{
struct dev_table_entry new = {};
struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid];
make_clear_dte(dev_data, dte, &new);
update_dte256(iommu, dev_data, &new);
amd_iommu_make_clear_dte(dev_data, &new);
amd_iommu_update_dte(iommu, dev_data, &new);
}
/* Update and flush DTE for the given device */
@ -2163,10 +2211,6 @@ static void dev_update_dte(struct iommu_dev_data *dev_data, bool set)
set_dte_entry(iommu, dev_data, 0, 0);
else
clear_dte_entry(iommu, dev_data);
clone_aliases(iommu, dev_data->dev);
device_flush_dte(dev_data);
iommu_completion_wait(iommu);
}
/*
@ -2499,6 +2543,7 @@ static void protection_domain_init(struct protection_domain *domain)
spin_lock_init(&domain->lock);
INIT_LIST_HEAD(&domain->dev_list);
INIT_LIST_HEAD(&domain->dev_data_list);
INIT_LIST_HEAD(&domain->viommu_list);
xa_init(&domain->iommu_array);
}
@ -2760,6 +2805,14 @@ static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev,
return &domain->domain;
}
static inline bool is_nest_parent_supported(u32 flags)
{
/* Only allow nest parent when these features are supported */
return check_feature(FEATURE_GT) &&
check_feature(FEATURE_GIOSUP) &&
check_feature2(FEATURE_GCR3TRPMODE);
}
static struct iommu_domain *
amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
const struct iommu_user_data *user_data)
@ -2767,16 +2820,28 @@ amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
{
struct amd_iommu *iommu = get_amd_iommu_from_dev(dev);
const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
IOMMU_HWPT_ALLOC_PASID;
IOMMU_HWPT_ALLOC_PASID |
IOMMU_HWPT_ALLOC_NEST_PARENT;
if ((flags & ~supported_flags) || user_data)
return ERR_PTR(-EOPNOTSUPP);
switch (flags & supported_flags) {
case IOMMU_HWPT_ALLOC_DIRTY_TRACKING:
/* Allocate domain with v1 page table for dirty tracking */
if (!amd_iommu_hd_support(iommu))
case IOMMU_HWPT_ALLOC_NEST_PARENT:
case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT:
/*
* Allocate domain with v1 page table for dirty tracking
* and/or Nest parent.
*/
if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
!amd_iommu_hd_support(iommu))
break;
if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
!is_nest_parent_supported(flags))
break;
return amd_iommu_domain_alloc_paging_v1(dev, flags);
case IOMMU_HWPT_ALLOC_PASID:
/* Allocate domain with v2 page table if IOMMU supports PASID. */
@ -3078,6 +3143,7 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.hw_info = amd_iommufd_hw_info,
.blocked_domain = &blocked_domain,
.release_domain = &blocked_domain,
.identity_domain = &identity_domain.domain,
@ -3090,6 +3156,8 @@ const struct iommu_ops amd_iommu_ops = {
.is_attach_deferred = amd_iommu_is_attach_deferred,
.def_domain_type = amd_iommu_def_domain_type,
.page_response = amd_iommu_page_response,
.get_viommu_size = amd_iommufd_get_viommu_size,
.viommu_init = amd_iommufd_viommu_init,
};
#ifdef CONFIG_IRQ_REMAP
@ -3114,18 +3182,23 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid)
return;
build_inv_irt(&cmd, devid);
data = atomic64_inc_return(&iommu->cmd_sem_val);
build_completion_wait(&cmd2, iommu, data);
raw_spin_lock_irqsave(&iommu->lock, flags);
data = get_cmdsem_val(iommu);
build_completion_wait(&cmd2, iommu, data);
ret = __iommu_queue_command_sync(iommu, &cmd, true);
if (ret)
goto out;
goto out_err;
ret = __iommu_queue_command_sync(iommu, &cmd2, false);
if (ret)
goto out;
goto out_err;
raw_spin_unlock_irqrestore(&iommu->lock, flags);
wait_on_sem(iommu, data);
out:
return;
out_err:
raw_spin_unlock_irqrestore(&iommu->lock, flags);
}
@ -3239,7 +3312,7 @@ static struct irq_remap_table *alloc_irq_table(struct amd_iommu *iommu,
struct irq_remap_table *new_table = NULL;
struct amd_iommu_pci_seg *pci_seg;
unsigned long flags;
int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
u16 alias;
spin_lock_irqsave(&iommu_table_lock, flags);


@ -0,0 +1,77 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2025 Advanced Micro Devices, Inc.
*/
#include <linux/iommu.h>
#include "iommufd.h"
#include "amd_iommu.h"
#include "amd_iommu_types.h"
static const struct iommufd_viommu_ops amd_viommu_ops;
void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type)
{
struct iommu_hw_info_amd *hwinfo;
if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
*type != IOMMU_HW_INFO_TYPE_AMD)
return ERR_PTR(-EOPNOTSUPP);
hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL);
if (!hwinfo)
return ERR_PTR(-ENOMEM);
*length = sizeof(*hwinfo);
*type = IOMMU_HW_INFO_TYPE_AMD;
hwinfo->efr = amd_iommu_efr;
hwinfo->efr2 = amd_iommu_efr2;
return hwinfo;
}
size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type)
{
return VIOMMU_STRUCT_SIZE(struct amd_iommu_viommu, core);
}
int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
const struct iommu_user_data *user_data)
{
unsigned long flags;
struct protection_domain *pdom = to_pdomain(parent);
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1);
aviommu->parent = pdom;
viommu->ops = &amd_viommu_ops;
spin_lock_irqsave(&pdom->lock, flags);
list_add(&aviommu->pdom_list, &pdom->viommu_list);
spin_unlock_irqrestore(&pdom->lock, flags);
return 0;
}
static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
{
unsigned long flags;
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
struct protection_domain *pdom = aviommu->parent;
spin_lock_irqsave(&pdom->lock, flags);
list_del(&aviommu->pdom_list);
spin_unlock_irqrestore(&pdom->lock, flags);
xa_destroy(&aviommu->gdomid_array);
}
/*
* See include/linux/iommufd.h
* struct iommufd_viommu_ops - vIOMMU specific operations
*/
static const struct iommufd_viommu_ops amd_viommu_ops = {
.destroy = amd_iommufd_viommu_destroy,
};


@ -0,0 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2025 Advanced Micro Devices, Inc.
*/
#ifndef AMD_IOMMUFD_H
#define AMD_IOMMUFD_H
#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD)
void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type);
size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type);
int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent,
const struct iommu_user_data *user_data);
#else
#define amd_iommufd_hw_info NULL
#define amd_iommufd_viommu_init NULL
#define amd_iommufd_get_viommu_size NULL
#endif /* CONFIG_AMD_IOMMU_IOMMUFD */
#endif /* AMD_IOMMUFD_H */
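The #else branch defines the entry points as NULL rather than providing stub functions. That works because the only users are function-pointer initializers; a hedged illustration of the effect, assuming the usual iommu-core behavior of skipping NULL callbacks:

/*
 * With CONFIG_AMD_IOMMU_IOMMUFD=n the initializers in amd_iommu_ops
 * still compile; they just install no callback:
 *
 *	.hw_info	 = amd_iommufd_hw_info,		// NULL
 *	.get_viommu_size = amd_iommufd_get_viommu_size,	// NULL
 *	.viommu_init	 = amd_iommufd_viommu_init,	// NULL
 *
 * so the iommufd features simply report as unsupported.
 */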

drivers/iommu/amd/nested.c (new file, 294 lines)

@ -0,0 +1,294 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2025 Advanced Micro Devices, Inc.
*/
#define dev_fmt(fmt) "AMD-Vi: " fmt
#include <linux/iommu.h>
#include <linux/refcount.h>
#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
static const struct iommu_domain_ops nested_domain_ops;
static inline struct nested_domain *to_ndomain(struct iommu_domain *dom)
{
return container_of(dom, struct nested_domain, domain);
}
/*
* Validate guest DTE to make sure that configuration for host (v1)
* and guest (v2) page tables are valid when allocating nested domain.
*/
static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte)
{
u32 gpt_level = FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->dte[2]);
/* Must be zero: Mode, Host-TPR */
if (FIELD_GET(DTE_MODE_MASK, gdte->dte[0]) != 0 ||
FIELD_GET(DTE_HOST_TRP, gdte->dte[0]) != 0)
return -EINVAL;
/* GCR3 TRP must be non-zero if V and GV are set */
if (FIELD_GET(DTE_FLAG_V, gdte->dte[0]) == 1 &&
FIELD_GET(DTE_FLAG_GV, gdte->dte[0]) == 1 &&
FIELD_GET(DTE_GCR3_14_12, gdte->dte[0]) == 0 &&
FIELD_GET(DTE_GCR3_30_15, gdte->dte[1]) == 0 &&
FIELD_GET(DTE_GCR3_51_31, gdte->dte[1]) == 0)
return -EINVAL;
/* Valid Guest Paging Mode values are 0 and 1 */
if (gpt_level != GUEST_PGTABLE_4_LEVEL &&
gpt_level != GUEST_PGTABLE_5_LEVEL)
return -EINVAL;
/* GLX = 3 is reserved */
if (FIELD_GET(DTE_GLX, gdte->dte[0]) == 3)
return -EINVAL;
/*
* We need to check host capability before setting
* the Guest Paging Mode
*/
if (gpt_level == GUEST_PGTABLE_5_LEVEL &&
amd_iommu_gpt_level < PAGE_MODE_5_LEVEL)
return -EOPNOTSUPP;
return 0;
}
static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index)
{
struct guest_domain_mapping_info *elm, *res;
elm = xa_load(xa, index);
if (elm)
return elm;
xa_unlock(xa);
elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL);
xa_lock(xa);
if (!elm)
return ERR_PTR(-ENOMEM);
res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
if (xa_is_err(res))
res = ERR_PTR(xa_err(res));
if (res) {
kfree(elm);
return res;
}
refcount_set(&elm->users, 0);
return elm;
}
/*
* This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested()
* during the call to struct iommu_ops.viommu_init().
*/
struct iommu_domain *
amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags,
const struct iommu_user_data *user_data)
{
int ret;
struct nested_domain *ndom;
struct guest_domain_mapping_info *gdom_info;
struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core);
if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST)
return ERR_PTR(-EOPNOTSUPP);
ndom = kzalloc(sizeof(*ndom), GFP_KERNEL);
if (!ndom)
return ERR_PTR(-ENOMEM);
ret = iommu_copy_struct_from_user(&ndom->gdte, user_data,
IOMMU_HWPT_DATA_AMD_GUEST,
dte);
if (ret)
goto out_err;
ret = validate_gdte_nested(&ndom->gdte);
if (ret)
goto out_err;
ndom->gdom_id = FIELD_GET(DTE_DOMID_MASK, ndom->gdte.dte[1]);
ndom->domain.ops = &nested_domain_ops;
ndom->domain.type = IOMMU_DOMAIN_NESTED;
ndom->viommu = aviommu;
/*
* Normally, when a guest has multiple pass-through devices, the IOMMU
* driver sets up DTEs with the same stage-2 table and uses the same
* host domain ID (hDomID). In case of nested translation, if the guest
* sets up different stage-1 tables with the same PASID, the IOMMU would
* use the same TLB tag, which results in TLB aliasing.
*
* The guest is assigning gDomIDs based on its own algorithm for managing
* cache tags of (DomID, PASID). Within a single viommu, the nest parent domain
* (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID
* to a single hDomID. This is done using an xarray in the vIOMMU to
* keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES
* command must be issued for each hDomID in the xarray.
*/
xa_lock(&aviommu->gdomid_array);
gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id);
if (IS_ERR(gdom_info)) {
xa_unlock(&aviommu->gdomid_array);
ret = PTR_ERR(gdom_info);
goto out_err;
}
/* Check if the gDomID already exists */
if (refcount_inc_not_zero(&gdom_info->users)) {
ndom->gdom_info = gdom_info;
xa_unlock(&aviommu->gdomid_array);
pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n",
__func__, ndom->gdom_id, gdom_info->hdom_id);
return &ndom->domain;
}
/* The gDomID does not exist. Allocate a new hdom_id */
gdom_info->hdom_id = amd_iommu_pdom_id_alloc();
if (gdom_info->hdom_id <= 0) {
__xa_cmpxchg(&aviommu->gdomid_array,
ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC);
xa_unlock(&aviommu->gdomid_array);
ret = -ENOSPC;
goto out_err_gdom_info;
}
ndom->gdom_info = gdom_info;
refcount_set(&gdom_info->users, 1);
xa_unlock(&aviommu->gdomid_array);
pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n",
__func__, ndom->gdom_id, gdom_info->hdom_id);
return &ndom->domain;
out_err_gdom_info:
kfree(gdom_info);
out_err:
kfree(ndom);
return ERR_PTR(ret);
}
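To make the invariant in the comment above concrete: any two nested domains created through the same vIOMMU with equal gDomIDs must resolve to the same hDomID, so a flush keyed by hDomID reaches both. A lookup on the shared mapping would read like the following illustrative helper, which is not in the patch:

/* Illustrative only: resolve a guest DomID to its host DomID. */
static u32 example_gdomid_to_hdomid(struct amd_iommu_viommu *aviommu,
				    u32 gdom_id)
{
	struct guest_domain_mapping_info *gdom_info;

	gdom_info = xa_load(&aviommu->gdomid_array, gdom_id);
	/* NULL: no nested domain currently uses this gDomID */
	return gdom_info ? gdom_info->hdom_id : 0;
}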
static void set_dte_nested(struct amd_iommu *iommu, struct iommu_domain *dom,
struct iommu_dev_data *dev_data, struct dev_table_entry *new)
{
struct protection_domain *parent;
struct nested_domain *ndom = to_ndomain(dom);
struct iommu_hwpt_amd_guest *gdte = &ndom->gdte;
struct pt_iommu_amdv1_hw_info pt_info;
/*
* The nest parent domain is provided during the call to
* struct iommu_ops.viommu_init() and stored as
* struct amd_iommu_viommu.parent.
*/
if (WARN_ON(!ndom->viommu || !ndom->viommu->parent))
return;
parent = ndom->viommu->parent;
amd_iommu_make_clear_dte(dev_data, new);
/* Retrieve the current pagetable info via the IOMMU PT API. */
pt_iommu_amdv1_hw_info(&parent->amdv1, &pt_info);
/*
* Use domain ID from nested domain to program DTE.
* See amd_iommu_alloc_domain_nested().
*/
amd_iommu_set_dte_v1(dev_data, parent, ndom->gdom_info->hdom_id,
&pt_info, new);
/* GV is required for nested page table */
new->data[0] |= DTE_FLAG_GV;
/* Guest PPR */
new->data[0] |= gdte->dte[0] & DTE_FLAG_PPR;
/* Guest translation configuration */
new->data[0] |= gdte->dte[0] & (DTE_GLX | DTE_FLAG_GIOV);
/* GCR3 table */
new->data[0] |= gdte->dte[0] & DTE_GCR3_14_12;
new->data[1] |= gdte->dte[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31);
/* Guest paging mode */
new->data[2] |= gdte->dte[2] & DTE_GPT_LEVEL_MASK;
}
static int nested_attach_device(struct iommu_domain *dom, struct device *dev,
struct iommu_domain *old)
{
struct dev_table_entry new = {0};
struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev);
struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data);
int ret = 0;
/*
* Make sure PASID is not enabled
* for this attach path.
*/
if (WARN_ON(dev_data->pasid_enabled))
return -EINVAL;
mutex_lock(&dev_data->mutex);
set_dte_nested(iommu, dom, dev_data, &new);
amd_iommu_update_dte(iommu, dev_data, &new);
mutex_unlock(&dev_data->mutex);
return ret;
}
static void nested_domain_free(struct iommu_domain *dom)
{
struct guest_domain_mapping_info *curr;
struct nested_domain *ndom = to_ndomain(dom);
struct amd_iommu_viommu *aviommu = ndom->viommu;
xa_lock(&aviommu->gdomid_array);
if (!refcount_dec_and_test(&ndom->gdom_info->users)) {
xa_unlock(&aviommu->gdomid_array);
return;
}
/*
* The refcount for the gdom_id to hdom_id mapping is zero.
* It is now safe to remove the mapping.
*/
curr = __xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id,
ndom->gdom_info, NULL, GFP_ATOMIC);
xa_unlock(&aviommu->gdomid_array);
if (WARN_ON(!curr || xa_err(curr)))
return;
/* success */
pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n",
__func__, ndom->gdom_id, curr->hdom_id);
amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id);
kfree(curr);
kfree(ndom);
}
static const struct iommu_domain_ops nested_domain_ops = {
.attach_dev = nested_attach_device,
.free = nested_domain_free,
};


@ -121,7 +121,6 @@ config ARM_SMMU_V3_KUNIT_TEST
config TEGRA241_CMDQV
bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3"
depends on ACPI
help
Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. The
CMDQ-V extension is similar to v3.3 ECMDQ for multi command queues


@ -177,7 +177,9 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
* config bit here base this off the EATS value in the STE. If the EATS
* is set then the VM must generate ATC flushes.
*/
state.disable_ats = !nested_domain->enable_ats;
if (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])) ==
STRTAB_STE_0_CFG_S1_TRANS)
state.disable_ats = !nested_domain->enable_ats;
ret = arm_smmu_attach_prepare(&state, domain);
if (ret) {
mutex_unlock(&arm_smmu_asid_lock);


@ -33,18 +33,25 @@ static struct mm_struct sva_mm = {
enum arm_smmu_test_master_feat {
ARM_SMMU_MASTER_TEST_ATS = BIT(0),
ARM_SMMU_MASTER_TEST_STALL = BIT(1),
ARM_SMMU_MASTER_TEST_NESTED = BIT(2),
};
static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste,
enum arm_smmu_test_master_feat feat);
static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry,
const __le64 *used_bits,
const __le64 *target,
const __le64 *safe,
unsigned int length)
{
bool differs = false;
unsigned int i;
for (i = 0; i < length; i++) {
if ((entry[i] & used_bits[i]) != target[i])
__le64 used = used_bits[i] & ~safe[i];
if ((entry[i] & used) != (target[i] & used))
differs = true;
}
return differs;
@ -56,12 +63,24 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer)
struct arm_smmu_test_writer *test_writer =
container_of(writer, struct arm_smmu_test_writer, writer);
__le64 *entry_used_bits;
__le64 *safe_target;
__le64 *safe_init;
entry_used_bits = kunit_kzalloc(
test_writer->test, sizeof(*entry_used_bits) * NUM_ENTRY_QWORDS,
GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits);
safe_target = kunit_kzalloc(test_writer->test,
sizeof(*safe_target) * NUM_ENTRY_QWORDS,
GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_target);
safe_init = kunit_kzalloc(test_writer->test,
sizeof(*safe_init) * NUM_ENTRY_QWORDS,
GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_init);
pr_debug("STE value is now set to: ");
print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8,
test_writer->entry,
@ -79,14 +98,23 @@ arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer)
* configuration.
*/
writer->ops->get_used(test_writer->entry, entry_used_bits);
if (writer->ops->get_update_safe)
writer->ops->get_update_safe(test_writer->entry,
test_writer->init_entry,
safe_init);
if (writer->ops->get_update_safe)
writer->ops->get_update_safe(test_writer->entry,
test_writer->target_entry,
safe_target);
KUNIT_EXPECT_FALSE(
test_writer->test,
arm_smmu_entry_differs_in_used_bits(
test_writer->entry, entry_used_bits,
test_writer->init_entry, NUM_ENTRY_QWORDS) &&
test_writer->init_entry, safe_init,
NUM_ENTRY_QWORDS) &&
arm_smmu_entry_differs_in_used_bits(
test_writer->entry, entry_used_bits,
test_writer->target_entry,
test_writer->target_entry, safe_target,
NUM_ENTRY_QWORDS));
}
}
@ -106,6 +134,7 @@ arm_smmu_v3_test_debug_print_used_bits(struct arm_smmu_entry_writer *writer,
static const struct arm_smmu_entry_writer_ops test_ste_ops = {
.sync = arm_smmu_test_writer_record_syncs,
.get_used = arm_smmu_get_ste_used,
.get_update_safe = arm_smmu_get_ste_update_safe,
};
static const struct arm_smmu_entry_writer_ops test_cd_ops = {
@ -185,6 +214,18 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste,
};
arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss);
if (feat & ARM_SMMU_MASTER_TEST_NESTED) {
struct arm_smmu_ste s2ste;
int i;
arm_smmu_test_make_s2_ste(&s2ste,
feat & ~ARM_SMMU_MASTER_TEST_NESTED);
ste->data[0] |= cpu_to_le64(
FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_NESTED));
ste->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV);
for (i = 2; i < NUM_ENTRY_QWORDS; i++)
ste->data[i] = s2ste.data[i];
}
}
static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test)
@ -542,6 +583,35 @@ static void arm_smmu_v3_write_ste_test_s2_to_s1_stall(struct kunit *test)
NUM_EXPECTED_SYNCS(3));
}
static void
arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass(struct kunit *test)
{
struct arm_smmu_ste s1_ste;
struct arm_smmu_ste s2_ste;
arm_smmu_test_make_cdtable_ste(
&s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr,
ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED);
arm_smmu_test_make_s2_ste(&s2_ste, 0);
/* Expect an additional sync to unset ignored bits: EATS and MEV */
arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste,
NUM_EXPECTED_SYNCS(3));
}
static void
arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass(struct kunit *test)
{
struct arm_smmu_ste s1_ste;
struct arm_smmu_ste s2_ste;
arm_smmu_test_make_cdtable_ste(
&s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr,
ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED);
arm_smmu_test_make_s2_ste(&s2_ste, 0);
arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste,
NUM_EXPECTED_SYNCS(2));
}
static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test)
{
struct arm_smmu_cd cd = {};
@ -588,6 +658,8 @@ static struct kunit_case arm_smmu_v3_test_cases[] = {
KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall),
KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall),
KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass),
KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear),
KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release),
{},


@ -487,20 +487,26 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
*/
static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
{
int val;
/*
* We can try to avoid the cmpxchg() loop by simply incrementing the
* lock counter. When held in exclusive state, the lock counter is set
* to INT_MIN so these increments won't hurt as the value will remain
* negative.
* When held in exclusive state, the lock counter is set to INT_MIN
* so these increments won't hurt as the value will remain negative.
* The increment will also signal the exclusive locker that there are
* shared waiters.
*/
if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
return;
do {
val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
} while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
/*
* Someone else is holding the lock in exclusive state, so wait
* for them to finish. Since we already incremented the lock counter,
* no exclusive lock can be acquired until we finish. We don't need
* the return value since we only care that the exclusive lock is
* released (i.e. the lock counter is non-negative).
* Once the exclusive locker releases the lock, the sign bit will
* be cleared and our increment will make the lock counter positive,
* allowing us to proceed.
*/
atomic_cond_read_relaxed(&cmdq->lock, VAL > 0);
}
static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
@ -527,9 +533,14 @@ static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
__ret; \
})
/*
* Only clear the sign bit when releasing the exclusive lock; this
* allows any shared_lock() waiters to proceed without the possibility
* of entering the exclusive lock in a tight loop.
*/
#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \
({ \
atomic_set_release(&cmdq->lock, 0); \
atomic_fetch_andnot_release(INT_MIN, &cmdq->lock); \
local_irq_restore(flags); \
})
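Clearing only the sign bit, instead of storing 0 as the replaced atomic_set_release() did, preserves any shared-waiter increments that arrived while the exclusive lock was held. Worked numbers as an editor's illustration:

/*
 * Exclusive lock taken:            lock = INT_MIN      (0x80000000)
 * Three shared waiters increment:  lock = INT_MIN + 3  (0x80000003)
 * andnot(INT_MIN) on release:      lock = 3
 *
 * The waiters spinning in atomic_cond_read_relaxed(&lock, VAL > 0)
 * now see 3 and proceed. With the old 'lock = 0' their increments were
 * discarded, and an exclusive locker's cmpxchg(0, INT_MIN) could win
 * again before any shared waiter ran.
 */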
@ -1082,6 +1093,49 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits)
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_used);
VISIBLE_IF_KUNIT
void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target,
__le64 *safe_bits)
{
const __le64 eats_s1chk =
FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_S1CHK);
const __le64 eats_trans =
FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS);
/*
* When an STE changes EATS_TRANS, the sequencing code in the attach
* logic will already have disabled the PCI ATS capability. Thus at this
* moment we can expect that the device will not generate ATS queries
* and so we don't care about the sequencing of EATS. The purpose of
* EATS_TRANS is to protect the system from hostile untrusted devices
* that issue ATS when the PCI config space is disabled. However, if
* EATS_TRANS is being changed, then we must have already trusted the
* device as the EATS_TRANS security block is being disabled.
*
* Note: now the EATS_TRANS update is moved to the first entry_set().
* Changing S2S and EATS might transiently result in S2S=1 and EATS=1
* which is a bad STE (see "5.2 Stream Table Entry"). In such a case,
* we can't do a hitless update. Also, it should not be added to the
* safe bits with STRTAB_STE_1_EATS_S1CHK, because EATS=0b11 would be
* effectively an errant 0b00 configuration.
*/
if (!((cur[1] | target[1]) & cpu_to_le64(eats_s1chk)) &&
!((cur[2] | target[2]) & cpu_to_le64(STRTAB_STE_2_S2S)))
safe_bits[1] |= cpu_to_le64(eats_trans);
/*
* MEV does not meaningfully impact the operation of the HW, it only
* changes how many fault events are generated, thus we can relax it
* when computing the ordering. The spec notes the device can act like
* MEV=1 anyhow:
*
* Note: Software must expect, and be able to deal with, coalesced
* fault records even when MEV == 0.
*/
safe_bits[1] |= cpu_to_le64(STRTAB_STE_1_MEV);
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_update_safe);
/*
* Figure out if we can do a hitless update of entry to become target. Returns a
* bit mask where 1 indicates that qword needs to be set disruptively.
@ -1094,13 +1148,22 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
{
__le64 target_used[NUM_ENTRY_QWORDS] = {};
__le64 cur_used[NUM_ENTRY_QWORDS] = {};
__le64 safe[NUM_ENTRY_QWORDS] = {};
u8 used_qword_diff = 0;
unsigned int i;
writer->ops->get_used(entry, cur_used);
writer->ops->get_used(target, target_used);
if (writer->ops->get_update_safe)
writer->ops->get_update_safe(entry, target, safe);
for (i = 0; i != NUM_ENTRY_QWORDS; i++) {
/*
* Safe is only used for bits that are used by both entries,
* otherwise it is sequenced according to the unused entry.
*/
safe[i] &= target_used[i] & cur_used[i];
/*
* Check that masks are up to date, the make functions are not
* allowed to set a bit to 1 if the used function doesn't say it
@ -1109,6 +1172,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
WARN_ON_ONCE(target[i] & ~target_used[i]);
/* Bits can change because they are not currently being used */
cur_used[i] &= ~safe[i];
unused_update[i] = (entry[i] & cur_used[i]) |
(target[i] & ~cur_used[i]);
/*
@ -1121,7 +1185,7 @@ static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer,
return used_qword_diff;
}
static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
static void entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
const __le64 *target, unsigned int start,
unsigned int len)
{
@ -1137,7 +1201,6 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry,
if (changed)
writer->ops->sync(writer);
return changed;
}
/*
@ -1207,12 +1270,9 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry,
entry_set(writer, entry, target, 0, 1);
} else {
/*
* No inuse bit changed. Sanity check that all unused bits are 0
* in the entry. The target was already sanity checked by
* compute_qword_diff().
* No inuse bit changed, though safe bits may have changed.
*/
WARN_ON_ONCE(
entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS));
entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS);
}
}
EXPORT_SYMBOL_IF_KUNIT(arm_smmu_write_entry);
@ -1543,6 +1603,7 @@ static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer)
static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = {
.sync = arm_smmu_ste_writer_sync_entry,
.get_used = arm_smmu_get_ste_used,
.get_update_safe = arm_smmu_get_ste_update_safe,
};
static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid,
@ -2551,7 +2612,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
ARM_SMMU_FEAT_VAX) ? 52 : 48;
pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS);
pgtbl_cfg.oas = smmu->ias;
pgtbl_cfg.oas = smmu->oas;
if (enable_dirty)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD;
fmt = ARM_64_LPAE_S1;
@ -2561,7 +2622,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain,
case ARM_SMMU_DOMAIN_S2:
if (enable_dirty)
return -EOPNOTSUPP;
pgtbl_cfg.ias = smmu->ias;
pgtbl_cfg.ias = smmu->oas;
pgtbl_cfg.oas = smmu->oas;
fmt = ARM_64_LPAE_S2;
finalise_stage_fn = arm_smmu_domain_finalise_s2;
@ -3125,7 +3186,8 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master,
struct arm_smmu_domain *smmu_domain, ioasid_t pasid,
struct arm_smmu_cd *cd, struct iommu_domain *old)
{
struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev);
struct iommu_domain *sid_domain =
iommu_driver_get_domain_for_dev(master->dev);
struct arm_smmu_attach_state state = {
.master = master,
.ssid = pasid,
@ -3191,7 +3253,7 @@ static int arm_smmu_blocking_set_dev_pasid(struct iommu_domain *new_domain,
*/
if (!arm_smmu_ssids_in_use(&master->cd_table)) {
struct iommu_domain *sid_domain =
iommu_get_domain_for_dev(master->dev);
iommu_driver_get_domain_for_dev(master->dev);
if (sid_domain->type == IOMMU_DOMAIN_IDENTITY ||
sid_domain->type == IOMMU_DOMAIN_BLOCKED)
@ -4395,13 +4457,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
}
/* We only support the AArch64 table format at present */
switch (FIELD_GET(IDR0_TTF, reg)) {
case IDR0_TTF_AARCH32_64:
smmu->ias = 40;
fallthrough;
case IDR0_TTF_AARCH64:
break;
default:
if (!(FIELD_GET(IDR0_TTF, reg) & IDR0_TTF_AARCH64)) {
dev_err(smmu->dev, "AArch64 table format not supported!\n");
return -ENXIO;
}
@ -4514,8 +4570,6 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
dev_warn(smmu->dev,
"failed to set DMA mask for table walker\n");
smmu->ias = max(smmu->ias, smmu->oas);
if ((smmu->features & ARM_SMMU_FEAT_TRANS_S1) &&
(smmu->features & ARM_SMMU_FEAT_TRANS_S2))
smmu->features |= ARM_SMMU_FEAT_NESTING;
@ -4525,11 +4579,40 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
if (arm_smmu_sva_supported(smmu))
smmu->features |= ARM_SMMU_FEAT_SVA;
dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n",
smmu->ias, smmu->oas, smmu->features);
dev_info(smmu->dev, "oas %lu-bit (features 0x%08x)\n",
smmu->oas, smmu->features);
return 0;
}
#ifdef CONFIG_TEGRA241_CMDQV
static void tegra_cmdqv_dt_probe(struct device_node *smmu_node,
struct arm_smmu_device *smmu)
{
struct platform_device *pdev;
struct device_node *np;
np = of_parse_phandle(smmu_node, "nvidia,cmdqv", 0);
if (!np)
return;
/* Tegra241 CMDQV driver is responsible for put_device() */
pdev = of_find_device_by_node(np);
of_node_put(np);
if (!pdev)
return;
smmu->impl_dev = &pdev->dev;
smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV;
dev_dbg(smmu->dev, "found companion CMDQV device: %s\n",
dev_name(smmu->impl_dev));
}
#else
static void tegra_cmdqv_dt_probe(struct device_node *smmu_node,
struct arm_smmu_device *smmu)
{
}
#endif
#ifdef CONFIG_ACPI
#ifdef CONFIG_TEGRA241_CMDQV
static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node,
@ -4542,10 +4625,11 @@ static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node,
adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1);
if (adev) {
/* Tegra241 CMDQV driver is responsible for put_device() */
smmu->impl_dev = &adev->dev;
smmu->impl_dev = get_device(acpi_get_first_physical_node(adev));
smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV;
dev_info(smmu->dev, "found companion CMDQV device: %s\n",
dev_name(smmu->impl_dev));
acpi_dev_put(adev);
}
kfree(uid);
}
@ -4634,6 +4718,9 @@ static int arm_smmu_device_dt_probe(struct platform_device *pdev,
if (of_dma_is_coherent(dev->of_node))
smmu->features |= ARM_SMMU_FEAT_COHERENCY;
if (of_device_is_compatible(dev->of_node, "nvidia,tegra264-smmu"))
tegra_cmdqv_dt_probe(dev->of_node, smmu);
return ret;
}


@ -43,7 +43,6 @@ struct arm_vsmmu;
#define IDR0_COHACC (1 << 4)
#define IDR0_TTF GENMASK(3, 2)
#define IDR0_TTF_AARCH64 2
#define IDR0_TTF_AARCH32_64 3
#define IDR0_S1P (1 << 1)
#define IDR0_S2P (1 << 0)
@ -784,7 +783,6 @@ struct arm_smmu_device {
int gerr_irq;
int combined_irq;
unsigned long ias; /* IPA */
unsigned long oas; /* PA */
unsigned long pgsize_bitmap;
@ -900,6 +898,8 @@ struct arm_smmu_entry_writer {
struct arm_smmu_entry_writer_ops {
void (*get_used)(const __le64 *entry, __le64 *used);
void (*get_update_safe)(const __le64 *cur, const __le64 *target,
__le64 *safe_bits);
void (*sync)(struct arm_smmu_entry_writer *writer);
};
@ -911,6 +911,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target,
#if IS_ENABLED(CONFIG_KUNIT)
void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits);
void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target,
__le64 *safe_bits);
void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur,
const __le64 *target);
void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);


@ -3,17 +3,15 @@
#define dev_fmt(fmt) "tegra241_cmdqv: " fmt
#include <linux/acpi.h>
#include <linux/debugfs.h>
#include <linux/dma-mapping.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/iopoll.h>
#include <linux/platform_device.h>
#include <uapi/linux/iommufd.h>
#include <acpi/acpixf.h>
#include "arm-smmu-v3.h"
/* CMDQV register page base and size defines */
@ -854,69 +852,6 @@ static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = {
/* Probe Functions */
static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data)
{
struct resource_win win;
return !acpi_dev_resource_address_space(res, &win);
}
static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data)
{
struct resource r;
int *irq = data;
if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r))
*irq = r.start;
return 1; /* No need to add resource to the list */
}
static struct resource *
tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq)
{
struct acpi_device *adev = to_acpi_device(dev);
struct list_head resource_list;
struct resource_entry *rentry;
struct resource *res = NULL;
int ret;
INIT_LIST_HEAD(&resource_list);
ret = acpi_dev_get_resources(adev, &resource_list,
tegra241_cmdqv_acpi_is_memory, NULL);
if (ret < 0) {
dev_err(dev, "failed to get memory resource: %d\n", ret);
return NULL;
}
rentry = list_first_entry_or_null(&resource_list,
struct resource_entry, node);
if (!rentry) {
dev_err(dev, "failed to get memory resource entry\n");
goto free_list;
}
/* Caller must free the res */
res = kzalloc(sizeof(*res), GFP_KERNEL);
if (!res)
goto free_list;
*res = *rentry->res;
acpi_dev_free_resource_list(&resource_list);
INIT_LIST_HEAD(&resource_list);
if (irq)
ret = acpi_dev_get_resources(adev, &resource_list,
tegra241_cmdqv_acpi_get_irqs, irq);
if (ret < 0 || !irq || *irq <= 0)
dev_warn(dev, "no interrupt. errors will not be reported\n");
free_list:
acpi_dev_free_resource_list(&resource_list);
return res;
}
static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu)
{
struct tegra241_cmdqv *cmdqv =
@ -1042,18 +977,23 @@ iounmap:
struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu)
{
struct platform_device *pdev = to_platform_device(smmu->impl_dev);
struct arm_smmu_device *new_smmu;
struct resource *res = NULL;
struct resource *res;
int irq;
if (!smmu->dev->of_node)
res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq);
if (!res)
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res) {
dev_err(&pdev->dev, "no memory resource found for CMDQV\n");
goto out_fallback;
}
irq = platform_get_irq_optional(pdev, 0);
if (irq <= 0)
dev_warn(&pdev->dev,
"no interrupt. errors will not be reported\n");
new_smmu = __tegra241_cmdqv_probe(smmu, res, irq);
kfree(res);
if (new_smmu)
return new_smmu;

View file

@ -41,12 +41,40 @@ static const struct of_device_id qcom_smmu_actlr_client_of_match[] = {
.data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
{ .compatible = "qcom,fastrpc",
.data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) },
{ .compatible = "qcom,qcm2290-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sa8775p-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ .compatible = "qcom,sc7280-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sc7280-venus",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sc8180x-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sc8280xp-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm6115-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm6125-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm6350-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm8150-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm8250-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm8350-mdss",
.data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) },
{ .compatible = "qcom,sm8450-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ .compatible = "qcom,sm8550-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ .compatible = "qcom,sm8650-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ .compatible = "qcom,sm8750-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ .compatible = "qcom,x1e80100-mdss",
.data = (const void *) (PREFETCH_DEFAULT | CMTLB) },
{ }
};


@ -761,14 +761,10 @@ static struct platform_driver qcom_iommu_ctx_driver = {
static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu)
{
struct device_node *child;
for_each_child_of_node(qcom_iommu->dev->of_node, child) {
for_each_child_of_node_scoped(qcom_iommu->dev->of_node, child) {
if (of_device_is_compatible(child, "qcom,msm-iommu-v1-sec") ||
of_device_is_compatible(child, "qcom,msm-iommu-v2-sec")) {
of_node_put(child);
of_device_is_compatible(child, "qcom,msm-iommu-v2-sec"))
return true;
}
}
return false;

View file

@ -2097,10 +2097,8 @@ void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
}
EXPORT_SYMBOL_GPL(dma_iova_destroy);
void iommu_setup_dma_ops(struct device *dev)
void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain)
{
struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
if (dev_is_pci(dev))
dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac;

View file

@ -9,7 +9,7 @@
#ifdef CONFIG_IOMMU_DMA
void iommu_setup_dma_ops(struct device *dev);
void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain);
int iommu_get_dma_cookie(struct iommu_domain *domain);
void iommu_put_dma_cookie(struct iommu_domain *domain);
@ -26,7 +26,8 @@ extern bool iommu_dma_forcedac;
#else /* CONFIG_IOMMU_DMA */
static inline void iommu_setup_dma_ops(struct device *dev)
static inline void iommu_setup_dma_ops(struct device *dev,
struct iommu_domain *domain)
{
}

View file

@ -354,7 +354,8 @@ static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
* Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
* control this. For now if the tables use sme_set then so do the ptes.
*/
if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) &&
!(iommu_prot & IOMMU_MMIO))
pte = __sme_set(pte);
attrs->descriptor_bits = pte;

View file

@ -227,7 +227,8 @@ static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
* Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
* control this. For now if the tables use sme_set then so do the ptes.
*/
if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) &&
!(iommu_prot & IOMMU_MMIO))
pte = __sme_set(pte);
attrs->descriptor_bits = pte;

View file

@ -58,10 +58,9 @@ static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
* Note that the sync frees the gather's free list, so we must
* not have any pages on that list that are covered by iova/len
*/
} else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
}
iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
}

View file

@ -363,6 +363,13 @@ static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16
qi_batch_increment_index(iommu, batch);
}
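/*
 * Domains backed by first-stage paging, as well as SVA and nested domains,
 * are invalidated with PASID-granular IOTLB (P-IOTLB) descriptors; all other
 * domains fall back to the regular IOTLB invalidation below.
 */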
static bool intel_domain_use_piotlb(struct dmar_domain *domain)
{
return domain->domain.type == IOMMU_DOMAIN_SVA ||
domain->domain.type == IOMMU_DOMAIN_NESTED ||
intel_domain_is_fs_paging(domain);
}
static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag,
unsigned long addr, unsigned long pages,
unsigned long mask, int ih)
@ -370,7 +377,7 @@ static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *
struct intel_iommu *iommu = tag->iommu;
u64 type = DMA_TLB_PSI_FLUSH;
if (intel_domain_is_fs_paging(domain)) {
if (intel_domain_use_piotlb(domain)) {
qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr,
pages, ih, domain->qi_batch);
return;

View file

@ -1240,22 +1240,22 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8
}
did = context_domain_id(context);
context_clear_entry(context);
context_clear_present(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
spin_unlock(&iommu->lock);
intel_context_flush_no_pasid(info, context, did);
context_clear_entry(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
}
int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
ioasid_t pasid, u16 did, phys_addr_t fsptptr,
int flags, struct iommu_domain *old)
{
if (!old)
return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
did, flags);
return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
iommu_domain_did(old, iommu),
flags);
if (old)
intel_pasid_tear_down_entry(iommu, dev, pasid, false);
return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags);
}
static int domain_setup_second_level(struct intel_iommu *iommu,
@ -1263,23 +1263,20 @@ static int domain_setup_second_level(struct intel_iommu *iommu,
struct device *dev, ioasid_t pasid,
struct iommu_domain *old)
{
if (!old)
return intel_pasid_setup_second_level(iommu, domain,
dev, pasid);
return intel_pasid_replace_second_level(iommu, domain, dev,
iommu_domain_did(old, iommu),
pasid);
if (old)
intel_pasid_tear_down_entry(iommu, dev, pasid, false);
return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
}
static int domain_setup_passthrough(struct intel_iommu *iommu,
struct device *dev, ioasid_t pasid,
struct iommu_domain *old)
{
if (!old)
return intel_pasid_setup_pass_through(iommu, dev, pasid);
return intel_pasid_replace_pass_through(iommu, dev,
iommu_domain_did(old, iommu),
pasid);
if (old)
intel_pasid_tear_down_entry(iommu, dev, pasid, false);
return intel_pasid_setup_pass_through(iommu, dev, pasid);
}
static int domain_setup_first_level(struct intel_iommu *iommu,

View file

@ -900,7 +900,26 @@ static inline int pfn_level_offset(u64 pfn, int level)
static inline void context_set_present(struct context_entry *context)
{
context->lo |= 1;
u64 val;
dma_wmb();
val = READ_ONCE(context->lo) | 1;
WRITE_ONCE(context->lo, val);
}
/*
* Clear the Present (P) bit (bit 0) of a context table entry. This initiates
* the transition of the entry's ownership from hardware to software. The
* caller is responsible for fulfilling the invalidation handshake recommended
* by the VT-d spec, Section 6.5.3.3 (Guidance to Software for Invalidations).
*/
static inline void context_clear_present(struct context_entry *context)
{
u64 val;
val = READ_ONCE(context->lo) & GENMASK_ULL(63, 1);
WRITE_ONCE(context->lo, val);
dma_wmb();
}
static inline void context_set_fault_enable(struct context_entry *context)

View file

@ -136,11 +136,10 @@ static int domain_setup_nested(struct intel_iommu *iommu,
struct device *dev, ioasid_t pasid,
struct iommu_domain *old)
{
if (!old)
return intel_pasid_setup_nested(iommu, dev, pasid, domain);
return intel_pasid_replace_nested(iommu, dev, pasid,
iommu_domain_did(old, iommu),
domain);
if (old)
intel_pasid_tear_down_entry(iommu, dev, pasid, false);
return intel_pasid_setup_nested(iommu, dev, pasid, domain);
}
static int intel_nested_set_dev_pasid(struct iommu_domain *domain,

View file

@ -153,6 +153,9 @@ retry:
if (!entries)
return NULL;
if (!ecap_coherent(info->iommu->ecap))
clflush_cache_range(entries, VTD_PAGE_SIZE);
/*
* The pasid directory table entry won't be freed after
* allocation. No worry about the race with free and
@ -165,10 +168,8 @@ retry:
iommu_free_pages(entries);
goto retry;
}
if (!ecap_coherent(info->iommu->ecap)) {
clflush_cache_range(entries, VTD_PAGE_SIZE);
if (!ecap_coherent(info->iommu->ecap))
clflush_cache_range(&dir[dir_index].val, sizeof(*dir));
}
}
return &entries[index];
@ -218,7 +219,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
if (!info || !info->ats_enabled)
return;
if (pci_dev_is_disconnected(to_pci_dev(dev)))
if (!pci_device_is_present(to_pci_dev(dev)))
return;
sid = PCI_DEVID(info->bus, info->devfn);
@ -272,7 +273,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
did = pasid_get_domain_id(pte);
pgtt = pasid_pte_get_pgtt(pte);
intel_pasid_clear_entry(dev, pasid, fault_ignore);
pasid_clear_present(pte);
spin_unlock(&iommu->lock);
if (!ecap_coherent(iommu->ecap))
@ -286,6 +287,10 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev,
iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
devtlb_invalidation_with_pasid(iommu, dev, pasid);
intel_pasid_clear_entry(dev, pasid, fault_ignore);
if (!ecap_coherent(iommu->ecap))
clflush_cache_range(pte, sizeof(*pte));
if (!fault_ignore)
intel_iommu_drain_pasid_prq(dev, pasid);
}
@ -412,50 +417,6 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev,
return 0;
}
int intel_pasid_replace_first_level(struct intel_iommu *iommu,
struct device *dev, phys_addr_t fsptptr,
u32 pasid, u16 did, u16 old_did,
int flags)
{
struct pasid_entry *pte, new_pte;
if (!ecap_flts(iommu->ecap)) {
pr_err("No first level translation support on %s\n",
iommu->name);
return -EINVAL;
}
if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) {
pr_err("No 5-level paging support for first-level on %s\n",
iommu->name);
return -EINVAL;
}
pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (!pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EINVAL;
}
WARN_ON(old_did != pasid_get_domain_id(pte));
*pte = new_pte;
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
/*
* Set up the scalable mode pasid entry for second only translation type.
*/
@ -522,51 +483,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
return 0;
}
int intel_pasid_replace_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u16 old_did,
u32 pasid)
{
struct pasid_entry *pte, new_pte;
u16 did;
/*
* If hardware advertises no support for second level
* translation, return directly.
*/
if (!ecap_slts(iommu->ecap)) {
pr_err("No second level translation support on %s\n",
iommu->name);
return -EINVAL;
}
did = domain_id_iommu(domain, iommu);
pasid_pte_config_second_level(iommu, &new_pte, domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (!pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EINVAL;
}
WARN_ON(old_did != pasid_get_domain_id(pte));
*pte = new_pte;
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
/*
* Set up dirty tracking on a second only or nested translation type.
*/
@ -679,38 +595,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
return 0;
}
int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
struct device *dev, u16 old_did,
u32 pasid)
{
struct pasid_entry *pte, new_pte;
u16 did = FLPT_DEFAULT_DID;
pasid_pte_config_pass_through(iommu, &new_pte, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (!pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EINVAL;
}
WARN_ON(old_did != pasid_get_domain_id(pte));
*pte = new_pte;
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
/*
* Set the page snoop control for a pasid entry which has been set up.
*/
@ -844,69 +728,6 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
return 0;
}
int intel_pasid_replace_nested(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
u16 old_did, struct dmar_domain *domain)
{
struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
struct dmar_domain *s2_domain = domain->s2_domain;
u16 did = domain_id_iommu(domain, iommu);
struct pasid_entry *pte, new_pte;
/* Address width should match the address width supported by hardware */
switch (s1_cfg->addr_width) {
case ADDR_WIDTH_4LEVEL:
break;
case ADDR_WIDTH_5LEVEL:
if (!cap_fl5lp_support(iommu->cap)) {
dev_err_ratelimited(dev,
"5-level paging not supported\n");
return -EINVAL;
}
break;
default:
dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
s1_cfg->addr_width);
return -EINVAL;
}
if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
pr_err_ratelimited("No supervisor request support on %s\n",
iommu->name);
return -EINVAL;
}
if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
pr_err_ratelimited("No extended access flag support on %s\n",
iommu->name);
return -EINVAL;
}
pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did);
spin_lock(&iommu->lock);
pte = intel_pasid_get_entry(dev, pasid);
if (!pte) {
spin_unlock(&iommu->lock);
return -ENODEV;
}
if (!pasid_pte_is_present(pte)) {
spin_unlock(&iommu->lock);
return -EINVAL;
}
WARN_ON(old_did != pasid_get_domain_id(pte));
*pte = new_pte;
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
intel_iommu_drain_pasid_prq(dev, pasid);
return 0;
}
/*
* Interfaces to setup or teardown a pasid table to the scalable-mode
* context table entry:
@ -1019,7 +840,7 @@ static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn)
}
if (context_copied(iommu, bus, devfn)) {
context_clear_entry(context);
context_clear_present(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
/*
@ -1039,6 +860,9 @@ static int device_pasid_table_setup(struct device *dev, u8 bus, u8 devfn)
iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID);
context_clear_entry(context);
__iommu_flush_cache(iommu, context, sizeof(*context));
/*
* At this point, the device is supposed to finish reset at
* its driver probe stage, so no in-flight DMA will exist,
@ -1102,6 +926,14 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info)
if (!info->ats_enabled)
return;
/*
* Skip dev-IOTLB flush for inaccessible PCIe devices to prevent the
* Intel IOMMU from waiting indefinitely for an ATS invalidation that
* cannot complete.
*/
if (!pci_device_is_present(to_pci_dev(info->dev)))
return;
qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn),
info->pfsid, info->ats_qdep, 0, MAX_AGAW_PFN_WIDTH);

View file

@ -234,9 +234,23 @@ static inline void pasid_set_wpe(struct pasid_entry *pe)
*/
static inline void pasid_set_present(struct pasid_entry *pe)
{
dma_wmb();
pasid_set_bits(&pe->val[0], 1 << 0, 1);
}
/*
* Clear the Present (P) bit (bit 0) of a scalable-mode PASID table entry.
* This initiates the transition of the entry's ownership from hardware
* to software. The caller is responsible for fulfilling the invalidation
* handshake recommended by the VT-d spec, Section 6.5.3.3 (Guidance to
* Software for Invalidations).
*/
static inline void pasid_clear_present(struct pasid_entry *pe)
{
pasid_set_bits(&pe->val[0], 1 << 0, 0);
dma_wmb();
}
/*
* Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
* entry.
@ -302,20 +316,6 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct device *dev, u32 pasid);
int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
u32 pasid, struct dmar_domain *domain);
int intel_pasid_replace_first_level(struct intel_iommu *iommu,
struct device *dev, phys_addr_t fsptptr,
u32 pasid, u16 did, u16 old_did, int flags);
int intel_pasid_replace_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u16 old_did,
u32 pasid);
int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
struct device *dev, u16 old_did,
u32 pasid);
int intel_pasid_replace_nested(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
u16 old_did, struct dmar_domain *domain);
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);

View file

@ -0,0 +1,164 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2025 - Google Inc
* Author: Mostafa Saleh <smostafa@google.com>
* IOMMU API debug page alloc sanitizer
*/
#include <linux/atomic.h>
#include <linux/iommu.h>
#include <linux/iommu-debug-pagealloc.h>
#include <linux/kernel.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
#include "iommu-priv.h"
static bool needed;
DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized);
struct iommu_debug_metadata {
atomic_t ref;
};
static __init bool need_iommu_debug(void)
{
return needed;
}
struct page_ext_operations page_iommu_debug_ops = {
.size = sizeof(struct iommu_debug_metadata),
.need = need_iommu_debug,
};
static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext)
{
return page_ext_data(page_ext, &page_iommu_debug_ops);
}
static void iommu_debug_inc_page(phys_addr_t phys)
{
struct page_ext *page_ext = page_ext_from_phys(phys);
struct iommu_debug_metadata *d;
if (!page_ext)
return;
d = get_iommu_data(page_ext);
WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0);
page_ext_put(page_ext);
}
static void iommu_debug_dec_page(phys_addr_t phys)
{
struct page_ext *page_ext = page_ext_from_phys(phys);
struct iommu_debug_metadata *d;
if (!page_ext)
return;
d = get_iommu_data(page_ext);
WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0);
page_ext_put(page_ext);
}
/*
* IOMMU page size doesn't have to match the CPU page size. So, we use
* the smallest IOMMU page size to refcount the pages in the vmemmap.
 * That is important as both map and unmap have to use the same page size
 * to update the refcount to avoid double counting the same page.
 * And as we can't know from iommu_unmap() which page size was originally
 * used for the map, we just use the minimum supported one for both.
*/
static size_t iommu_debug_page_size(struct iommu_domain *domain)
{
return 1UL << __ffs(domain->pgsize_bitmap);
}
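/*
 * Illustration (hypothetical values): for pgsize_bitmap = SZ_4K | SZ_2M this
 * returns SZ_4K, so a 2 MiB mapping is accounted as SZ_2M / SZ_4K = 512
 * refcount updates on both the map and the unmap side, whatever page size
 * the driver actually used.
 */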
static bool iommu_debug_page_count(const struct page *page)
{
unsigned int ref;
struct page_ext *page_ext = page_ext_get(page);
struct iommu_debug_metadata *d = get_iommu_data(page_ext);
ref = atomic_read(&d->ref);
page_ext_put(page_ext);
return ref != 0;
}
void __iommu_debug_check_unmapped(const struct page *page, int numpages)
{
while (numpages--) {
if (WARN_ON(iommu_debug_page_count(page))) {
pr_warn("iommu: Detected page leak!\n");
dump_page_owner(page);
}
page++;
}
}
void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size)
{
size_t off, end;
size_t page_size = iommu_debug_page_size(domain);
if (WARN_ON(!phys || check_add_overflow(phys, size, &end)))
return;
for (off = 0 ; off < size ; off += page_size)
iommu_debug_inc_page(phys + off);
}
static void __iommu_debug_update_iova(struct iommu_domain *domain,
unsigned long iova, size_t size, bool inc)
{
size_t off, end;
size_t page_size = iommu_debug_page_size(domain);
if (WARN_ON(check_add_overflow(iova, size, &end)))
return;
for (off = 0 ; off < size ; off += page_size) {
phys_addr_t phys = iommu_iova_to_phys(domain, iova + off);
if (!phys)
continue;
if (inc)
iommu_debug_inc_page(phys);
else
iommu_debug_dec_page(phys);
}
}
void __iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
__iommu_debug_update_iova(domain, iova, size, false);
}
void __iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size,
size_t unmapped)
{
if ((unmapped == size) || WARN_ON_ONCE(unmapped > size))
return;
/* If unmap failed, re-increment the refcount. */
__iommu_debug_update_iova(domain, iova + unmapped,
size - unmapped, true);
}
void iommu_debug_init(void)
{
if (!needed)
return;
pr_info("iommu: Debugging page allocations, expect overhead (disable with iommu.debug_pagealloc=0)\n");
static_branch_enable(&iommu_debug_initialized);
}
static int __init iommu_debug_pagealloc(char *str)
{
return kstrtobool(str, &needed);
}
early_param("iommu.debug_pagealloc", iommu_debug_pagealloc);
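/*
 * Rough sketch of the bug class this sanitizer targets. The function below is
 * hypothetical driver code, not part of this patch; it assumes
 * CONFIG_DEBUG_PAGEALLOC, CONFIG_IOMMU_DEBUG_PAGEALLOC and
 * iommu.debug_pagealloc=1 are all enabled.
 */
static int demo_dma_use_after_free(struct iommu_domain *domain)
{
	struct page *page = alloc_page(GFP_KERNEL);
	int ret;

	if (!page)
		return -ENOMEM;

	ret = iommu_map(domain, 0x100000, page_to_phys(page), PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
	if (ret) {
		__free_page(page);
		return ret;
	}

	/*
	 * BUG: the page is freed while still mapped. The free path calls
	 * debug_pagealloc_unmap_pages() -> iommu_debug_check_unmapped(),
	 * which warns "Detected page leak!" and dumps the page owner.
	 */
	__free_page(page);
	return 0;
}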

View file

@ -5,6 +5,7 @@
#define __LINUX_IOMMU_PRIV_H
#include <linux/iommu.h>
#include <linux/iommu-debug-pagealloc.h>
#include <linux/msi.h>
static inline const struct iommu_ops *dev_iommu_ops(struct device *dev)
@ -65,4 +66,61 @@ static inline int iommufd_sw_msi(struct iommu_domain *domain,
int iommu_replace_device_pasid(struct iommu_domain *domain,
struct device *dev, ioasid_t pasid,
struct iommu_attach_handle *handle);
#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys,
size_t size);
void __iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size);
void __iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size, size_t unmapped);
static inline void iommu_debug_map(struct iommu_domain *domain,
phys_addr_t phys, size_t size)
{
if (static_branch_unlikely(&iommu_debug_initialized))
__iommu_debug_map(domain, phys, size);
}
static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
if (static_branch_unlikely(&iommu_debug_initialized))
__iommu_debug_unmap_begin(domain, iova, size);
}
static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size,
size_t unmapped)
{
if (static_branch_unlikely(&iommu_debug_initialized))
__iommu_debug_unmap_end(domain, iova, size, unmapped);
}
void iommu_debug_init(void);
#else
static inline void iommu_debug_map(struct iommu_domain *domain,
phys_addr_t phys, size_t size)
{
}
static inline void iommu_debug_unmap_begin(struct iommu_domain *domain,
unsigned long iova, size_t size)
{
}
static inline void iommu_debug_unmap_end(struct iommu_domain *domain,
unsigned long iova, size_t size,
size_t unmapped)
{
}
static inline void iommu_debug_init(void)
{
}
#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
#endif /* __LINUX_IOMMU_PRIV_H */

View file

@ -61,6 +61,11 @@ struct iommu_group {
int id;
struct iommu_domain *default_domain;
struct iommu_domain *blocking_domain;
/*
* During a group device reset, @resetting_domain points to the physical
* domain, while @domain points to the attached domain before the reset.
*/
struct iommu_domain *resetting_domain;
struct iommu_domain *domain;
struct list_head entry;
unsigned int owner_cnt;
@ -232,6 +237,8 @@ static int __init iommu_subsys_init(void)
if (!nb)
return -ENOMEM;
iommu_debug_init();
for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) {
nb[i].notifier_call = iommu_bus_notifier;
bus_register_notifier(iommu_buses[i], &nb[i]);
@ -661,7 +668,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
}
if (group->default_domain)
iommu_setup_dma_ops(dev);
iommu_setup_dma_ops(dev, group->default_domain);
mutex_unlock(&group->mutex);
@ -1173,12 +1180,11 @@ static int iommu_create_device_direct_mappings(struct iommu_domain *domain,
struct device *dev)
{
struct iommu_resv_region *entry;
struct list_head mappings;
LIST_HEAD(mappings);
unsigned long pg_size;
int ret = 0;
pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0;
INIT_LIST_HEAD(&mappings);
if (WARN_ON_ONCE(iommu_is_dma_domain(domain) && !pg_size))
return -EINVAL;
@ -1949,7 +1955,7 @@ static int bus_iommu_probe(const struct bus_type *bus)
return ret;
}
for_each_group_device(group, gdev)
iommu_setup_dma_ops(gdev->dev);
iommu_setup_dma_ops(gdev->dev, group->default_domain);
mutex_unlock(&group->mutex);
/*
@ -2185,10 +2191,26 @@ EXPORT_SYMBOL_GPL(iommu_attach_device);
int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain)
{
if (dev->iommu && dev->iommu->attach_deferred)
return __iommu_attach_device(domain, dev, NULL);
/*
* This is called on the dma mapping fast path so avoid locking. This is
 * racy, but we have an expectation that the driver will set up its DMAs
* inside probe while being single threaded to avoid racing.
*/
if (!dev->iommu || !dev->iommu->attach_deferred)
return 0;
return 0;
guard(mutex)(&dev->iommu_group->mutex);
/*
* This is a concurrent attach during a device reset. Reject it until
* pci_dev_reset_iommu_done() attaches the device to group->domain.
*
* Note that this might fail the iommu_dma_map(). But there's nothing
* more we can do here.
*/
if (dev->iommu_group->resetting_domain)
return -EBUSY;
return __iommu_attach_device(domain, dev, NULL);
}
void iommu_detach_device(struct iommu_domain *domain, struct device *dev)
@ -2210,6 +2232,15 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(iommu_detach_device);
/**
* iommu_get_domain_for_dev() - Return the DMA API domain pointer
* @dev: Device to query
*
* This function can be called within a driver bound to dev. The returned
* pointer is valid for the lifetime of the bound driver.
*
* It should not be called by drivers with driver_managed_dma = true.
*/
struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
{
/* Caller must be a probed driver on dev */
@ -2218,10 +2249,40 @@ struct iommu_domain *iommu_get_domain_for_dev(struct device *dev)
if (!group)
return NULL;
lockdep_assert_not_held(&group->mutex);
return group->domain;
}
EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev);
/**
* iommu_driver_get_domain_for_dev() - Return the driver-level domain pointer
* @dev: Device to query
*
* This function can be called by an iommu driver that wants to get the physical
* domain within an iommu callback function where group->mutex is held.
*/
struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev)
{
struct iommu_group *group = dev->iommu_group;
lockdep_assert_held(&group->mutex);
/*
 * The driver handles the low-level __iommu_attach_device(), including the
 * one invoked by pci_dev_reset_iommu_done() when re-attaching the device to
 * the cached group->domain. In that case, the driver must get the old
 * domain from group->resetting_domain rather than group->domain, so that it
 * does not appear to re-attach the device from group->domain (old) to
 * group->domain (new).
*/
if (group->resetting_domain)
return group->resetting_domain;
return group->domain;
}
EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev);
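/*
 * A minimal driver-side sketch of the intended use. The example_attach_dev()
 * callback and its body are hypothetical, not part of this patch.
 */
static int example_attach_dev(struct iommu_domain *new_domain, struct device *dev)
{
	/*
	 * During pci_dev_reset_iommu_done(), group->domain already names the
	 * domain being restored, so the currently attached (old) domain must
	 * be taken from group->resetting_domain via this helper.
	 */
	struct iommu_domain *old_domain = iommu_driver_get_domain_for_dev(dev);

	if (old_domain == new_domain)
		return 0;

	/* ... tear down the old_domain attachment, then install new_domain ... */
	return 0;
}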
/*
* For IOMMU_DOMAIN_DMA implementations which already provide their own
* guarantees that the group and its default domain are valid and correct.
@ -2374,6 +2435,13 @@ static int __iommu_group_set_domain_internal(struct iommu_group *group,
if (WARN_ON(!new_domain))
return -EINVAL;
/*
* This is a concurrent attach during a device reset. Reject it until
* pci_dev_reset_iommu_done() attaches the device to group->domain.
*/
if (group->resetting_domain)
return -EBUSY;
/*
* Changing the domain is done by calling attach_dev() on the new
* domain. This switch does not have to be atomic and DMA can be
@ -2562,10 +2630,12 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova,
}
/* unroll mapping in case something went wrong */
if (ret)
if (ret) {
iommu_unmap(domain, orig_iova, orig_size - size);
else
} else {
trace_map(orig_iova, orig_paddr, orig_size);
iommu_debug_map(domain, orig_paddr, orig_size);
}
return ret;
}
@ -2627,6 +2697,8 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size);
iommu_debug_unmap_begin(domain, iova, size);
/*
* Keep iterating until we either unmap 'size' bytes (or more)
* or we hit an area that isn't mapped.
@ -2647,6 +2719,7 @@ static size_t __iommu_unmap(struct iommu_domain *domain,
}
trace_unmap(orig_iova, size, unmapped);
iommu_debug_unmap_end(domain, orig_iova, size, unmapped);
return unmapped;
}
@ -3148,7 +3221,7 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
/* Make sure dma_ops is appropriately set */
for_each_group_device(group, gdev)
iommu_setup_dma_ops(gdev->dev);
iommu_setup_dma_ops(gdev->dev, group->default_domain);
out_unlock:
mutex_unlock(&group->mutex);
@ -3492,6 +3565,16 @@ int iommu_attach_device_pasid(struct iommu_domain *domain,
return -EINVAL;
mutex_lock(&group->mutex);
/*
* This is a concurrent attach during a device reset. Reject it until
* pci_dev_reset_iommu_done() attaches the device to group->domain.
*/
if (group->resetting_domain) {
ret = -EBUSY;
goto out_unlock;
}
for_each_group_device(group, device) {
/*
* Skip PASID validation for devices without PASID support
@ -3575,6 +3658,16 @@ int iommu_replace_device_pasid(struct iommu_domain *domain,
return -EINVAL;
mutex_lock(&group->mutex);
/*
* This is a concurrent attach during a device reset. Reject it until
* pci_dev_reset_iommu_done() attaches the device to group->domain.
*/
if (group->resetting_domain) {
ret = -EBUSY;
goto out_unlock;
}
entry = iommu_make_pasid_array_entry(domain, handle);
curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
XA_ZERO_ENTRY, GFP_KERNEL);
@ -3832,6 +3925,127 @@ err_unlock:
}
EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
/**
* pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset
* @pdev: PCI device that is going to enter a reset routine
*
 * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends disabling and blocking
 * ATS before initiating a reset. This means that, while the reset routine runs, a
 * PCIe device wants any IOMMU activity blocked: translation and ATS invalidation.
*
 * This function attaches the device's RID/PASID(s) to the group->blocking_domain
 * and sets group->resetting_domain. This allows the IOMMU driver to pause any
 * IOMMU activity while leaving the group->domain pointer intact. Later, when the
 * reset is finished, pci_dev_reset_iommu_done() restores everything.
*
 * The caller must pair pci_dev_reset_iommu_prepare() with
 * pci_dev_reset_iommu_done() before/after the core-level reset routine, so
 * that resetting_domain gets unset again.
*
* Return: 0 on success or negative error code if the preparation failed.
*
 * These two functions are designed to be used by PCI reset functions that would
 * not invoke any racy iommu_release_device(), since the PCI sysfs node is removed
 * before the BUS_NOTIFY_REMOVED_DEVICE notification is sent. When using them in
 * other cases, callers must ensure there will be no racy iommu_release_device()
 * call, which would otherwise use-after-free the dev->iommu_group pointer.
*/
int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
{
struct iommu_group *group = pdev->dev.iommu_group;
unsigned long pasid;
void *entry;
int ret;
if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
return 0;
guard(mutex)(&group->mutex);
/* Re-entry is not allowed */
if (WARN_ON(group->resetting_domain))
return -EBUSY;
ret = __iommu_group_alloc_blocking_domain(group);
if (ret)
return ret;
/* Stage RID domain at blocking_domain while retaining group->domain */
if (group->domain != group->blocking_domain) {
ret = __iommu_attach_device(group->blocking_domain, &pdev->dev,
group->domain);
if (ret)
return ret;
}
/*
* Stage PASID domains at blocking_domain while retaining pasid_array.
*
* The pasid_array is mostly fenced by group->mutex, except one reader
* in iommu_attach_handle_get(), so it's safe to read without xa_lock.
*/
xa_for_each_start(&group->pasid_array, pasid, entry, 1)
iommu_remove_dev_pasid(&pdev->dev, pasid,
pasid_array_entry_to_domain(entry));
group->resetting_domain = group->blocking_domain;
return ret;
}
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
/**
* pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
* @pdev: PCI device that has finished a reset routine
*
 * After a PCIe device finishes a reset routine, it wants to restore its IOMMU
 * activity, including new translations as well as cache invalidations, by
 * re-attaching all of the device's RIDs/PASIDs back to the domains retained in
 * the core-level structure.
*
* Caller must pair it with a successful pci_dev_reset_iommu_prepare().
*
 * Note that, although unlikely, there is a risk that re-attaching the domains
 * might fail due to an unexpected condition such as an OOM.
*/
void pci_dev_reset_iommu_done(struct pci_dev *pdev)
{
struct iommu_group *group = pdev->dev.iommu_group;
unsigned long pasid;
void *entry;
if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
return;
guard(mutex)(&group->mutex);
/* pci_dev_reset_iommu_prepare() was bypassed for the device */
if (!group->resetting_domain)
return;
/* pci_dev_reset_iommu_prepare() was not successfully called */
if (WARN_ON(!group->blocking_domain))
return;
/* Re-attach RID domain back to group->domain */
if (group->domain != group->blocking_domain) {
WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
group->blocking_domain));
}
/*
* Re-attach PASID domains back to the domains retained in pasid_array.
*
* The pasid_array is mostly fenced by group->mutex, except one reader
* in iommu_attach_handle_get(), so it's safe to read without xa_lock.
*/
xa_for_each_start(&group->pasid_array, pasid, entry, 1)
WARN_ON(__iommu_set_group_pasid(
pasid_array_entry_to_domain(entry), group, pasid,
group->blocking_domain));
group->resetting_domain = NULL;
}
EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
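/*
 * A condensed sketch of the expected prepare/done pairing in a reset path,
 * mirroring the pcie_flr() and pci_af_flr() changes further below. The
 * example_reset() wrapper and trigger_hw_reset() are hypothetical, not part
 * of this patch.
 */
static int example_reset(struct pci_dev *pdev)
{
	int ret;

	/* Call only after waiting for pending DMA transactions. */
	ret = pci_dev_reset_iommu_prepare(pdev);
	if (ret) {
		pci_err(pdev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
		return ret;
	}

	ret = trigger_hw_reset(pdev);	/* device-specific reset, hypothetical */

	/* Restore the IOMMU state even if the reset itself failed. */
	pci_dev_reset_iommu_done(pdev);
	return ret;
}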
#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
/**
* iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain

View file

@ -9,6 +9,7 @@
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/irqdomain.h>
#include <linux/pci.h>
#include <linux/msi.h>
@ -971,6 +972,7 @@ void pci_set_acpi_fwnode(struct pci_dev *dev)
int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
{
acpi_handle handle = ACPI_HANDLE(&dev->dev);
int ret;
if (!handle || !acpi_has_method(handle, "_RST"))
return -ENOTTY;
@ -978,12 +980,19 @@ int pci_dev_acpi_reset(struct pci_dev *dev, bool probe)
if (probe)
return 0;
if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) {
pci_warn(dev, "ACPI _RST failed\n");
return -ENOTTY;
ret = pci_dev_reset_iommu_prepare(dev);
if (ret) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
return ret;
}
return 0;
if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) {
pci_warn(dev, "ACPI _RST failed\n");
ret = -ENOTTY;
}
pci_dev_reset_iommu_done(dev);
return ret;
}
bool acpi_pci_power_manageable(struct pci_dev *dev)

View file

@ -13,6 +13,7 @@
#include <linux/delay.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/msi.h>
#include <linux/of.h>
#include <linux/pci.h>
@ -25,6 +26,7 @@
#include <linux/logic_pio.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci-ats.h>
#include <linux/pci_hotplug.h>
#include <linux/vmalloc.h>
#include <asm/dma.h>
@ -4330,13 +4332,22 @@ EXPORT_SYMBOL(pci_wait_for_pending_transaction);
*/
int pcie_flr(struct pci_dev *dev)
{
int ret;
if (!pci_wait_for_pending_transaction(dev))
pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n");
/* Have to call it after waiting for pending DMA transaction */
ret = pci_dev_reset_iommu_prepare(dev);
if (ret) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
return ret;
}
pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR);
if (dev->imm_ready)
return 0;
goto done;
/*
* Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within
@ -4345,7 +4356,10 @@ int pcie_flr(struct pci_dev *dev)
*/
msleep(100);
return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS);
done:
pci_dev_reset_iommu_done(dev);
return ret;
}
EXPORT_SYMBOL_GPL(pcie_flr);
@ -4373,6 +4387,7 @@ EXPORT_SYMBOL_GPL(pcie_reset_flr);
static int pci_af_flr(struct pci_dev *dev, bool probe)
{
int ret;
int pos;
u8 cap;
@ -4399,10 +4414,17 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
PCI_AF_STATUS_TP << 8))
pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n");
/* Have to call it after waiting for pending DMA transaction */
ret = pci_dev_reset_iommu_prepare(dev);
if (ret) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
return ret;
}
pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR);
if (dev->imm_ready)
return 0;
goto done;
/*
* Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006,
@ -4412,7 +4434,10 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
*/
msleep(100);
return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS);
done:
pci_dev_reset_iommu_done(dev);
return ret;
}
/**
@ -4433,6 +4458,7 @@ static int pci_af_flr(struct pci_dev *dev, bool probe)
static int pci_pm_reset(struct pci_dev *dev, bool probe)
{
u16 csr;
int ret;
if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET)
return -ENOTTY;
@ -4447,6 +4473,12 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
if (dev->current_state != PCI_D0)
return -EINVAL;
ret = pci_dev_reset_iommu_prepare(dev);
if (ret) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
return ret;
}
csr &= ~PCI_PM_CTRL_STATE_MASK;
csr |= PCI_D3hot;
pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
@ -4457,7 +4489,9 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
pci_dev_d3_sleep(dev);
return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
pci_dev_reset_iommu_done(dev);
return ret;
}
/**
@ -4885,10 +4919,20 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe)
return -ENOTTY;
}
rc = pci_dev_reset_iommu_prepare(dev);
if (rc) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc);
return rc;
}
rc = pci_dev_reset_slot_function(dev, probe);
if (rc != -ENOTTY)
return rc;
return pci_parent_bus_reset(dev, probe);
goto done;
rc = pci_parent_bus_reset(dev, probe);
done:
pci_dev_reset_iommu_done(dev);
return rc;
}
static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
@ -4912,6 +4956,12 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
if (rc)
return -ENOTTY;
rc = pci_dev_reset_iommu_prepare(dev);
if (rc) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc);
return rc;
}
if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) {
val = reg;
} else {
@ -4926,6 +4976,7 @@ static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
reg);
pci_dev_reset_iommu_done(dev);
return rc;
}

View file

@ -21,6 +21,7 @@
#include <linux/pci.h>
#include <linux/isa-dma.h> /* isa_dma_bridge_buggy */
#include <linux/init.h>
#include <linux/iommu.h>
#include <linux/delay.h>
#include <linux/acpi.h>
#include <linux/dmi.h>
@ -4228,6 +4229,22 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = {
{ 0 }
};
static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe,
const struct pci_dev_reset_methods *i)
{
int ret;
ret = pci_dev_reset_iommu_prepare(dev);
if (ret) {
pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret);
return ret;
}
ret = i->reset(dev, probe);
pci_dev_reset_iommu_done(dev);
return ret;
}
/*
* These device-specific reset methods are here rather than in a driver
* because when a host assigns a device to a guest VM, the host may need
@ -4242,7 +4259,7 @@ int pci_dev_specific_reset(struct pci_dev *dev, bool probe)
i->vendor == (u16)PCI_ANY_ID) &&
(i->device == dev->device ||
i->device == (u16)PCI_ANY_ID))
return i->reset(dev, probe);
return __pci_dev_specific_reset(dev, probe, i);
}
return -ENOTTY;

View file

@ -0,0 +1,32 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2025 - Google Inc
* Author: Mostafa Saleh <smostafa@google.com>
* IOMMU API debug page alloc sanitizer
*/
#ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H
#define __LINUX_IOMMU_DEBUG_PAGEALLOC_H
#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized);
extern struct page_ext_operations page_iommu_debug_ops;
void __iommu_debug_check_unmapped(const struct page *page, int numpages);
static inline void iommu_debug_check_unmapped(const struct page *page, int numpages)
{
if (static_branch_unlikely(&iommu_debug_initialized))
__iommu_debug_check_unmapped(page, numpages);
}
#else
static inline void iommu_debug_check_unmapped(const struct page *page,
int numpages)
{
}
#endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */
#endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */

View file

@ -910,6 +910,7 @@ extern int iommu_attach_device(struct iommu_domain *domain,
extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev);
extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp);
@ -1187,6 +1188,10 @@ void iommu_detach_device_pasid(struct iommu_domain *domain,
struct device *dev, ioasid_t pasid);
ioasid_t iommu_alloc_global_pasid(struct device *dev);
void iommu_free_global_pasid(ioasid_t pasid);
/* PCI device reset functions */
int pci_dev_reset_iommu_prepare(struct pci_dev *pdev);
void pci_dev_reset_iommu_done(struct pci_dev *pdev);
#else /* CONFIG_IOMMU_API */
struct iommu_ops {};
@ -1510,6 +1515,15 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev)
}
static inline void iommu_free_global_pasid(ioasid_t pasid) {}
static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
{
return 0;
}
static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev)
{
}
#endif /* CONFIG_IOMMU_API */
#ifdef CONFIG_IRQ_MSI_IOMMU

View file

@ -36,6 +36,7 @@
#include <linux/rcuwait.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/iommu-debug-pagealloc.h>
struct mempolicy;
struct anon_vma;
@ -4137,12 +4138,16 @@ extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
iommu_debug_check_unmapped(page, numpages);
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 1);
}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
iommu_debug_check_unmapped(page, numpages);
if (debug_pagealloc_enabled_static())
__kernel_map_pages(page, numpages, 0);
}

View file

@ -93,6 +93,7 @@ static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
#endif
extern struct page_ext *page_ext_get(const struct page *page);
extern struct page_ext *page_ext_from_phys(phys_addr_t phys);
extern void page_ext_put(struct page_ext *page_ext);
extern struct page_ext *page_ext_lookup(unsigned long pfn);
@ -215,6 +216,11 @@ static inline struct page_ext *page_ext_get(const struct page *page)
return NULL;
}
static inline struct page_ext *page_ext_from_phys(phys_addr_t phys)
{
return NULL;
}
static inline void page_ext_put(struct page_ext *page_ext)
{
}

View file

@ -465,16 +465,27 @@ struct iommu_hwpt_arm_smmuv3 {
__aligned_le64 ste[2];
};
/**
* struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data
* (IOMMU_HWPT_DATA_AMD_GUEST)
* @dte: Guest Device Table Entry (DTE)
*/
struct iommu_hwpt_amd_guest {
__aligned_u64 dte[4];
};
/**
* enum iommu_hwpt_data_type - IOMMU HWPT Data Type
* @IOMMU_HWPT_DATA_NONE: no data
* @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
* @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
* @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table
*/
enum iommu_hwpt_data_type {
IOMMU_HWPT_DATA_NONE = 0,
IOMMU_HWPT_DATA_VTD_S1 = 1,
IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
IOMMU_HWPT_DATA_AMD_GUEST = 3,
};
/**
@ -623,6 +634,32 @@ struct iommu_hw_info_tegra241_cmdqv {
__u8 __reserved;
};
/**
* struct iommu_hw_info_amd - AMD IOMMU device info
*
* @efr : Value of AMD IOMMU Extended Feature Register (EFR)
* @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2)
*
 * See the description of these registers in the following sections of
* the AMD I/O Virtualization Technology (IOMMU) Specification.
* (https://docs.amd.com/v/u/en-US/48882_3.10_PUB)
*
* - MMIO Offset 0030h IOMMU Extended Feature Register
* - MMIO Offset 01A0h IOMMU Extended Feature 2 Register
*
 * Note: The EFR and EFR2 are raw values reported by hardware. The VMM is
 * responsible for determining the appropriate flags to expose to the VM,
 * since certain features are not currently supported by the kernel for
 * HW-vIOMMU.
*
 * The current list of VMM-allowed feature flags is:
* - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax]
*/
struct iommu_hw_info_amd {
__aligned_u64 efr;
__aligned_u64 efr2;
};
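/*
 * Rough sketch of the VMM-side filtering described above. The mask values are
 * placeholders to be filled from the allowed list; they are not defined by
 * this UAPI.
 */
static inline void vmm_sanitize_amd_info(struct iommu_hw_info_amd *info)
{
	const __u64 allowed_efr_mask = 0;	/* e.g. GTSup, GASup, GioSup, ... */
	const __u64 allowed_efr2_mask = 0;

	info->efr &= allowed_efr_mask;
	info->efr2 &= allowed_efr2_mask;
}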
/**
* enum iommu_hw_info_type - IOMMU Hardware Info Types
* @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
@ -632,6 +669,7 @@ struct iommu_hw_info_tegra241_cmdqv {
* @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
* @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
* SMMUv3) info type
* @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type
*/
enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_NONE = 0,
@ -639,6 +677,7 @@ enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3,
IOMMU_HW_INFO_TYPE_AMD = 4,
};
/**

View file

@ -964,6 +964,10 @@ struct vfio_device_bind_iommufd {
* hwpt corresponding to the given pt_id.
*
* Return: 0 on success, -errno on failure.
*
 * While a device is resetting, -EBUSY will be returned to reject any concurrent
 * attachment to the resetting device itself or to any sibling device in the same
 * IOMMU group as the resetting device.
*/
struct vfio_device_attach_iommufd_pt {
__u32 argsz;

View file

@ -11,6 +11,7 @@
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
#include <linux/pgalloc_tag.h>
#include <linux/iommu-debug-pagealloc.h>
/*
* struct page extension
@ -89,6 +90,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
#ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
&page_iommu_debug_ops,
#endif
};
unsigned long page_ext_size;
@ -534,6 +538,29 @@ struct page_ext *page_ext_get(const struct page *page)
return page_ext;
}
/**
* page_ext_from_phys() - Get the page_ext structure for a physical address.
* @phys: The physical address to query.
*
* This function safely gets the `struct page_ext` associated with a given
* physical address. It performs validation to ensure the address corresponds
* to a valid, online struct page before attempting to access it.
* It returns NULL for MMIO, ZONE_DEVICE, holes and offline memory.
*
* Return: NULL if no page_ext exists for this physical address.
* Context: Any context. Caller may not sleep until they have called
* page_ext_put().
*/
struct page_ext *page_ext_from_phys(phys_addr_t phys)
{
struct page *page = pfn_to_online_page(__phys_to_pfn(phys));
if (!page)
return NULL;
return page_ext_get(page);
}
/**
* page_ext_put() - Working with page extended information is done.
* @page_ext: Page extended information received from page_ext_get().

View file

@ -56,9 +56,10 @@
#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/firmware.h>
#include <linux/interrupt.h>
#include <linux/fs.h>
#include <linux/i2c.h>
#include <linux/interrupt.h>
#include <linux/io-pgtable.h>
#include <linux/ioport.h>
#include <linux/jiffies.h>
#include <linux/jump_label.h>

rust/kernel/iommu/mod.rs Normal file
View file

@ -0,0 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
//! Rust support related to IOMMU.
pub mod pgtable;

View file

@ -0,0 +1,279 @@
// SPDX-License-Identifier: GPL-2.0
//! IOMMU page table management.
//!
//! C header: [`include/linux/io-pgtable.h`](srctree/include/linux/io-pgtable.h)
use core::{
marker::PhantomData,
ptr::NonNull, //
};
use crate::{
alloc,
bindings,
device::{
Bound,
Device, //
},
devres::Devres,
error::to_result,
io::PhysAddr,
prelude::*, //
};
use bindings::io_pgtable_fmt;
/// Protection flags used with IOMMU mappings.
pub mod prot {
/// Read access.
pub const READ: u32 = bindings::IOMMU_READ;
/// Write access.
pub const WRITE: u32 = bindings::IOMMU_WRITE;
/// Request cache coherency.
pub const CACHE: u32 = bindings::IOMMU_CACHE;
/// Request no-execute permission.
pub const NOEXEC: u32 = bindings::IOMMU_NOEXEC;
/// MMIO peripheral mapping.
pub const MMIO: u32 = bindings::IOMMU_MMIO;
/// Privileged mapping.
pub const PRIVILEGED: u32 = bindings::IOMMU_PRIV;
}
/// Represents a requested `io_pgtable` configuration.
pub struct Config {
/// Quirk bitmask (type-specific).
pub quirks: usize,
/// Valid page sizes, as a bitmask of powers of two.
pub pgsize_bitmap: usize,
/// Input address space size in bits.
pub ias: u32,
/// Output address space size in bits.
pub oas: u32,
/// IOMMU uses coherent accesses for page table walks.
pub coherent_walk: bool,
}
/// An io page table using a specific format.
///
/// # Invariants
///
/// The pointer references a valid io page table.
pub struct IoPageTable<F: IoPageTableFmt> {
ptr: NonNull<bindings::io_pgtable_ops>,
_marker: PhantomData<F>,
}
// SAFETY: `struct io_pgtable_ops` is not restricted to a single thread.
unsafe impl<F: IoPageTableFmt> Send for IoPageTable<F> {}
// SAFETY: `struct io_pgtable_ops` may be accessed concurrently.
unsafe impl<F: IoPageTableFmt> Sync for IoPageTable<F> {}
/// The format used by this page table.
pub trait IoPageTableFmt: 'static {
/// The value representing this format.
const FORMAT: io_pgtable_fmt;
}
impl<F: IoPageTableFmt> IoPageTable<F> {
/// Create a new `IoPageTable` as a device resource.
#[inline]
pub fn new(
dev: &Device<Bound>,
config: Config,
) -> impl PinInit<Devres<IoPageTable<F>>, Error> + '_ {
// SAFETY: Devres ensures that the value is dropped during device unbind.
Devres::new(dev, unsafe { Self::new_raw(dev, config) })
}
/// Create a new `IoPageTable`.
///
/// # Safety
///
/// If successful, then the returned `IoPageTable` must be dropped before the device is
/// unbound.
#[inline]
pub unsafe fn new_raw(dev: &Device<Bound>, config: Config) -> Result<IoPageTable<F>> {
let mut raw_cfg = bindings::io_pgtable_cfg {
quirks: config.quirks,
pgsize_bitmap: config.pgsize_bitmap,
ias: config.ias,
oas: config.oas,
coherent_walk: config.coherent_walk,
tlb: &raw const NOOP_FLUSH_OPS,
iommu_dev: dev.as_raw(),
// SAFETY: All zeroes is a valid value for `struct io_pgtable_cfg`.
..unsafe { core::mem::zeroed() }
};
// SAFETY:
// * The raw_cfg pointer is valid for the duration of this call.
// * The provided `FLUSH_OPS` contains valid function pointers that accept a null pointer
// as cookie.
// * The caller ensures that the io pgtable does not outlive the device.
let ops = unsafe {
bindings::alloc_io_pgtable_ops(F::FORMAT, &mut raw_cfg, core::ptr::null_mut())
};
// INVARIANT: We successfully created a valid page table.
Ok(IoPageTable {
ptr: NonNull::new(ops).ok_or(ENOMEM)?,
_marker: PhantomData,
})
}
/// Obtain a raw pointer to the underlying `struct io_pgtable_ops`.
#[inline]
pub fn raw_ops(&self) -> *mut bindings::io_pgtable_ops {
self.ptr.as_ptr()
}
/// Obtain a raw pointer to the underlying `struct io_pgtable`.
#[inline]
pub fn raw_pgtable(&self) -> *mut bindings::io_pgtable {
// SAFETY: The io_pgtable_ops of an io-pgtable is always the ops field of a io_pgtable.
unsafe { kernel::container_of!(self.raw_ops(), bindings::io_pgtable, ops) }
}
/// Obtain a raw pointer to the underlying `struct io_pgtable_cfg`.
#[inline]
pub fn raw_cfg(&self) -> *mut bindings::io_pgtable_cfg {
// SAFETY: The `raw_pgtable()` method returns a valid pointer.
unsafe { &raw mut (*self.raw_pgtable()).cfg }
}
/// Map a physically contiguous range of pages of the same size.
///
/// Even if successful, this operation may not map the entire range. In that case, only a
/// prefix of the range is mapped, the returned integer indicates its length in bytes, and
/// the caller will usually call `map_pages` again for the remaining range.
///
/// The returned [`Result`] indicates whether an error was encountered while mapping pages.
/// Note that this may return a non-zero length even if an error was encountered. The caller
/// will usually [unmap the relevant pages](Self::unmap_pages) on error.
///
/// The caller must flush the TLB before using the pgtable to access the newly created mapping.
///
/// # Safety
///
/// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while
/// this `map_pages` operation executes.
/// * This page table must not contain any mapping that overlaps with the mapping created by
/// this call.
/// * If this page table is live, then the caller must ensure that it's okay to access the
/// physical address being mapped for the duration in which it is mapped.
#[inline]
pub unsafe fn map_pages(
&self,
iova: usize,
paddr: PhysAddr,
pgsize: usize,
pgcount: usize,
prot: u32,
flags: alloc::Flags,
) -> (usize, Result) {
let mut mapped: usize = 0;
// SAFETY: The `map_pages` function in `io_pgtable_ops` is never null.
let map_pages = unsafe { (*self.raw_ops()).map_pages.unwrap_unchecked() };
// SAFETY: The safety requirements of this method are sufficient to call `map_pages`.
let ret = to_result(unsafe {
(map_pages)(
self.raw_ops(),
iova,
paddr,
pgsize,
pgcount,
prot as i32,
flags.as_raw(),
&mut mapped,
)
});
(mapped, ret)
}
/// Unmap a range of virtually contiguous pages of the same size.
///
/// This may not unmap the entire range, and returns the length of the unmapped prefix in
/// bytes.
///
/// # Safety
///
/// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while
/// this `unmap_pages` operation executes.
/// * This page table must contain one or more consecutive mappings starting at `iova` whose
/// total size is `pgcount * pgsize`.
#[inline]
#[must_use]
pub unsafe fn unmap_pages(&self, iova: usize, pgsize: usize, pgcount: usize) -> usize {
// SAFETY: The `unmap_pages` function in `io_pgtable_ops` is never null.
let unmap_pages = unsafe { (*self.raw_ops()).unmap_pages.unwrap_unchecked() };
// SAFETY: The safety requirements of this method are sufficient to call `unmap_pages`.
unsafe { (unmap_pages)(self.raw_ops(), iova, pgsize, pgcount, core::ptr::null_mut()) }
}
}
// For the initial users of these Rust bindings, the GPU FW manages the IOTLB and performs all
// required invalidations using a range. There is no need for it to get ARM-style invalidation
// instructions from the page table code.
//
// Support for flushing the TLB with ARM style invalidation instructions may be added in the
// future.
static NOOP_FLUSH_OPS: bindings::iommu_flush_ops = bindings::iommu_flush_ops {
tlb_flush_all: Some(rust_tlb_flush_all_noop),
tlb_flush_walk: Some(rust_tlb_flush_walk_noop),
tlb_add_page: None,
};
#[no_mangle]
extern "C" fn rust_tlb_flush_all_noop(_cookie: *mut core::ffi::c_void) {}
#[no_mangle]
extern "C" fn rust_tlb_flush_walk_noop(
_iova: usize,
_size: usize,
_granule: usize,
_cookie: *mut core::ffi::c_void,
) {
}
impl<F: IoPageTableFmt> Drop for IoPageTable<F> {
fn drop(&mut self) {
// SAFETY: The caller of `Self::ttbr()` promised that the page table is not live when this
// destructor runs.
unsafe { bindings::free_io_pgtable_ops(self.raw_ops()) };
}
}
/// The `ARM_64_LPAE_S1` page table format.
pub enum ARM64LPAES1 {}
impl IoPageTableFmt for ARM64LPAES1 {
const FORMAT: io_pgtable_fmt = bindings::io_pgtable_fmt_ARM_64_LPAE_S1 as io_pgtable_fmt;
}
impl IoPageTable<ARM64LPAES1> {
/// Access the `ttbr` field of the configuration.
///
/// This is the physical address of the page table, which may be passed to the device that
/// needs to use it.
///
/// # Safety
///
/// The caller must ensure that the device stops using the page table before dropping it.
#[inline]
pub unsafe fn ttbr(&self) -> u64 {
// SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`.
unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.ttbr }
}
/// Access the `mair` field of the configuration.
#[inline]
pub fn mair(&self) -> u64 {
// SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`.
unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.mair }
}
}

View file

@ -103,6 +103,7 @@ pub mod id_pool;
pub mod init;
pub mod io;
pub mod ioctl;
pub mod iommu;
pub mod iov;
pub mod irq;
pub mod jump_label;