From 10016118b6fade907143a32a7aeaa777063dc79c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:11 -0800 Subject: [PATCH 01/59] cxl/mem: Fix devm_cxl_memdev_edac_release() confusion A device release method is only for undoing allocations on the path to preparing the device for device_add(). In contrast, devm allocations are post device_add(), are acquired during / after ->probe() and are released synchronous with ->remove(). So, a "devm" helper in a "release" method is a clear anti-pattern. Move this devm release action where it belongs, an action created at edac object creation time. Otherwise, this leaks resources until cxl_memdev_release() time which may be long after these xarray and error record caches have gone idle. Note, this also fixes up the type of @cxlmd->err_rec_array which needlessly dropped type-safety. Fixes: 0b5ccb0de1e2 ("cxl/edac: Support for finding memory operation attributes from the current boot") Cc: Dave Jiang Cc: Jonathan Cameron Cc: Shiju Jose Cc: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Shiju Jose Reviewed-by: Shiju Jose Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-2-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/edac.c | 64 ++++++++++++++++++++++----------------- drivers/cxl/core/memdev.c | 1 - drivers/cxl/cxlmem.h | 5 +-- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/drivers/cxl/core/edac.c b/drivers/cxl/core/edac.c index 79994ca9bc9f..81160260e26b 100644 --- a/drivers/cxl/core/edac.c +++ b/drivers/cxl/core/edac.c @@ -1988,6 +1988,40 @@ static int cxl_memdev_soft_ppr_init(struct cxl_memdev *cxlmd, return 0; } +static void err_rec_free(void *_cxlmd) +{ + struct cxl_memdev *cxlmd = _cxlmd; + struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; + struct cxl_event_gen_media *rec_gen_media; + struct 
cxl_event_dram *rec_dram; + unsigned long index; + + cxlmd->err_rec_array = NULL; + xa_for_each(&array_rec->rec_dram, index, rec_dram) + kfree(rec_dram); + xa_destroy(&array_rec->rec_dram); + + xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media) + kfree(rec_gen_media); + xa_destroy(&array_rec->rec_gen_media); + kfree(array_rec); +} + +static int devm_cxl_memdev_setup_err_rec(struct cxl_memdev *cxlmd) +{ + struct cxl_mem_err_rec *array_rec = + kzalloc(sizeof(*array_rec), GFP_KERNEL); + + if (!array_rec) + return -ENOMEM; + + xa_init(&array_rec->rec_gen_media); + xa_init(&array_rec->rec_dram); + cxlmd->err_rec_array = array_rec; + + return devm_add_action_or_reset(&cxlmd->dev, err_rec_free, cxlmd); +} + int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) { struct edac_dev_feature ras_features[CXL_NR_EDAC_DEV_FEATURES]; @@ -2038,15 +2072,9 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) } if (repair_inst) { - struct cxl_mem_err_rec *array_rec = - devm_kzalloc(&cxlmd->dev, sizeof(*array_rec), - GFP_KERNEL); - if (!array_rec) - return -ENOMEM; - - xa_init(&array_rec->rec_gen_media); - xa_init(&array_rec->rec_dram); - cxlmd->err_rec_array = array_rec; + rc = devm_cxl_memdev_setup_err_rec(cxlmd); + if (rc) + return rc; } } @@ -2088,22 +2116,4 @@ int devm_cxl_region_edac_register(struct cxl_region *cxlr) } EXPORT_SYMBOL_NS_GPL(devm_cxl_region_edac_register, "CXL"); -void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd) -{ - struct cxl_mem_err_rec *array_rec = cxlmd->err_rec_array; - struct cxl_event_gen_media *rec_gen_media; - struct cxl_event_dram *rec_dram; - unsigned long index; - if (!IS_ENABLED(CONFIG_CXL_EDAC_MEM_REPAIR) || !array_rec) - return; - - xa_for_each(&array_rec->rec_dram, index, rec_dram) - kfree(rec_dram); - xa_destroy(&array_rec->rec_dram); - - xa_for_each(&array_rec->rec_gen_media, index, rec_gen_media) - kfree(rec_gen_media); - xa_destroy(&array_rec->rec_gen_media); -} 
-EXPORT_SYMBOL_NS_GPL(devm_cxl_memdev_edac_release, "CXL"); diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index e370d733e440..4dff7f44d908 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -27,7 +27,6 @@ static void cxl_memdev_release(struct device *dev) struct cxl_memdev *cxlmd = to_cxl_memdev(dev); ida_free(&cxl_memdev_ida, cxlmd->id); - devm_cxl_memdev_edac_release(cxlmd); kfree(cxlmd); } diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 434031a0c1f7..c12ab4fc9512 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -63,7 +63,7 @@ struct cxl_memdev { int depth; u8 scrub_cycle; int scrub_region_id; - void *err_rec_array; + struct cxl_mem_err_rec *err_rec_array; }; static inline struct cxl_memdev *to_cxl_memdev(struct device *dev) @@ -877,7 +877,6 @@ int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd); int devm_cxl_region_edac_register(struct cxl_region *cxlr); int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, union cxl_event *evt); int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt); -void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd); #else static inline int devm_cxl_memdev_edac_register(struct cxl_memdev *cxlmd) { return 0; } @@ -889,8 +888,6 @@ static inline int cxl_store_rec_gen_media(struct cxl_memdev *cxlmd, static inline int cxl_store_rec_dram(struct cxl_memdev *cxlmd, union cxl_event *evt) { return 0; } -static inline void devm_cxl_memdev_edac_release(struct cxl_memdev *cxlmd) -{ return; } #endif #ifdef CONFIG_CXL_SUSPEND From 1f1cb7f0c25574cf51501f8c8cece0047d7e8848 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:12 -0800 Subject: [PATCH 02/59] cxl/mem: Arrange for always-synchronous memdev attach In preparation for CXL accelerator drivers that have a hard dependency on CXL capability initialization, arrange for cxl_mem_probe() to always run synchronous with the device_add() of cxl_memdev instances. I.e. 
cxl_mem_driver registration is always complete before the first memdev creation event. At present, cxl_pci does not care about the attach state of the cxl_memdev because all generic memory expansion functionality can be handled by the cxl_core. For accelerators, however, that driver needs to perform driver specific initialization if CXL is available, or execute a fallback to PCIe only operation. This synchronous attach guarantee is also needed for Soft Reserve Recovery, which is an effort that needs to assert that devices have had a chance to attach before making a go / no-go decision on proceeding with CXL subsystem initialization. By moving devm_cxl_add_memdev() to cxl_mem.ko it removes async module loading as one reason that a memdev may not be attached upon return from devm_cxl_add_memdev(). Cc: Smita Koralahalli Cc: Alejandro Lucero Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-3-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 2 +- drivers/cxl/core/memdev.c | 10 +++++++--- drivers/cxl/cxlmem.h | 2 ++ drivers/cxl/mem.c | 17 +++++++++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb8..f1361ed6a0d4 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -22,6 +22,7 @@ if CXL_BUS config CXL_PCI tristate "PCI manageability" default CXL_BUS + select CXL_MEM help The CXL specification defines a "CXL memory device" sub-class in the PCI "memory controller" base class of devices. 
Device's identified by @@ -89,7 +90,6 @@ config CXL_PMEM config CXL_MEM tristate "CXL: Memory Expansion" - depends on CXL_PCI default CXL_BUS help The CXL.mem protocol allows a device to act as a provider of "System diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 4dff7f44d908..7a4153e1c6a7 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1050,8 +1050,12 @@ static const struct file_operations cxl_memdev_fops = { .llseek = noop_llseek, }; -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +/* + * Core helper for devm_cxl_add_memdev() that wants to both create a device and + * assert to the caller that upon return cxl_mem::probe() has been invoked. + */ +struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds) { struct cxl_memdev *cxlmd; struct device *dev; @@ -1093,7 +1097,7 @@ err: put_device(dev); return ERR_PTR(rc); } -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); +EXPORT_SYMBOL_FOR_MODULES(__devm_cxl_add_memdev, "cxl_mem"); static void sanitize_teardown_notifier(void *data) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index c12ab4fc9512..012e68acad34 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,6 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } +struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds); struct cxl_memdev *devm_cxl_add_memdev(struct device *host, struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 6e6777b7bafb..55883797ab2d 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -201,6 +201,22 @@ static int cxl_mem_probe(struct device *dev) return devm_add_action_or_reset(dev, enable_suspend, NULL); } +/** + * devm_cxl_add_memdev - Add a CXL memory device + * @host: devres alloc/release context 
and parent for the memdev + * @cxlds: CXL device state to associate with the memdev + * + * Upon return the device will have had a chance to attach to the + * cxl_mem driver, but may fail if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached) + */ +struct cxl_memdev *devm_cxl_add_memdev(struct device *host, + struct cxl_dev_state *cxlds) +{ + return __devm_cxl_add_memdev(host, cxlds); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); + static ssize_t trigger_poison_list_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -248,6 +264,7 @@ static struct cxl_driver cxl_mem_driver = { .probe = cxl_mem_probe, .id = CXL_DEVICE_MEMORY_EXPANDER, .drv = { + .probe_type = PROBE_FORCE_SYNCHRONOUS, .dev_groups = cxl_mem_groups, }, }; From ae201a0092362ffdec7206efa1ec85e260fab8d2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:13 -0800 Subject: [PATCH 03/59] cxl/port: Arrange for always synchronous endpoint attach Make it so that upon return from devm_cxl_add_endpoint() that cxl_mem_probe() can assume that the endpoint has had a chance to complete cxl_port_probe(). I.e. cxl_port module loading has completed prior to device registration. Delete the MODULE_SOFTDEP() as it is not sufficient for this purpose, but a hard link-time dependency is reliable. Specifically MODULE_SOFTDEP() does not guarantee that the module loading has completed prior to the completion of the current module's init. 
Cc: Smita Koralahalli Cc: Alejandro Lucero Reviewed-by: Jonathan Cameron Tested-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-4-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/cxl.h | 2 ++ drivers/cxl/mem.c | 43 ------------------------------------------- drivers/cxl/port.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 43 deletions(-) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249..c796c3db36e0 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -780,6 +780,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct cxl_dport *parent_dport); struct cxl_root *devm_cxl_add_root(struct device *host, const struct cxl_root_ops *ops); +int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport); struct cxl_root *find_cxl_root(struct cxl_port *port); DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 55883797ab2d..d62931526fd4 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -45,44 +45,6 @@ static int cxl_mem_dpa_show(struct seq_file *file, void *data) return 0; } -static int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, - struct cxl_dport *parent_dport) -{ - struct cxl_port *parent_port = parent_dport->port; - struct cxl_port *endpoint, *iter, *down; - int rc; - - /* - * Now that the path to the root is established record all the - * intervening ports in the chain. 
- */ - for (iter = parent_port, down = NULL; !is_cxl_root(iter); - down = iter, iter = to_cxl_port(iter->dev.parent)) { - struct cxl_ep *ep; - - ep = cxl_ep_load(iter, cxlmd); - ep->next = down; - } - - /* Note: endpoint port component registers are derived from @cxlds */ - endpoint = devm_cxl_add_port(host, &cxlmd->dev, CXL_RESOURCE_NONE, - parent_dport); - if (IS_ERR(endpoint)) - return PTR_ERR(endpoint); - - rc = cxl_endpoint_autoremove(cxlmd, endpoint); - if (rc) - return rc; - - if (!endpoint->dev.driver) { - dev_err(&cxlmd->dev, "%s failed probe\n", - dev_name(&endpoint->dev)); - return -ENXIO; - } - - return 0; -} - static int cxl_debugfs_poison_inject(void *data, u64 dpa) { struct cxl_memdev *cxlmd = data; @@ -275,8 +237,3 @@ MODULE_DESCRIPTION("CXL: Memory Expansion"); MODULE_LICENSE("GPL v2"); MODULE_IMPORT_NS("CXL"); MODULE_ALIAS_CXL(CXL_DEVICE_MEMORY_EXPANDER); -/* - * create_endpoint() wants to validate port driver attach immediately after - * endpoint registration. - */ -MODULE_SOFTDEP("pre: cxl_port"); diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 51c8f2f84717..7937e7e53797 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -156,10 +156,50 @@ static struct cxl_driver cxl_port_driver = { .probe = cxl_port_probe, .id = CXL_DEVICE_PORT, .drv = { + .probe_type = PROBE_FORCE_SYNCHRONOUS, .dev_groups = cxl_port_attribute_groups, }, }; +int devm_cxl_add_endpoint(struct device *host, struct cxl_memdev *cxlmd, + struct cxl_dport *parent_dport) +{ + struct cxl_port *parent_port = parent_dport->port; + struct cxl_port *endpoint, *iter, *down; + int rc; + + /* + * Now that the path to the root is established record all the + * intervening ports in the chain. 
+ */ + for (iter = parent_port, down = NULL; !is_cxl_root(iter); + down = iter, iter = to_cxl_port(iter->dev.parent)) { + struct cxl_ep *ep; + + ep = cxl_ep_load(iter, cxlmd); + ep->next = down; + } + + /* Note: endpoint port component registers are derived from @cxlds */ + endpoint = devm_cxl_add_port(host, &cxlmd->dev, CXL_RESOURCE_NONE, + parent_dport); + if (IS_ERR(endpoint)) + return PTR_ERR(endpoint); + + rc = cxl_endpoint_autoremove(cxlmd, endpoint); + if (rc) + return rc; + + if (!endpoint->dev.driver) { + dev_err(&cxlmd->dev, "%s failed probe\n", + dev_name(&endpoint->dev)); + return -ENXIO; + } + + return 0; +} +EXPORT_SYMBOL_FOR_MODULES(devm_cxl_add_endpoint, "cxl_mem"); + static int __init cxl_port_init(void) { return cxl_driver_register(&cxl_port_driver); From 6e1d21903ff213f1384ce43daa279c0965904116 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:14 -0800 Subject: [PATCH 04/59] cxl/mem: Convert devm_cxl_add_memdev() to scope-based-cleanup In preparation for adding more setup steps, convert the current implementation to scope-based cleanup. The cxl_memdev_shutdown() is only required after cdev_device_add(). With that moved to a helper function it precludes the need to add scope-based-handler for that cleanup if devm_add_action_or_reset() fails. 
Cc: Smita Koralahalli Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-5-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 70 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 7a4153e1c6a7..92aea95859fb 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1050,6 +1050,45 @@ static const struct file_operations cxl_memdev_fops = { .llseek = noop_llseek, }; +/* + * Activate ioctl operations, no cxl_memdev_rwsem manipulation needed as this is + * ordered with cdev_add() publishing the device. + */ +static int cxlmd_add(struct cxl_memdev *cxlmd, struct cxl_dev_state *cxlds) +{ + int rc; + + cxlmd->cxlds = cxlds; + cxlds->cxlmd = cxlmd; + + rc = cdev_device_add(&cxlmd->cdev, &cxlmd->dev); + if (rc) { + /* + * The cdev was briefly live, shutdown any ioctl operations that + * saw that state. + */ + cxl_memdev_shutdown(&cxlmd->dev); + return rc; + } + + return 0; +} + +DEFINE_FREE(put_cxlmd, struct cxl_memdev *, + if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) + +static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) +{ + int rc; + + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, + cxlmd); + if (rc) + return ERR_PTR(rc); + + return cxlmd; +} + /* * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. 
@@ -1057,45 +1096,24 @@ static const struct file_operations cxl_memdev_fops = { struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, struct cxl_dev_state *cxlds) { - struct cxl_memdev *cxlmd; struct device *dev; - struct cdev *cdev; int rc; - cxlmd = cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + struct cxl_memdev *cxlmd __free(put_cxlmd) = + cxl_memdev_alloc(cxlds, &cxl_memdev_fops); if (IS_ERR(cxlmd)) return cxlmd; dev = &cxlmd->dev; rc = dev_set_name(dev, "mem%d", cxlmd->id); if (rc) - goto err; + return ERR_PTR(rc); - /* - * Activate ioctl operations, no cxl_memdev_rwsem manipulation - * needed as this is ordered with cdev_add() publishing the device. - */ - cxlmd->cxlds = cxlds; - cxlds->cxlmd = cxlmd; - - cdev = &cxlmd->cdev; - rc = cdev_device_add(cdev, dev); - if (rc) - goto err; - - rc = devm_add_action_or_reset(host, cxl_memdev_unregister, cxlmd); + rc = cxlmd_add(cxlmd, cxlds); if (rc) return ERR_PTR(rc); - return cxlmd; -err: - /* - * The cdev was briefly live, shutdown any ioctl operations that - * saw that state. - */ - cxl_memdev_shutdown(dev); - put_device(dev); - return ERR_PTR(rc); + return cxl_memdev_autoremove(no_free_ptr(cxlmd)); } EXPORT_SYMBOL_FOR_MODULES(__devm_cxl_add_memdev, "cxl_mem"); From f2546eba53bbe38c4bb950f78625ccf4b1a2cbc8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:15 -0800 Subject: [PATCH 05/59] cxl/mem: Drop @host argument to devm_cxl_add_memdev() In all cases the device that created the 'struct cxl_dev_state' instance is also the device to host the devm cleanup of devm_cxl_add_memdev(). This simplifies the function prototype, and limits a degree of freedom of the API. 
Cc: Smita Koralahalli Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Reviewed-by: Ben Cheatham Tested-by: Alejandro Lucero Link: https://patch.msgid.link/20251216005616.3090129-6-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 3 +-- drivers/cxl/cxlmem.h | 6 ++---- drivers/cxl/mem.c | 9 +++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 92aea95859fb..935a163f1527 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -1093,8 +1093,7 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { struct device *dev; int rc; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 012e68acad34..9db31c7993c4 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -95,10 +95,8 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index d62931526fd4..677996c65272 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -165,17 +165,18 @@ 
static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device - * @host: devres alloc/release context and parent for the memdev * @cxlds: CXL device state to associate with the memdev * * Upon return the device will have had a chance to attach to the * cxl_mem driver, but may fail if the CXL topology is not ready * (hardware CXL link down, or software platform CXL root not attached) + * + * The parent of the resulting device and the devm context for allocations is + * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct device *host, - struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) { - return __devm_cxl_add_memdev(host, cxlds); + return __devm_cxl_add_memdev(cxlds); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..1c6fc5334806 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 176dcde570cd..8a22b7601627 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(&pdev->dev, cxlds); + cxlmd = devm_cxl_add_memdev(cxlds); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From 29317f8dc6ed601ec54575689c2cd55cc470bcce Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 15 Dec 2025 16:56:16 -0800 Subject: [PATCH 06/59] cxl/mem: Introduce cxl_memdev_attach for CXL-dependent operation Unlike the cxl_pci class driver that opportunistically enables memory expansion with no other 
dependent functionality, CXL accelerator drivers have distinct PCIe-only and CXL-enhanced operation states. If CXL is available some additional coherent memory/cache operations can be enabled, otherwise traditional DMA+MMIO over PCIe/CXL.io is a fallback. This constitutes a new mode of operation where the caller of devm_cxl_add_memdev() wants to make a "go/no-go" decision about running in CXL accelerated mode or falling back to PCIe-only operation. Part of that decision making process likely also includes additional CXL-acceleration-specific resource setup. Encapsulate both of those requirements into 'struct cxl_memdev_attach' that provides a ->probe() callback. The probe callback runs in cxl_mem_probe() context, after the port topology is successfully attached for the given memdev. It supports a contract where, upon successful return from devm_cxl_add_memdev(), everything needed for CXL accelerated operation has been enabled. Additionally the presence of @cxlmd->attach indicates that the accelerator driver be detached when CXL operation ends. This conceptually makes a CXL link loss event mirror a PCIe link loss event which results in triggering the ->remove() callback of affected devices+drivers. A driver can re-attach to recover back to PCIe-only operation. Live recovery, i.e. without a ->remove()/->probe() cycle, is left as a future consideration. 
[ dj: Replace with updated commit log from Dan ] Cc: Smita Koralahalli Reviewed-by: Ben Cheatham Reviewed-by: Dave Jiang Tested-by: Alejandro Lucero Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251216005616.3090129-7-dan.j.williams@intel.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/memdev.c | 33 +++++++++++++++++++++++++++++---- drivers/cxl/cxlmem.h | 12 ++++++++++-- drivers/cxl/mem.c | 20 ++++++++++++++++---- drivers/cxl/pci.c | 2 +- tools/testing/cxl/test/mem.c | 2 +- 5 files changed, 57 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c index 935a163f1527..af3d0cc65138 100644 --- a/drivers/cxl/core/memdev.c +++ b/drivers/cxl/core/memdev.c @@ -641,14 +641,24 @@ static void detach_memdev(struct work_struct *work) struct cxl_memdev *cxlmd; cxlmd = container_of(work, typeof(*cxlmd), detach_work); - device_release_driver(&cxlmd->dev); + + /* + * When the creator of @cxlmd sets ->attach it indicates CXL operation + * is required. In that case, @cxlmd detach escalates to parent device + * detach. + */ + if (cxlmd->attach) + device_release_driver(cxlmd->dev.parent); + else + device_release_driver(&cxlmd->dev); put_device(&cxlmd->dev); } static struct lock_class_key cxl_memdev_key; static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, - const struct file_operations *fops) + const struct file_operations *fops, + const struct cxl_memdev_attach *attach) { struct cxl_memdev *cxlmd; struct device *dev; @@ -664,6 +674,8 @@ static struct cxl_memdev *cxl_memdev_alloc(struct cxl_dev_state *cxlds, goto err; cxlmd->id = rc; cxlmd->depth = -1; + cxlmd->attach = attach; + cxlmd->endpoint = ERR_PTR(-ENXIO); dev = &cxlmd->dev; device_initialize(dev); @@ -1081,6 +1093,18 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) { int rc; + /* + * If @attach is provided fail if the driver is not attached upon + * return.
Note that failure here could be the result of a race to + * teardown the CXL port topology. I.e. cxl_mem_probe() could have + * succeeded and then cxl_mem unbound before the lock is acquired. + */ + guard(device)(&cxlmd->dev); + if (cxlmd->attach && !cxlmd->dev.driver) { + cxl_memdev_unregister(cxlmd); + return ERR_PTR(-ENXIO); + } + rc = devm_add_action_or_reset(cxlmd->cxlds->dev, cxl_memdev_unregister, cxlmd); if (rc) @@ -1093,13 +1117,14 @@ static struct cxl_memdev *cxl_memdev_autoremove(struct cxl_memdev *cxlmd) * Core helper for devm_cxl_add_memdev() that wants to both create a device and * assert to the caller that upon return cxl_mem::probe() has been invoked. */ -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { struct device *dev; int rc; struct cxl_memdev *cxlmd __free(put_cxlmd) = - cxl_memdev_alloc(cxlds, &cxl_memdev_fops); + cxl_memdev_alloc(cxlds, &cxl_memdev_fops, attach); if (IS_ERR(cxlmd)) return cxlmd; diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 9db31c7993c4..ef202b34e5ea 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -34,6 +34,10 @@ (FIELD_GET(CXLMDEV_RESET_NEEDED_MASK, status) != \ CXLMDEV_RESET_NEEDED_NOT) +struct cxl_memdev_attach { + int (*probe)(struct cxl_memdev *cxlmd); +}; + /** * struct cxl_memdev - CXL bus object representing a Type-3 Memory Device * @dev: driver core device object @@ -43,6 +47,7 @@ * @cxl_nvb: coordinate removal of @cxl_nvd if present * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem * @endpoint: connection to the CXL port topology for this memory device + * @attach: creator of this memdev depends on CXL link attach to operate * @id: id number of this memdev instance. 
* @depth: endpoint port depth * @scrub_cycle: current scrub cycle set for this device @@ -59,6 +64,7 @@ struct cxl_memdev { struct cxl_nvdimm_bridge *cxl_nvb; struct cxl_nvdimm *cxl_nvd; struct cxl_port *endpoint; + const struct cxl_memdev_attach *attach; int id; int depth; u8 scrub_cycle; @@ -95,8 +101,10 @@ static inline bool is_cxl_endpoint(struct cxl_port *port) return is_cxl_memdev(port->uport_dev); } -struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds); -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds); +struct cxl_memdev *__devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach); int devm_cxl_sanitize_setup_notifier(struct device *host, struct cxl_memdev *cxlmd); struct cxl_memdev_state; diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 677996c65272..333c366b69e7 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -142,6 +142,12 @@ static int cxl_mem_probe(struct device *dev) return rc; } + if (cxlmd->attach) { + rc = cxlmd->attach->probe(cxlmd); + if (rc) + return rc; + } + rc = devm_cxl_memdev_edac_register(cxlmd); if (rc) dev_dbg(dev, "CXL memdev EDAC registration failed rc=%d\n", rc); @@ -166,17 +172,23 @@ static int cxl_mem_probe(struct device *dev) /** * devm_cxl_add_memdev - Add a CXL memory device * @cxlds: CXL device state to associate with the memdev + * @attach: Caller depends on CXL topology attachment * * Upon return the device will have had a chance to attach to the - * cxl_mem driver, but may fail if the CXL topology is not ready - * (hardware CXL link down, or software platform CXL root not attached) + * cxl_mem driver, but may fail to attach if the CXL topology is not ready + * (hardware CXL link down, or software platform CXL root not attached). 
+ * + * When @attach is NULL it indicates the caller wants the memdev to remain + * registered even if it does not immediately attach to the CXL hierarchy. When + * @attach is provided a cxl_mem_probe() failure leads to failure of this routine. * * The parent of the resulting device and the devm context for allocations is * @cxlds->dev. */ -struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds) +struct cxl_memdev *devm_cxl_add_memdev(struct cxl_dev_state *cxlds, + const struct cxl_memdev_attach *attach) { - return __devm_cxl_add_memdev(cxlds); + return __devm_cxl_add_memdev(cxlds, attach); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_memdev, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 1c6fc5334806..549368a9c868 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1006,7 +1006,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) dev_dbg(&pdev->dev, "No CXL Features discovered\n"); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index 8a22b7601627..cb87e8c0e63c 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1767,7 +1767,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) cxl_mock_add_event_logs(&mdata->mes); - cxlmd = devm_cxl_add_memdev(cxlds); + cxlmd = devm_cxl_add_memdev(cxlds, NULL); if (IS_ERR(cxlmd)) return PTR_ERR(cxlmd); From bc62f5b308cbdedf29132fe96e9d591e526527e1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 20 Nov 2025 03:19:17 +0000 Subject: [PATCH 07/59] dax/hmem, e820, resource: Defer Soft Reserved insertion until hmem is ready Insert Soft Reserved memory into a dedicated soft_reserve_resource tree instead of the iomem_resource tree at boot. Delay publishing these ranges into the iomem hierarchy until ownership is resolved and the HMEM path is ready to consume them. 
Publishing Soft Reserved ranges into iomem too early conflicts with CXL hotplug and prevents region assembly when those ranges overlap CXL windows. Follow up patches will reinsert Soft Reserved ranges into iomem after CXL window publication is complete and HMEM is ready to claim the memory. This provides a cleaner handoff between EFI-defined memory ranges and CXL resource management without trimming or deleting resources later. In the meantime "Soft Reserved" resources will no longer appear in /proc/iomem, only their results. I.e. with "memmap=4G%4G+0xefffffff" Before: 100000000-1ffffffff : Soft Reserved 100000000-1ffffffff : dax1.0 100000000-1ffffffff : System RAM (kmem) After: 100000000-1ffffffff : dax1.0 100000000-1ffffffff : System RAM (kmem) The expectation is that this does not lead to a user visible regression because the dax1.0 device is created in both instances. Co-developed-by: Smita Koralahalli [Smita: incorporate feedback from x86 maintainer review] Signed-off-by: Smita Koralahalli Link: https://patch.msgid.link/20251120031925.87762-2-Smita.KoralahalliChannabasappa@amd.com [djbw: cleanups and clarifications] Link: https://lore.kernel.org/69443f707b025_1cee10022@dwillia2-mobl4.notmuch Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- arch/x86/kernel/e820.c | 15 ++++++--- drivers/dax/hmem/device.c | 3 +- drivers/dax/hmem/hmem.c | 5 +-- include/linux/ioport.h | 5 +++ kernel/resource.c | 71 +++++++++++++++++++++++++++++++++------ 5 files changed, 80 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b15b97d3cb52..69c050f50e18 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1151,11 +1151,18 @@ void __init e820__reserve_resources_late(void) int i; struct resource *res; - res = e820_res; - for (i = 0; i < e820_table->nr_entries; i++) { - if (!res->parent && res->end) + for (i = 0, res = e820_res; i < e820_table->nr_entries; i++, res++) { + /* skip added or uninitialized resources */ + 
if (res->parent || !res->end) + continue; + + /* set aside soft-reserved resources for driver consideration */ + if (res->desc == IORES_DESC_SOFT_RESERVED) { + insert_resource_expand_to_fit(&soft_reserve_resource, res); + } else { + /* publish the rest immediately */ insert_resource_expand_to_fit(&iomem_resource, res); - res++; + } } /* diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c index f9e1a76a04a9..56e3cbd181b5 100644 --- a/drivers/dax/hmem/device.c +++ b/drivers/dax/hmem/device.c @@ -83,8 +83,7 @@ static __init int hmem_register_one(struct resource *res, void *data) static __init int hmem_init(void) { - walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, - IORESOURCE_MEM, 0, -1, NULL, hmem_register_one); + walk_soft_reserve_res(0, -1, NULL, hmem_register_one); return 0; } diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c index c18451a37e4f..1cf7c2a0ee1c 100644 --- a/drivers/dax/hmem/hmem.c +++ b/drivers/dax/hmem/hmem.c @@ -73,11 +73,12 @@ static int hmem_register_device(struct device *host, int target_nid, return 0; } - rc = region_intersects(res->start, resource_size(res), IORESOURCE_MEM, - IORES_DESC_SOFT_RESERVED); + rc = region_intersects_soft_reserve(res->start, resource_size(res)); if (rc != REGION_INTERSECTS) return 0; + /* TODO: Add Soft-Reserved memory back to iomem */ + id = memregion_alloc(GFP_KERNEL); if (id < 0) { dev_err(host, "memregion allocation failure for %pr\n", res); diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 9afa30f9346f..95662b2fb458 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -232,6 +232,7 @@ struct resource_constraint { /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; +extern struct resource soft_reserve_resource; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct 
resource *new); @@ -418,6 +419,10 @@ walk_system_ram_res_rev(u64 start, u64 end, void *arg, extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); +extern int walk_soft_reserve_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)); +extern int +region_intersects_soft_reserve(resource_size_t start, size_t size); struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); diff --git a/kernel/resource.c b/kernel/resource.c index e4e9bac12e6e..b40ac7615d55 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -48,6 +48,14 @@ struct resource iomem_resource = { }; EXPORT_SYMBOL(iomem_resource); +struct resource soft_reserve_resource = { + .name = "Soft Reserved", + .start = 0, + .end = -1, + .desc = IORES_DESC_SOFT_RESERVED, + .flags = IORESOURCE_MEM, +}; + static DEFINE_RWLOCK(resource_lock); /* @@ -321,13 +329,14 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long } /** - * find_next_iomem_res - Finds the lowest iomem resource that covers part of - * [@start..@end]. + * find_next_res - Finds the lowest resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * + * @parent: resource tree root to search * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have @@ -337,9 +346,9 @@ static bool is_type_match(struct resource *p, unsigned long flags, unsigned long * The caller must specify @start, @end, @flags, and @desc * (which may be IORES_DESC_NONE). 
*/ -static int find_next_iomem_res(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - struct resource *res) +static int find_next_res(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, struct resource *res) { /* Skip children until we find a top level range that matches */ bool skip_children = true; @@ -353,7 +362,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for_each_resource(&iomem_resource, p, skip_children) { + for_each_resource(parent, p, skip_children) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -390,16 +399,23 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, return p ? 0 : -ENODEV; } -static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, - unsigned long flags, unsigned long desc, - void *arg, - int (*func)(struct resource *, void *)) +static int find_next_iomem_res(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + struct resource *res) +{ + return find_next_res(&iomem_resource, start, end, flags, desc, res); +} + +static int walk_res_desc(struct resource *parent, resource_size_t start, + resource_size_t end, unsigned long flags, + unsigned long desc, void *arg, + int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, &res)) { + !find_next_res(parent, start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -410,6 +426,15 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, return ret; } +static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, + unsigned long flags, unsigned long desc, + void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&iomem_resource, start, end, 
flags, desc, arg, func); +} + + /** * walk_iomem_res_desc - Walks through iomem resources and calls func() * with matching resource ranges. @@ -434,6 +459,18 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); +/* + * In support of device drivers claiming Soft Reserved resources, walk the Soft + * Reserved resource deferral tree. + */ +int walk_soft_reserve_res(u64 start, u64 end, void *arg, + int (*func)(struct resource *, void *)) +{ + return walk_res_desc(&soft_reserve_resource, start, end, IORESOURCE_MEM, + IORES_DESC_SOFT_RESERVED, arg, func); +} +EXPORT_SYMBOL_GPL(walk_soft_reserve_res); + /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. @@ -656,6 +693,18 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, } EXPORT_SYMBOL_GPL(region_intersects); +/* + * Check if the provided range is registered in the Soft Reserved resource + * deferral tree for driver consideration. + */ +int region_intersects_soft_reserve(resource_size_t start, size_t size) +{ + guard(read_lock)(&resource_lock); + return __region_intersects(&soft_reserve_resource, start, size, + IORESOURCE_MEM, IORES_DESC_SOFT_RESERVED); +} +EXPORT_SYMBOL_GPL(region_intersects_soft_reserve); + void __weak arch_remove_reservations(struct resource *avail) { } From 0f7afd80d81b739c4a9a6e4e24109ba1030c9c56 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:22 -0600 Subject: [PATCH 08/59] PCI: Move CXL DVSEC definitions into uapi/linux/pci_regs.h The CXL DVSECs are currently defined in drivers/cxl/cxlpci.h. These are not accessible to other subsystems. Move these to uapi/linux/pci_regs.h. The CXL DVSEC definitions will be renamed and reformatted to fit better with existing defines.
Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Signed-off-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-2-terry.bowman@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/cxlpci.h | 53 ----------------------------- include/uapi/linux/pci_regs.h | 64 ++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 58 deletions(-) diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 1d526bea8431..cdb7cf3dbcb4 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -7,59 +7,6 @@ #define CXL_MEMORY_PROGIF 0x10 -/* - * See section 8.1 Configuration Space Registers in the CXL 2.0 - * Specification. Names are taken straight from the specification with "CXL" and - * "DVSEC" redundancies removed. When obvious, abbreviations may be used. - */ -#define PCI_DVSEC_HEADER1_LENGTH_MASK GENMASK(31, 20) - -/* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE BIT(2) -#define CXL_DVSEC_HDM_COUNT_MASK GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE BIT(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID BIT(0) -#define CXL_DVSEC_MEM_ACTIVE BIT(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK GENMASK(31, 28) - -#define CXL_DVSEC_RANGE_MAX 2 - -/* CXL 2.0 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 - -/* CXL 2.0 8.1.5: CXL 2.0 Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT_EXTENSIONS 3 - -/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define 
CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8) - -/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 - -/* CXL 2.0 8.1.8: PCIe DVSEC for Flex Bus Port */ -#define CXL_DVSEC_PCIE_FLEXBUS_PORT 7 - -/* CXL 2.0 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK GENMASK(31, 16) - /* * NOTE: Currently all the functions which are enabled for CXL require their * vectors to be in the first 16. Use this as the default max. diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 3add74ae2594..6c4b6f19b18e 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1253,11 +1253,6 @@ #define PCI_DEV3_STA 0x0c /* Device 3 Status Register */ #define PCI_DEV3_STA_SEGMENT 0x8 /* Segment Captured (end-to-end flit-mode detected) */ -/* Compute Express Link (CXL r3.1, sec 8.1.5) */ -#define PCI_DVSEC_CXL_PORT 3 -#define PCI_DVSEC_CXL_PORT_CTL 0x0c -#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 - /* Integrity and Data Encryption Extended Capability */ #define PCI_IDE_CAP 0x04 #define PCI_IDE_CAP_LINK 0x1 /* Link IDE Stream Supported */ @@ -1338,4 +1333,63 @@ #define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) +/* Compute Express Link (CXL r3.1, sec 8.1.5) */ +#define PCI_DVSEC_CXL_PORT 3 +#define PCI_DVSEC_CXL_PORT_CTL 0x0c +#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 + +/* + * Compute Express Link (CXL r3.2, 
sec 8.1) + * + * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state + * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * registers on downstream link-up events. + */ +#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) + +/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ +#define CXL_DVSEC_PCIE_DEVICE 0 +#define CXL_DVSEC_CAP_OFFSET 0xA +#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) +#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) +#define CXL_DVSEC_CTRL_OFFSET 0xC +#define CXL_DVSEC_MEM_ENABLE _BITUL(2) +#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) +#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) +#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) +#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) + +#define CXL_DVSEC_RANGE_MAX 2 + +/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ +#define CXL_DVSEC_FUNCTION_MAP 2 + +/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ +#define CXL_DVSEC_PORT 3 +#define CXL_DVSEC_PORT_CTL 0x0c +#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 + +/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ +#define CXL_DVSEC_PORT_GPF 4 +#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) +#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) +#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) + +/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ +#define CXL_DVSEC_DEVICE_GPF 5 + +/* CXL 3.2 8.1.9: Register Locator DVSEC */ +#define CXL_DVSEC_REG_LOCATOR 8 +#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC +#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) +#define 
CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) +#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) + #endif /* LINUX_PCI_REGS_H */ From 6612bd9ff0b1001cff5f5d79db6ce44427d2e99c Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:23 -0600 Subject: [PATCH 09/59] PCI: Update CXL DVSEC definitions CXL DVSEC definitions were recently moved into uapi/linux/pci_regs.h, but the newly added macros do not follow the file's existing naming conventions. The current format uses CXL_DVSEC_XYZ, while the new CXL entries must instead use the PCI_DVSEC_CXL_XYZ prefix to match the conventions already established in pci_regs.h. The new CXL DVSEC macros also introduce _MASK and _OFFSET suffixes, which are not used anywhere else in the file. These suffixes lengthen the identifiers and reduce readability. Remove _MASK and _OFFSET from the recently added definitions. Additionally, remove PCI_DVSEC_HEADER1_LENGTH_MASK, as it duplicates the existing PCI_DVSEC_HEADER1_LEN() macro. Update all existing references to use the new macro names. Finally, update the inline documentation to reference the latest revision of the CXL specification.
Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-3-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 58 ++++++++++++------------ drivers/cxl/core/regs.c | 14 +++--- drivers/cxl/pci.c | 2 +- include/uapi/linux/pci_regs.h | 84 ++++++++++++++++------------------- 4 files changed, 76 insertions(+), 82 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 5b023a0178a4..077b386e0c8d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -86,12 +86,12 @@ static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) i = 1; do { rc = pci_read_config_dword(pdev, - d + CXL_DVSEC_RANGE_SIZE_LOW(id), + d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - valid = FIELD_GET(CXL_DVSEC_MEM_INFO_VALID, temp); + valid = FIELD_GET(PCI_DVSEC_CXL_MEM_INFO_VALID, temp); if (valid) break; msleep(1000); @@ -121,11 +121,11 @@ static int cxl_dvsec_mem_range_active(struct cxl_dev_state *cxlds, int id) /* Check MEM ACTIVE bit, up to 60s timeout by default */ for (i = media_ready_timeout; i; i--) { rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(id), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(id), &temp); if (rc) return rc; - active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp); + active = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE, temp); if (active) break; msleep(1000); @@ -154,11 +154,11 @@ int cxl_await_media_ready(struct cxl_dev_state *cxlds) u16 cap; rc = pci_read_config_word(pdev, - d + CXL_DVSEC_CAP_OFFSET, &cap); + d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); for (i = 0; i < hdm_count; i++) { rc = cxl_dvsec_mem_range_valid(cxlds, i); if (rc) @@ -186,16 +186,16 @@ static int cxl_set_mem_enable(struct cxl_dev_state *cxlds, u16 val) u16 ctrl; int rc; - rc = 
pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc < 0) return rc; - if ((ctrl & CXL_DVSEC_MEM_ENABLE) == val) + if ((ctrl & PCI_DVSEC_CXL_MEM_ENABLE) == val) return 1; - ctrl &= ~CXL_DVSEC_MEM_ENABLE; + ctrl &= ~PCI_DVSEC_CXL_MEM_ENABLE; ctrl |= val; - rc = pci_write_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, ctrl); + rc = pci_write_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, ctrl); if (rc < 0) return rc; @@ -211,7 +211,7 @@ static int devm_cxl_enable_mem(struct device *host, struct cxl_dev_state *cxlds) { int rc; - rc = cxl_set_mem_enable(cxlds, CXL_DVSEC_MEM_ENABLE); + rc = cxl_set_mem_enable(cxlds, PCI_DVSEC_CXL_MEM_ENABLE); if (rc < 0) return rc; if (rc > 0) @@ -273,11 +273,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return -ENXIO; } - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CAP, &cap); if (rc) return rc; - if (!(cap & CXL_DVSEC_MEM_CAPABLE)) { + if (!(cap & PCI_DVSEC_CXL_MEM_CAPABLE)) { dev_dbg(dev, "Not MEM Capable\n"); return -ENXIO; } @@ -288,7 +288,7 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * driver is for a spec defined class code which must be CXL.mem * capable, there is no point in continuing to enable CXL.mem. */ - hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap); + hdm_count = FIELD_GET(PCI_DVSEC_CXL_HDM_COUNT, cap); if (!hdm_count || hdm_count > 2) return -EINVAL; @@ -297,11 +297,11 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, * disabled, and they will remain moot after the HDM Decoder * capability is enabled. 
*/ - rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl); + rc = pci_read_config_word(pdev, d + PCI_DVSEC_CXL_CTRL, &ctrl); if (rc) return rc; - info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl); + info->mem_enabled = FIELD_GET(PCI_DVSEC_CXL_MEM_ENABLE, ctrl); if (!info->mem_enabled) return 0; @@ -314,35 +314,35 @@ int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, return rc; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i), &temp); if (rc) return rc; size = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_SIZE_LOW(i), &temp); if (rc) return rc; - size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK; + size |= temp & PCI_DVSEC_CXL_MEM_SIZE_LOW; if (!size) { continue; } rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_HIGH(i), &temp); if (rc) return rc; base = (u64)temp << 32; rc = pci_read_config_dword( - pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp); + pdev, d + PCI_DVSEC_CXL_RANGE_BASE_LOW(i), &temp); if (rc) return rc; - base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK; + base |= temp & PCI_DVSEC_CXL_MEM_BASE_LOW; info->dvsec_range[ranges++] = (struct range) { .start = base, @@ -1068,7 +1068,7 @@ u16 cxl_gpf_get_dvsec(struct device *dev) is_port = false; dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); + is_port ? PCI_DVSEC_CXL_PORT_GPF : PCI_DVSEC_CXL_DEVICE_GPF); if (!dvsec) dev_warn(dev, "%s GPF DVSEC not present\n", is_port ? 
"Port" : "Device"); @@ -1084,14 +1084,14 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) switch (phase) { case 1: - offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE; break; case 2: - offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET; - base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK; - scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK; + offset = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL; + base = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE; + scale = PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE; break; default: return -EINVAL; diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 5ca7b0eed568..a010b3214342 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -271,10 +271,10 @@ EXPORT_SYMBOL_NS_GPL(cxl_map_device_regs, "CXL"); static bool cxl_decode_regblock(struct pci_dev *pdev, u32 reg_lo, u32 reg_hi, struct cxl_register_map *map) { - u8 reg_type = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK, reg_lo); - int bar = FIELD_GET(CXL_DVSEC_REG_LOCATOR_BIR_MASK, reg_lo); + u8 reg_type = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID, reg_lo); + int bar = FIELD_GET(PCI_DVSEC_CXL_REG_LOCATOR_BIR, reg_lo); u64 offset = ((u64)reg_hi << 32) | - (reg_lo & CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK); + (reg_lo & PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW); if (offset > pci_resource_len(pdev, bar)) { dev_warn(&pdev->dev, @@ -311,15 +311,15 @@ static int __cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_ty }; regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, - CXL_DVSEC_REG_LOCATOR); + PCI_DVSEC_CXL_REG_LOCATOR); if (!regloc) return -ENXIO; pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, ®loc_size); - regloc_size = 
FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size); + regloc_size = PCI_DVSEC_HEADER1_LEN(regloc_size); - regloc += CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET; - regblocks = (regloc_size - CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET) / 8; + regloc += PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1; + regblocks = (regloc_size - PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1) / 8; for (i = 0; i < regblocks; i++, regloc += 8) { u32 reg_lo, reg_hi; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..b7f694bda913 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -933,7 +933,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) cxlds->rcd = is_cxl_restricted(pdev); cxlds->serial = pci_get_dsn(pdev); cxlds->cxl_dvsec = pci_find_dvsec_capability( - pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE); + pdev, PCI_VENDOR_ID_CXL, PCI_DVSEC_CXL_DEVICE); if (!cxlds->cxl_dvsec) dev_warn(&pdev->dev, "Device DVSEC not present, skip CXL.mem init\n"); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 6c4b6f19b18e..662582bdccf0 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1333,63 +1333,57 @@ #define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) #define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) -/* Compute Express Link (CXL r3.1, sec 8.1.5) */ -#define PCI_DVSEC_CXL_PORT 3 -#define PCI_DVSEC_CXL_PORT_CTL 0x0c -#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 - /* - * Compute Express Link (CXL r3.2, sec 8.1) + * Compute Express Link (CXL r4.0, sec 8.1) * * Note that CXL DVSEC id 3 and 7 to be ignored when the CXL link state - * is "disconnected" (CXL r3.2, sec 9.12.3). Re-enumerate these + * is "disconnected" (CXL r4.0, sec 9.12.3). Re-enumerate these * registers on downstream link-up events. 
*/ -#define PCI_DVSEC_HEADER1_LENGTH_MASK __GENMASK(31, 20) -/* CXL 3.2 8.1.3: PCIe DVSEC for CXL Device */ -#define CXL_DVSEC_PCIE_DEVICE 0 -#define CXL_DVSEC_CAP_OFFSET 0xA -#define CXL_DVSEC_MEM_CAPABLE _BITUL(2) -#define CXL_DVSEC_HDM_COUNT_MASK __GENMASK(5, 4) -#define CXL_DVSEC_CTRL_OFFSET 0xC -#define CXL_DVSEC_MEM_ENABLE _BITUL(2) -#define CXL_DVSEC_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) -#define CXL_DVSEC_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) -#define CXL_DVSEC_MEM_INFO_VALID _BITUL(0) -#define CXL_DVSEC_MEM_ACTIVE _BITUL(1) -#define CXL_DVSEC_MEM_SIZE_LOW_MASK __GENMASK(31, 28) -#define CXL_DVSEC_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) -#define CXL_DVSEC_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) -#define CXL_DVSEC_MEM_BASE_LOW_MASK __GENMASK(31, 28) +/* CXL r4.0, 8.1.3: PCIe DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE 0 +#define PCI_DVSEC_CXL_CAP 0xA +#define PCI_DVSEC_CXL_MEM_CAPABLE _BITUL(2) +#define PCI_DVSEC_CXL_HDM_COUNT __GENMASK(5, 4) +#define PCI_DVSEC_CXL_CTRL 0xC +#define PCI_DVSEC_CXL_MEM_ENABLE _BITUL(2) +#define PCI_DVSEC_CXL_RANGE_SIZE_HIGH(i) (0x18 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) +#define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) +#define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1) +#define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) +#define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) +#define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) +#define PCI_DVSEC_CXL_MEM_BASE_LOW __GENMASK(31, 28) #define CXL_DVSEC_RANGE_MAX 2 -/* CXL 3.2 8.1.4: Non-CXL Function Map DVSEC */ -#define CXL_DVSEC_FUNCTION_MAP 2 +/* CXL r4.0, 8.1.4: Non-CXL Function Map DVSEC */ +#define PCI_DVSEC_CXL_FUNCTION_MAP 2 -/* CXL 3.2 8.1.5: Extensions DVSEC for Ports */ -#define CXL_DVSEC_PORT 3 -#define CXL_DVSEC_PORT_CTL 0x0c -#define CXL_DVSEC_PORT_CTL_UNMASK_SBR 0x00000001 +/* CXL r4.0, 8.1.5: Extensions DVSEC for Ports */ +#define PCI_DVSEC_CXL_PORT 3 +#define PCI_DVSEC_CXL_PORT_CTL 0x0c +#define 
PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 -/* CXL 3.2 8.1.6: GPF DVSEC for CXL Port */ -#define CXL_DVSEC_PORT_GPF 4 -#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK __GENMASK(11, 8) -#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK __GENMASK(3, 0) -#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK __GENMASK(11, 8) +/* CXL r4.0, 8.1.6: GPF DVSEC for CXL Port */ +#define PCI_DVSEC_CXL_PORT_GPF 4 +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_CONTROL 0x0C +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_1_TMO_SCALE __GENMASK(11, 8) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_CONTROL 0xE +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_BASE __GENMASK(3, 0) +#define PCI_DVSEC_CXL_PORT_GPF_PHASE_2_TMO_SCALE __GENMASK(11, 8) -/* CXL 3.2 8.1.7: GPF DVSEC for CXL Device */ -#define CXL_DVSEC_DEVICE_GPF 5 +/* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ +#define PCI_DVSEC_CXL_DEVICE_GPF 5 -/* CXL 3.2 8.1.9: Register Locator DVSEC */ -#define CXL_DVSEC_REG_LOCATOR 8 -#define CXL_DVSEC_REG_LOCATOR_BLOCK1_OFFSET 0xC -#define CXL_DVSEC_REG_LOCATOR_BIR_MASK __GENMASK(2, 0) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_ID_MASK __GENMASK(15, 8) -#define CXL_DVSEC_REG_LOCATOR_BLOCK_OFF_LOW_MASK __GENMASK(31, 16) +/* CXL r4.0, 8.1.9: Register Locator DVSEC */ +#define PCI_DVSEC_CXL_REG_LOCATOR 8 +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC +#define PCI_DVSEC_CXL_REG_LOCATOR_BIR __GENMASK(2, 0) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_ID __GENMASK(15, 8) +#define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK_OFF_LOW __GENMASK(31, 16) #endif /* LINUX_PCI_REGS_H */ From 7c29ba02210c6e4570cdce53813a1ae68fb6d049 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:24 -0600 Subject: [PATCH 10/59] PCI: Introduce pcie_is_cxl() CXL is a protocol that runs on top of PCIe 
electricals. Its error model also runs on top of the PCIe AER error model by standardizing "internal" errors as "CXL" errors. Linux has historically ignored internal errors. CXL protocol error handling is then a task of enhancing the PCIe AER core to understand that PCIe ports (upstream and downstream) and endpoints may throw internal errors that represent standard CXL protocol errors. The proposed method to make that determination is to teach 'struct pci_dev' to cache when its link has trained the CXL.mem and/or CXL.cache protocols and then treat all internal errors as CXL errors. A design goal is to not burden the PCIe AER core with CXL knowledge beyond just enough to forward error notifications to the CXL RAS core. The forwarded notification looks up a 'struct cxl_port' or 'struct cxl_dport' companion device to the PCI device. Introduce set_pcie_cxl() with logic checking for CXL.mem or CXL.cache status in the CXL Flex Bus DVSEC status register. The CXL Flex Bus DVSEC presence is used because it is required for all the CXL PCIe devices.[1] [1] CXL 3.1 Spec, 8.1.1 PCIe Designated Vendor-Specific Extended Capability (DVSEC) ID Assignment, Table 8-2 Signed-off-by: Terry Bowman Reviewed-by: Ira Weiny Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alejandro Lucero Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-4-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/probe.c | 31 +++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++++ include/uapi/linux/pci_regs.h | 6 ++++++ 3 files changed, 43 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 41183aed8f5d..bd7ce41d0c7a 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1735,6 +1735,35 @@ static void set_pcie_thunderbolt(struct pci_dev *dev) dev->is_thunderbolt = 1; } +static void set_pcie_cxl(struct 
pci_dev *dev) +{ + struct pci_dev *bridge; + u16 dvsec, cap; + + if (!pci_is_pcie(dev)) + return; + + /* + * Update parent's CXL state because alternate protocol training + * may have changed + */ + bridge = pci_upstream_bridge(dev); + if (bridge) + set_pcie_cxl(bridge); + + dvsec = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_FLEXBUS_PORT); + if (!dvsec) + return; + + pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS, + &cap); + + dev->is_cxl = FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE, cap) || + FIELD_GET(PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM, cap); + +} + static void set_pcie_untrusted(struct pci_dev *dev) { struct pci_dev *parent = pci_upstream_bridge(dev); @@ -2065,6 +2094,8 @@ int pci_setup_device(struct pci_dev *dev) /* Need to have dev->cfg_size ready */ set_pcie_thunderbolt(dev); + set_pcie_cxl(dev); + set_pcie_untrusted(dev); if (pci_is_pcie(dev)) diff --git a/include/linux/pci.h b/include/linux/pci.h index 864775651c6f..f8e8b3df794d 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -463,6 +463,7 @@ struct pci_dev { unsigned int is_pciehp:1; unsigned int shpc_managed:1; /* SHPC owned by shpchp */ unsigned int is_thunderbolt:1; /* Thunderbolt controller */ + unsigned int is_cxl:1; /* Compute Express Link (CXL) */ /* * Devices marked being untrusted are the ones that can potentially * execute DMA attacks and similar. 
They are typically connected @@ -791,6 +792,11 @@ static inline bool pci_is_display(struct pci_dev *pdev) return (pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY; } +static inline bool pcie_is_cxl(struct pci_dev *pci_dev) +{ + return pci_dev->is_cxl; +} + #define for_each_pci_bridge(dev, bus) \ list_for_each_entry(dev, &bus->devices, bus_list) \ if (!pci_is_bridge(dev)) {} else diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 662582bdccf0..b6622fd60fd9 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1379,6 +1379,12 @@ /* CXL r4.0, 8.1.7: GPF DVSEC for CXL Device */ #define PCI_DVSEC_CXL_DEVICE_GPF 5 +/* CXL r4.0, 8.1.8: Flex Bus DVSEC */ +#define PCI_DVSEC_CXL_FLEXBUS_PORT 7 +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS 0xE +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_CACHE _BITUL(0) +#define PCI_DVSEC_CXL_FLEXBUS_PORT_STATUS_MEM _BITUL(2) + /* CXL r4.0, 8.1.9: Register Locator DVSEC */ #define PCI_DVSEC_CXL_REG_LOCATOR 8 #define PCI_DVSEC_CXL_REG_LOCATOR_BLOCK1 0xC From ca3d1a53e62093d17436abd447463da9c0f4e56b Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:25 -0600 Subject: [PATCH 11/59] cxl/pci: Remove unnecessary CXL Endpoint handling helper functions The CXL driver's cxl_handle_endpoint_cor_ras()/cxl_handle_endpoint_ras() are unnecessary helper functions used only for Endpoints. Remove these functions as they are not common for all CXL devices and do not provide value for EP handling. Rename __cxl_handle_ras to cxl_handle_ras() and __cxl_handle_cor_ras() to cxl_handle_cor_ras(). 
Signed-off-by: Terry Bowman Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Tested-by: Joshua Hahn Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-5-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 077b386e0c8d..3ec7407f0c5d 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,8 +632,8 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) +static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { void __iomem *addr; u32 status; @@ -649,11 +649,6 @@ static void __cxl_handle_cor_ras(struct cxl_dev_state *cxlds, } } -static void cxl_handle_endpoint_cor_ras(struct cxl_dev_state *cxlds) -{ - return __cxl_handle_cor_ras(cxlds, cxlds->regs.ras); -} - /* CXL spec rev3.0 8.2.4.16.1 */ static void header_log_copy(void __iomem *ras_base, u32 *log) { @@ -675,8 +670,8 @@ static void header_log_copy(void __iomem *ras_base, u32 *log) * Log the state of the RAS status registers and prepare them to log the * next error status. Return 1 if reset needed. 
*/ -static bool __cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) +static bool cxl_handle_ras(struct cxl_dev_state *cxlds, + void __iomem *ras_base) { u32 hl[CXL_HEADERLOG_SIZE_U32]; void __iomem *addr; @@ -709,11 +704,6 @@ static bool __cxl_handle_ras(struct cxl_dev_state *cxlds, return true; } -static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) -{ - return __cxl_handle_ras(cxlds, cxlds->regs.ras); -} - #ifdef CONFIG_PCIEAER_CXL static void cxl_dport_map_rch_aer(struct cxl_dport *dport) @@ -792,13 +782,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) { - return __cxl_handle_cor_ras(cxlds, dport->regs.ras); + return cxl_handle_cor_ras(cxlds, dport->regs.ras); } static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, struct cxl_dport *dport) { - return __cxl_handle_ras(cxlds, dport->regs.ras); + return cxl_handle_ras(cxlds, dport->regs.ras); } /* @@ -895,7 +885,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev) if (cxlds->rcd) cxl_handle_rdport_errors(cxlds); - cxl_handle_endpoint_cor_ras(cxlds); + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); @@ -924,7 +914,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, * chance the situation is recoverable dump the status of the RAS * capability registers and bounce the active state of the memdev. */ - ue = cxl_handle_endpoint_ras(cxlds); + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); } From eb78ef4d6f0e51243c1ee117f801dbc503e886ab Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:26 -0600 Subject: [PATCH 12/59] cxl/pci: Remove unnecessary CXL RCH handling helper functions cxl_handle_rdport_cor_ras() and cxl_handle_rdport_ras() are specific to Restricted CXL Host (RCH) handling. 
Improve readability and maintainability by replacing these and instead using the common cxl_handle_cor_ras() and cxl_handle_ras() functions. Signed-off-by: Terry Bowman Reviewed-by: Alejandro Lucero Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-6-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 3ec7407f0c5d..51bb0f372e40 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -779,18 +779,6 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); -static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, - struct cxl_dport *dport) -{ - return cxl_handle_cor_ras(cxlds, dport->regs.ras); -} - -static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, - struct cxl_dport *dport) -{ - return cxl_handle_ras(cxlds, dport->regs.ras); -} - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. Clear the @@ -860,9 +848,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) pci_print_aer(pdev, severity, &aer_regs); if (severity == AER_CORRECTABLE) - cxl_handle_rdport_cor_ras(cxlds, dport); + cxl_handle_cor_ras(cxlds, dport->regs.ras); else - cxl_handle_rdport_ras(cxlds, dport); + cxl_handle_ras(cxlds, dport->regs.ras); } #else From bcfa289932a703dd189466ea5947212e8dddd399 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:27 -0600 Subject: [PATCH 13/59] PCI: Replace cxl_error_is_native() with pcie_aer_is_native() The AER driver includes a CXL support function cxl_error_is_native(). This function adds no value over pcie_aer_is_native().
Simplify the codebase by removing cxl_error_is_native() and replace occurrences of cxl_error_is_native() with pcie_aer_is_native(). Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-7-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pcie/aer.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index e0bcaa896803..c99ba2a1159c 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1166,13 +1166,6 @@ static bool is_cxl_mem_dev(struct pci_dev *dev) return true; } -static bool cxl_error_is_native(struct pci_dev *dev) -{ - struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); - - return (pcie_ports_native || host->native_aer); -} - static bool is_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) @@ -1186,7 +1179,7 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) struct aer_err_info *info = (struct aer_err_info *)data; const struct pci_error_handlers *err_handler; - if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) + if (!is_cxl_mem_dev(dev) || !pcie_aer_is_native(dev)) return 0; /* Protect dev->driver */ @@ -1227,7 +1220,7 @@ static int handles_cxl_error_iter(struct pci_dev *dev, void *data) bool *handles_cxl = data; if (!*handles_cxl) - *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); + *handles_cxl = is_cxl_mem_dev(dev) && pcie_aer_is_native(dev); /* Non-zero terminates iteration */ return *handles_cxl; From 7ff8b1d60881c5f97b5ae426e14d2822917d3b69 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Wed, 14 Jan 2026 12:20:28 -0600 Subject: [PATCH 14/59] cxl/pci: Remove CXL VH handling in CONFIG_PCIEAER_CXL conditional blocks from core/pci.c Create new config CONFIG_CXL_RAS and put all CXL RAS items behind the config. 
The config will depend on CPER and PCIE AER to build. Move the related VH RAS code from core/pci.c to core/ras.c. Restricted CXL host (RCH) RAS functions will be moved in a future patch. Cc: Robert Richter Reviewed-by: Joshua Hahn Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Reviewed-by: Alison Schofield Co-developed-by: Terry Bowman Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-8-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 4 + drivers/cxl/core/Makefile | 2 +- drivers/cxl/core/core.h | 31 +++++++ drivers/cxl/core/pci.c | 189 +------------------------------------- drivers/cxl/core/ras.c | 176 +++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 8 -- drivers/cxl/cxlpci.h | 16 ++++ tools/testing/cxl/Kbuild | 2 +- 8 files changed, 233 insertions(+), 195 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb8..217888992c88 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -233,4 +233,8 @@ config CXL_MCE def_bool y depends on X86_MCE && MEMORY_FAILURE +config CXL_RAS + def_bool y + depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 5ad8fef210b5..b2930cc54f8b 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -14,9 +14,9 @@ cxl_core-y += pci.o cxl_core-y += hdm.o cxl_core-y += pmu.o cxl_core-y += cdat.o -cxl_core-y += ras.o cxl_core-$(CONFIG_TRACING) += trace.o cxl_core-$(CONFIG_CXL_REGION) += region.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o +cxl_core-$(CONFIG_CXL_RAS) += ras.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b777..bc818de87ccc 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -144,8 +144,39 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, 
struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +#ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +#else +static inline int cxl_ras_init(void) +{ + return 0; +} + +static inline void cxl_ras_exit(void) +{ +} + +static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + return false; +} +static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +#endif /* CONFIG_CXL_RAS */ + +/* Restricted CXL Host specific RAS functions */ +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); +#else +static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } +static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } +static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif /* CONFIG_CXL_RAS */ + int cxl_gpf_port_setup(struct cxl_dport *dport); struct cxl_hdm; diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 51bb0f372e40..e132fff80979 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,81 +632,8 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -static void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - void __iomem *addr; - u32 status; - - if (!ras_base) - return; - - addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { - writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); - } -} - -/* CXL spec rev3.0 8.2.4.16.1 */ -static void header_log_copy(void __iomem *ras_base, 
u32 *log) -{ - void __iomem *addr; - u32 *log_addr; - int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); - - addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; - log_addr = log; - - for (i = 0; i < log_u32_size; i++) { - *log_addr = readl(addr); - log_addr++; - addr += sizeof(u32); - } -} - -/* - * Log the state of the RAS status registers and prepare them to log the - * next error status. Return 1 if reset needed. - */ -static bool cxl_handle_ras(struct cxl_dev_state *cxlds, - void __iomem *ras_base) -{ - u32 hl[CXL_HEADERLOG_SIZE_U32]; - void __iomem *addr; - u32 status; - u32 fe; - - if (!ras_base) - return false; - - addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; - status = readl(addr); - if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) - return false; - - /* If multiple errors, log header points to first error from ctrl reg */ - if (hweight32(status) > 1) { - void __iomem *rcc_addr = - ras_base + CXL_RAS_CAP_CONTROL_OFFSET; - - fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, - readl(rcc_addr))); - } else { - fe = status; - } - - header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); - writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); - - return true; -} - -#ifdef CONFIG_PCIEAER_CXL - -static void cxl_dport_map_rch_aer(struct cxl_dport *dport) +#ifdef CONFIG_CXL_RAS +void cxl_dport_map_rch_aer(struct cxl_dport *dport) { resource_size_t aer_phys; struct device *host; @@ -721,19 +648,7 @@ static void cxl_dport_map_rch_aer(struct cxl_dport *dport) } } -static void cxl_dport_map_ras(struct cxl_dport *dport) -{ - struct cxl_register_map *map = &dport->reg_map; - struct device *dev = dport->dport_dev; - - if (!map->component_map.ras.valid) - dev_dbg(dev, "RAS registers not found\n"); - else if (cxl_map_component_regs(map, &dport->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS))) - dev_dbg(dev, "Failed to map RAS capability.\n"); -} - -static void cxl_disable_rch_root_ints(struct cxl_dport *dport) +void 
cxl_disable_rch_root_ints(struct cxl_dport *dport) { void __iomem *aer_base = dport->regs.dport_aer; u32 aer_cmd_mask, aer_cmd; @@ -757,28 +672,6 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport) writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); } -/** - * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport - * @dport: the cxl_dport that needs to be initialized - * @host: host device for devm operations - */ -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - dport->reg_map.host = host; - cxl_dport_map_ras(dport); - - if (dport->rch) { - struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); - - if (!host_bridge->native_aer) - return; - - cxl_dport_map_rch_aer(dport); - cxl_disable_rch_root_ints(dport); - } -} -EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); - /* * Copy the AER capability registers using 32 bit read accesses. * This is necessary because RCRB AER capability is MMIO mapped. Clear the @@ -827,7 +720,7 @@ static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, return false; } -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { struct pci_dev *pdev = to_pci_dev(cxlds->dev); struct aer_capability_regs aer_regs; @@ -852,82 +745,8 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) else cxl_handle_ras(cxlds, dport->regs.ras); } - -#else -static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } #endif -void cxl_cor_error_detected(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct device *dev = &cxlds->cxlmd->dev; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); - } -} 
-EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); - -pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, - pci_channel_state_t state) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - struct cxl_memdev *cxlmd = cxlds->cxlmd; - struct device *dev = &cxlmd->dev; - bool ue; - - scoped_guard(device, dev) { - if (!dev->driver) { - dev_warn(&pdev->dev, - "%s: memdev disabled, abort error handling\n", - dev_name(dev)); - return PCI_ERS_RESULT_DISCONNECT; - } - - if (cxlds->rcd) - cxl_handle_rdport_errors(cxlds); - /* - * A frozen channel indicates an impending reset which is fatal to - * CXL.mem operation, and will likely crash the system. On the off - * chance the situation is recoverable dump the status of the RAS - * capability registers and bounce the active state of the memdev. - */ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); - } - - - switch (state) { - case pci_channel_io_normal: - if (ue) { - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - } - return PCI_ERS_RESULT_CAN_RECOVER; - case pci_channel_io_frozen: - dev_warn(&pdev->dev, - "%s: frozen state error detected, disable CXL.mem\n", - dev_name(dev)); - device_release_driver(dev); - return PCI_ERS_RESULT_NEED_RESET; - case pci_channel_io_perm_failure: - dev_warn(&pdev->dev, - "failure state error detected, request disconnect\n"); - return PCI_ERS_RESULT_DISCONNECT; - } - return PCI_ERS_RESULT_NEED_RESET; -} -EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 2731ba3a0799..b933030b8e1e 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "trace.h" static void cxl_cper_trace_corr_port_prot_err(struct pci_dev *pdev, @@ -124,3 +125,178 @@ void cxl_ras_exit(void) cxl_cper_unregister_prot_err_work(&cxl_cper_prot_err_work); cancel_work_sync(&cxl_cper_prot_err_work); 
} + +static void cxl_dport_map_ras(struct cxl_dport *dport) +{ + struct cxl_register_map *map = &dport->reg_map; + struct device *dev = dport->dport_dev; + + if (!map->component_map.ras.valid) + dev_dbg(dev, "RAS registers not found\n"); + else if (cxl_map_component_regs(map, &dport->regs.component, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(dev, "Failed to map RAS capability.\n"); +} + +/** + * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * @dport: the cxl_dport that needs to be initialized + * @host: host device for devm operations + */ +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +{ + dport->reg_map.host = host; + cxl_dport_map_ras(dport); + + if (dport->rch) { + struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); + + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + void __iomem *addr; + u32 status; + + if (!ras_base) + return; + + addr = ras_base + CXL_RAS_CORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { + writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); + trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + } +} + +/* CXL spec rev3.0 8.2.4.16.1 */ +static void header_log_copy(void __iomem *ras_base, u32 *log) +{ + void __iomem *addr; + u32 *log_addr; + int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32); + + addr = ras_base + CXL_RAS_HEADER_LOG_OFFSET; + log_addr = log; + + for (i = 0; i < log_u32_size; i++) { + *log_addr = readl(addr); + log_addr++; + addr += sizeof(u32); + } +} + +/* + * Log the state of the RAS status registers and prepare them to log the + * next error status. Return 1 if reset needed. 
+ */ +bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +{ + u32 hl[CXL_HEADERLOG_SIZE_U32]; + void __iomem *addr; + u32 status; + u32 fe; + + if (!ras_base) + return false; + + addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET; + status = readl(addr); + if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK)) + return false; + + /* If multiple errors, log header points to first error from ctrl reg */ + if (hweight32(status) > 1) { + void __iomem *rcc_addr = + ras_base + CXL_RAS_CAP_CONTROL_OFFSET; + + fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, + readl(rcc_addr))); + } else { + fe = status; + } + + header_log_copy(ras_base, hl); + trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); + + return true; +} + +void cxl_cor_error_detected(struct pci_dev *pdev) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct device *dev = &cxlds->cxlmd->dev; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + + cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + } +} +EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); + +pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; + struct device *dev = &cxlmd->dev; + bool ue; + + scoped_guard(device, dev) { + if (!dev->driver) { + dev_warn(&pdev->dev, + "%s: memdev disabled, abort error handling\n", + dev_name(dev)); + return PCI_ERS_RESULT_DISCONNECT; + } + + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* + * A frozen channel indicates an impending reset which is fatal to + * CXL.mem operation, and will likely crash the system. 
On the off + * chance the situation is recoverable dump the status of the RAS + * capability registers and bounce the active state of the memdev. + */ + ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + } + + + switch (state) { + case pci_channel_io_normal: + if (ue) { + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + } + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + dev_warn(&pdev->dev, + "%s: frozen state error detected, disable CXL.mem\n", + dev_name(dev)); + device_release_driver(dev); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + dev_warn(&pdev->dev, + "failure state error detected, request disconnect\n"); + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} +EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249..42a76a7a088f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -803,14 +803,6 @@ struct cxl_dport *devm_cxl_add_rch_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t rcrb); -#ifdef CONFIG_PCIEAER_CXL -void cxl_setup_parent_dport(struct device *host, struct cxl_dport *dport); -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); -#else -static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, - struct device *host) { } -#endif - struct cxl_decoder *to_cxl_decoder(struct device *dev); struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index cdb7cf3dbcb4..6f9c78886fd9 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -76,7 +76,23 @@ static inline bool cxl_pci_flit_256(struct pci_dev *pdev) struct cxl_dev_state; void read_cdat_data(struct cxl_port *port); + +#ifdef CONFIG_CXL_RAS void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t 
cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); +void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); +#else +static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } + +static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + return PCI_ERS_RESULT_NONE; +} + +static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, + struct device *host) { } +#endif + #endif /* __CXL_PCI_H__ */ diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1..b7ea66382f3b 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -57,12 +57,12 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o cxl_core-y += $(CXL_CORE_SRC)/pmu.o cxl_core-y += $(CXL_CORE_SRC)/cdat.o -cxl_core-y += $(CXL_CORE_SRC)/ras.o cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From d18f1b7beadf1af1cd334ff789ba5a07ce285bbc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 14 Jan 2026 12:20:34 -0600 Subject: [PATCH 15/59] PCI/AER: Replace PCIEAER_CXL symbol with CXL_RAS One of the primary reasons for the CXL driver to exist is to perform error handling. If both PCIEAER and CXL are enabled then light up CXL error handling as well. Now that all RAS handling is moved under the CXL_RAS symbol, drop the previous PCIEAER_CXL symbol. 
Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-14-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 2 +- drivers/pci/pcie/Kconfig | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 217888992c88..70acddc08c39 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -235,6 +235,6 @@ config CXL_MCE config CXL_RAS def_bool y - depends on ACPI_APEI_GHES && PCIEAER && CXL_PCI + depends on ACPI_APEI_GHES && PCIEAER && CXL_BUS endif diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig index 17919b99fa66..207c2deae35f 100644 --- a/drivers/pci/pcie/Kconfig +++ b/drivers/pci/pcie/Kconfig @@ -49,15 +49,6 @@ config PCIEAER_INJECT gotten from: https://github.com/intel/aer-inject.git -config PCIEAER_CXL - bool "PCI Express CXL RAS support" - default y - depends on PCIEAER && CXL_PCI - help - Enables CXL error handling. - - If unsure, say Y. - # # PCI Express ECRC # From 0ff60f2ec3e4043a442e805f80f8a2445113ec8f Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:29 -0600 Subject: [PATCH 16/59] cxl/pci: Move CXL driver's RCH error handling into core/ras_rch.c Restricted CXL Host (RCH) protocol error handling uses a procedure distinct from the CXL Virtual Hierarchy (VH) handling. This is because of the differences in the RCH and VH topologies. Improve the maintainability and add ability to enable/disable RCH handling. Move and combine the RCH handling code into a single block conditionally compiled with the CONFIG_CXL_RAS kernel config.
Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260114182055.46029-9-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 11 +--- drivers/cxl/core/pci.c | 115 ----------------------------------- drivers/cxl/core/ras_rch.c | 121 +++++++++++++++++++++++++++++++++++++ tools/testing/cxl/Kbuild | 1 + 5 files changed, 126 insertions(+), 123 deletions(-) create mode 100644 drivers/cxl/core/ras_rch.c diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index b2930cc54f8b..b37f38d502d8 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o cxl_core-$(CONFIG_CXL_RAS) += ras.o +cxl_core-$(CONFIG_CXL_RAS) += ras_rch.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index bc818de87ccc..724361195057 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -149,6 +149,9 @@ int cxl_ras_init(void); void cxl_ras_exit(void); bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +void cxl_dport_map_rch_aer(struct cxl_dport *dport); +void cxl_disable_rch_root_ints(struct cxl_dport *dport); +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); #else static inline int cxl_ras_init(void) { @@ -164,14 +167,6 @@ static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras return false; } static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } -#endif /* CONFIG_CXL_RAS */ - -/* Restricted CXL Host specific RAS functions */ -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport); -void cxl_disable_rch_root_ints(struct cxl_dport *dport); -void 
cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); -#else static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index e132fff80979..b838c59d7a3c 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -632,121 +632,6 @@ err: } EXPORT_SYMBOL_NS_GPL(read_cdat_data, "CXL"); -#ifdef CONFIG_CXL_RAS -void cxl_dport_map_rch_aer(struct cxl_dport *dport) -{ - resource_size_t aer_phys; - struct device *host; - u16 aer_cap; - - aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); - if (aer_cap) { - host = dport->reg_map.host; - aer_phys = aer_cap + dport->rcrb.base; - dport->regs.dport_aer = devm_cxl_iomap_block(host, aer_phys, - sizeof(struct aer_capability_regs)); - } -} - -void cxl_disable_rch_root_ints(struct cxl_dport *dport) -{ - void __iomem *aer_base = dport->regs.dport_aer; - u32 aer_cmd_mask, aer_cmd; - - if (!aer_base) - return; - - /* - * Disable RCH root port command interrupts. - * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors - * - * This sequence may not be necessary. CXL spec states disabling - * the root cmd register's interrupts is required. But, PCI spec - * shows these are disabled by default on reset. - */ - aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | - PCI_ERR_ROOT_CMD_NONFATAL_EN | - PCI_ERR_ROOT_CMD_FATAL_EN); - aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); - aer_cmd &= ~aer_cmd_mask; - writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); -} - -/* - * Copy the AER capability registers using 32 bit read accesses. - * This is necessary because RCRB AER capability is MMIO mapped. Clear the - * status after copying. 
- * - * @aer_base: base address of AER capability block in RCRB - * @aer_regs: destination for copying AER capability - */ -static bool cxl_rch_get_aer_info(void __iomem *aer_base, - struct aer_capability_regs *aer_regs) -{ - int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); - u32 *aer_regs_buf = (u32 *)aer_regs; - int n; - - if (!aer_base) - return false; - - /* Use readl() to guarantee 32-bit accesses */ - for (n = 0; n < read_cnt; n++) - aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); - - writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); - writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); - - return true; -} - -/* Get AER severity. Return false if there is no error. */ -static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, - int *severity) -{ - if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { - if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) - *severity = AER_FATAL; - else - *severity = AER_NONFATAL; - return true; - } - - if (aer_regs->cor_status & ~aer_regs->cor_mask) { - *severity = AER_CORRECTABLE; - return true; - } - - return false; -} - -void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) -{ - struct pci_dev *pdev = to_pci_dev(cxlds->dev); - struct aer_capability_regs aer_regs; - struct cxl_dport *dport; - int severity; - - struct cxl_port *port __free(put_cxl_port) = - cxl_pci_find_port(pdev, &dport); - if (!port) - return; - - if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) - return; - - if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) - return; - - pci_print_aer(pdev, severity, &aer_regs); - - if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); - else - cxl_handle_ras(cxlds, dport->regs.ras); -} -#endif - static int cxl_flit_size(struct pci_dev *pdev) { if (cxl_pci_flit_256(pdev)) diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c new file mode 100644 index 000000000000..ed58afd18ecc --- /dev/null +++ 
b/drivers/cxl/core/ras_rch.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2025 AMD Corporation. All rights reserved. */ + +#include +#include +#include "cxl.h" +#include "core.h" +#include "cxlmem.h" + +void cxl_dport_map_rch_aer(struct cxl_dport *dport) +{ + resource_size_t aer_phys; + struct device *host; + u16 aer_cap; + + aer_cap = cxl_rcrb_to_aer(dport->dport_dev, dport->rcrb.base); + if (aer_cap) { + host = dport->reg_map.host; + aer_phys = aer_cap + dport->rcrb.base; + dport->regs.dport_aer = + devm_cxl_iomap_block(host, aer_phys, + sizeof(struct aer_capability_regs)); + } +} + +void cxl_disable_rch_root_ints(struct cxl_dport *dport) +{ + void __iomem *aer_base = dport->regs.dport_aer; + u32 aer_cmd_mask, aer_cmd; + + if (!aer_base) + return; + + /* + * Disable RCH root port command interrupts. + * CXL 3.0 12.2.1.1 - RCH Downstream Port-detected Errors + * + * This sequence may not be necessary. CXL spec states disabling + * the root cmd register's interrupts is required. But, PCI spec + * shows these are disabled by default on reset. + */ + aer_cmd_mask = (PCI_ERR_ROOT_CMD_COR_EN | + PCI_ERR_ROOT_CMD_NONFATAL_EN | + PCI_ERR_ROOT_CMD_FATAL_EN); + aer_cmd = readl(aer_base + PCI_ERR_ROOT_COMMAND); + aer_cmd &= ~aer_cmd_mask; + writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. 
+ * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + int severity; + + struct cxl_port *port __free(put_cxl_port) = + cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + if (severity == AER_CORRECTABLE) + cxl_handle_cor_ras(cxlds, dport->regs.ras); + else + cxl_handle_ras(cxlds, dport->regs.ras); +} diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index b7ea66382f3b..6eceefefb0e0 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o 
cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras.o +cxl_core-$(CONFIG_CXL_RAS) += $(CXL_CORE_SRC)/ras_rch.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From 6dc5fe212e74e6880a1da0093f627387d0a658bb Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:30 -0600 Subject: [PATCH 17/59] PCI/AER: Export pci_aer_unmask_internal_errors() Internal PCIe errors are not enabled by default during initialization because their behavior is too device-specific and there is no standard way to reason about them. However, for CXL an internal error is the standard mechanism for conveying CXL protocol errors. Export pci_aer_unmask_internal_errors() for CXL, but make it clear that they are only meant for CXL and the status quo for leaving them masked for PCIe in general remains. Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-10-terry.bowman@amd.com Co-developed-by: Dan Williams Signed-off-by: Dan Williams Acked-by: Bjorn Helgaas Signed-off-by: Dave Jiang --- drivers/pci/pcie/aer.c | 11 ++++++++--- include/linux/aer.h | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index c99ba2a1159c..972ecaf6a832 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1120,8 +1120,6 @@ static bool find_source_device(struct pci_dev *parent, return true; } -#ifdef CONFIG_PCIEAER_CXL - /** * pci_aer_unmask_internal_errors - unmask internal errors * @dev: pointer to the pci_dev data structure @@ -1132,7 +1130,7 @@ static bool find_source_device(struct pci_dev *parent, * Note: AER must be enabled and supported by the device which must be * checked in advance, e.g. with pcie_aer_is_native(). 
*/ -static void pci_aer_unmask_internal_errors(struct pci_dev *dev) +void pci_aer_unmask_internal_errors(struct pci_dev *dev) { int aer = dev->aer_cap; u32 mask; @@ -1146,6 +1144,13 @@ static void pci_aer_unmask_internal_errors(struct pci_dev *dev) pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, mask); } +/* + * Internal errors are too device-specific to enable generally, however for CXL + * their behavior is standardized for conveying CXL protocol errors. + */ +EXPORT_SYMBOL_FOR_MODULES(pci_aer_unmask_internal_errors, "cxl_core"); + +#ifdef CONFIG_PCIEAER_CXL static bool is_cxl_mem_dev(struct pci_dev *dev) { /* diff --git a/include/linux/aer.h b/include/linux/aer.h index 02940be66324..df0f5c382286 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -56,12 +56,14 @@ struct aer_capability_regs { #if defined(CONFIG_PCIEAER) int pci_aer_clear_nonfatal_status(struct pci_dev *dev); int pcie_aer_is_native(struct pci_dev *dev); +void pci_aer_unmask_internal_errors(struct pci_dev *dev); #else static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev) { return -EINVAL; } static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; } +static inline void pci_aer_unmask_internal_errors(struct pci_dev *dev) { } #endif void pci_print_aer(struct pci_dev *dev, int aer_severity, From 51ce56b1a5d6f7263739d4766ae445463c74b689 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:31 -0600 Subject: [PATCH 18/59] PCI/AER: Update is_internal_error() to be non-static is_aer_internal_error() The AER driver includes significant logic for handling CXL protocol errors. The AER driver will be updated in the future to separate the AER and CXL logic. Rename the is_internal_error() function to is_aer_internal_error() as it gives a more precise indication of the purpose. Make is_aer_internal_error() non-static to allow for the 2 different CXL topology error model implementations (RCH and VH) to share this helper. 
Signed-off-by: Terry Bowman Link: https://patch.msgid.link/20260114182055.46029-11-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pcie/aer.c | 4 ++-- drivers/pci/pcie/portdrv.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 972ecaf6a832..6943a75e6a78 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1171,7 +1171,7 @@ static bool is_cxl_mem_dev(struct pci_dev *dev) return true; } -static bool is_internal_error(struct aer_err_info *info) +bool is_aer_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) return info->status & PCI_ERR_COR_INTERNAL; @@ -1216,7 +1216,7 @@ static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) * device driver. */ if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && - is_internal_error(info)) + is_aer_internal_error(info)) pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); } diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index bd29d1cc7b8b..e7a0a2cffea9 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -123,4 +123,13 @@ static inline void pcie_pme_interrupt_enable(struct pci_dev *dev, bool en) {} #endif /* !CONFIG_PCIE_PME */ struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); + +struct aer_err_info; + +#ifdef CONFIG_PCIEAER_CXL +bool is_aer_internal_error(struct aer_err_info *info); +#else +static inline bool is_aer_internal_error(struct aer_err_info *info) { return false; } +#endif /* CONFIG_PCIEAER_CXL */ + #endif /* _PORTDRV_H_ */ From 59010029faf27c82d1e786dfd1fb83b09f478d1b Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:32 -0600 Subject: [PATCH 19/59] PCI/AER: Move CXL RCH error handling to aer_cxl_rch.c The Restricted CXL Host (RCH) AER error handling logic currently resides in the AER driver file, aer.c. 
CXL specific changes are conditionally compiled using #ifdefs. Improve the AER driver maintainability by separating the RCH specific logic from the AER driver's core functionality and removing the ifdefs. Introduce drivers/pci/pcie/aer_cxl_rch.c to hold the RCH AER logic. Conditionally compile the file using the CONFIG_CXL_RAS Kconfig. Move the CXL logic into the new file but leave CXL helper function is_aer_internal_error() in aer.c for now as it will be moved in a future patch for CXL Virtual Hierarchy handling. To maintain compilation after the move other changes are required. Change cxl_rch_handle_error(), cxl_rch_enable_rcec(), and is_aer_internal_error() to be non-static in order to make them accessible from the AER driver. Update the new file with the SPDX and 2023 AMD copyright notations because the RCH bits were initially contributed in 2023 by AMD. See commit 0a867568bb0d ("PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler") Signed-off-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-12-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pcie/Makefile | 1 + drivers/pci/pcie/aer.c | 99 +----------------------------- drivers/pci/pcie/aer_cxl_rch.c | 106 +++++++++++++++++++++++++++++++++ drivers/pci/pcie/portdrv.h | 9 ++- 4 files changed, 114 insertions(+), 101 deletions(-) create mode 100644 drivers/pci/pcie/aer_cxl_rch.c diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile index 173829aa02e6..b0b43a18c304 100644 --- a/drivers/pci/pcie/Makefile +++ b/drivers/pci/pcie/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o obj-y += aspm.o obj-$(CONFIG_PCIEAER) += aer.o err.o tlp.o +obj-$(CONFIG_CXL_RAS) += aer_cxl_rch.o obj-$(CONFIG_PCIEAER_INJECT) += aer_inject.o obj-$(CONFIG_PCIE_PME) += pme.o obj-$(CONFIG_PCIE_DPC) += dpc.o diff --git
a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 6943a75e6a78..ff499fd4a322 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1150,27 +1150,7 @@ void pci_aer_unmask_internal_errors(struct pci_dev *dev) */ EXPORT_SYMBOL_FOR_MODULES(pci_aer_unmask_internal_errors, "cxl_core"); -#ifdef CONFIG_PCIEAER_CXL -static bool is_cxl_mem_dev(struct pci_dev *dev) -{ - /* - * The capability, status, and control fields in Device 0, - * Function 0 DVSEC control the CXL functionality of the - * entire device (CXL 3.0, 8.1.3). - */ - if (dev->devfn != PCI_DEVFN(0, 0)) - return false; - - /* - * CXL Memory Devices must have the 502h class code set (CXL - * 3.0, 8.1.12.1). - */ - if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) - return false; - - return true; -} - +#ifdef CONFIG_CXL_RAS bool is_aer_internal_error(struct aer_err_info *info) { if (info->severity == AER_CORRECTABLE) @@ -1178,83 +1158,6 @@ bool is_aer_internal_error(struct aer_err_info *info) return info->status & PCI_ERR_UNC_INTN; } - -static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) -{ - struct aer_err_info *info = (struct aer_err_info *)data; - const struct pci_error_handlers *err_handler; - - if (!is_cxl_mem_dev(dev) || !pcie_aer_is_native(dev)) - return 0; - - /* Protect dev->driver */ - device_lock(&dev->dev); - - err_handler = dev->driver ? 
dev->driver->err_handler : NULL; - if (!err_handler) - goto out; - - if (info->severity == AER_CORRECTABLE) { - if (err_handler->cor_error_detected) - err_handler->cor_error_detected(dev); - } else if (err_handler->error_detected) { - if (info->severity == AER_NONFATAL) - err_handler->error_detected(dev, pci_channel_io_normal); - else if (info->severity == AER_FATAL) - err_handler->error_detected(dev, pci_channel_io_frozen); - } -out: - device_unlock(&dev->dev); - return 0; -} - -static void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) -{ - /* - * Internal errors of an RCEC indicate an AER error in an - * RCH's downstream port. Check and handle them in the CXL.mem - * device driver. - */ - if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && - is_aer_internal_error(info)) - pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); -} - -static int handles_cxl_error_iter(struct pci_dev *dev, void *data) -{ - bool *handles_cxl = data; - - if (!*handles_cxl) - *handles_cxl = is_cxl_mem_dev(dev) && pcie_aer_is_native(dev); - - /* Non-zero terminates iteration */ - return *handles_cxl; -} - -static bool handles_cxl_errors(struct pci_dev *rcec) -{ - bool handles_cxl = false; - - if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && - pcie_aer_is_native(rcec)) - pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); - - return handles_cxl; -} - -static void cxl_rch_enable_rcec(struct pci_dev *rcec) -{ - if (!handles_cxl_errors(rcec)) - return; - - pci_aer_unmask_internal_errors(rcec); - pci_info(rcec, "CXL: Internal errors unmasked"); -} - -#else -static inline void cxl_rch_enable_rcec(struct pci_dev *dev) { } -static inline void cxl_rch_handle_error(struct pci_dev *dev, - struct aer_err_info *info) { } #endif /** diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c new file mode 100644 index 000000000000..6b515edb12c1 --- /dev/null +++ b/drivers/pci/pcie/aer_cxl_rch.c @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* 
Copyright(c) 2023 AMD Corporation. All rights reserved. */ + +#include +#include +#include +#include "../pci.h" +#include "portdrv.h" + +static bool is_cxl_mem_dev(struct pci_dev *dev) +{ + /* + * The capability, status, and control fields in Device 0, + * Function 0 DVSEC control the CXL functionality of the + * entire device (CXL 3.0, 8.1.3). + */ + if (dev->devfn != PCI_DEVFN(0, 0)) + return false; + + /* + * CXL Memory Devices must have the 502h class code set (CXL + * 3.0, 8.1.12.1). + */ + if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL) + return false; + + return true; +} + +static bool cxl_error_is_native(struct pci_dev *dev) +{ + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); + + return (pcie_ports_native || host->native_aer); +} + +static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) +{ + struct aer_err_info *info = (struct aer_err_info *)data; + const struct pci_error_handlers *err_handler; + + if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) + return 0; + + device_lock(&dev->dev); + + err_handler = dev->driver ? dev->driver->err_handler : NULL; + if (!err_handler) + goto out; + + if (info->severity == AER_CORRECTABLE) { + if (err_handler->cor_error_detected) + err_handler->cor_error_detected(dev); + } else if (err_handler->error_detected) { + if (info->severity == AER_NONFATAL) + err_handler->error_detected(dev, pci_channel_io_normal); + else if (info->severity == AER_FATAL) + err_handler->error_detected(dev, pci_channel_io_frozen); + } +out: + device_unlock(&dev->dev); + return 0; +} + +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) +{ + /* + * Internal errors of an RCEC indicate an AER error in an + * RCH's downstream port. Check and handle them in the CXL.mem + * device driver. 
+ */ + if (pci_pcie_type(dev) == PCI_EXP_TYPE_RC_EC && + is_aer_internal_error(info)) + pcie_walk_rcec(dev, cxl_rch_handle_error_iter, info); +} + +static int handles_cxl_error_iter(struct pci_dev *dev, void *data) +{ + bool *handles_cxl = data; + + if (!*handles_cxl) + *handles_cxl = is_cxl_mem_dev(dev) && cxl_error_is_native(dev); + + /* Non-zero terminates iteration */ + return *handles_cxl; +} + +static bool handles_cxl_errors(struct pci_dev *rcec) +{ + bool handles_cxl = false; + + if (pci_pcie_type(rcec) == PCI_EXP_TYPE_RC_EC && + pcie_aer_is_native(rcec)) + pcie_walk_rcec(rcec, handles_cxl_error_iter, &handles_cxl); + + return handles_cxl; +} + +void cxl_rch_enable_rcec(struct pci_dev *rcec) +{ + if (!handles_cxl_errors(rcec)) + return; + + pci_aer_unmask_internal_errors(rcec); + pci_info(rcec, "CXL: Internal errors unmasked"); +} diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h index e7a0a2cffea9..cc58bf2f2c84 100644 --- a/drivers/pci/pcie/portdrv.h +++ b/drivers/pci/pcie/portdrv.h @@ -126,10 +126,13 @@ struct device *pcie_port_find_device(struct pci_dev *dev, u32 service); struct aer_err_info; -#ifdef CONFIG_PCIEAER_CXL +#ifdef CONFIG_CXL_RAS bool is_aer_internal_error(struct aer_err_info *info); +void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info); +void cxl_rch_enable_rcec(struct pci_dev *rcec); #else static inline bool is_aer_internal_error(struct aer_err_info *info) { return false; } -#endif /* CONFIG_PCIEAER_CXL */ - +static inline void cxl_rch_handle_error(struct pci_dev *dev, struct aer_err_info *info) { } +static inline void cxl_rch_enable_rcec(struct pci_dev *rcec) { } +#endif /* CONFIG_CXL_RAS */ #endif /* _PORTDRV_H_ */ From da71bd360ded15626dabd59dd1d6939de38cab39 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:33 -0600 Subject: [PATCH 20/59] PCI/AER: Use guard() in cxl_rch_handle_error_iter() cxl_rch_handle_error_iter() includes a call to device_lock() using a goto for multiple 
return paths. Improve readability and maintainability by using the guard() lock variant. Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Reviewed-by: Dan Williams Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/20260114182055.46029-13-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pcie/aer_cxl_rch.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pcie/aer_cxl_rch.c b/drivers/pci/pcie/aer_cxl_rch.c index 6b515edb12c1..e471eefec9c4 100644 --- a/drivers/pci/pcie/aer_cxl_rch.c +++ b/drivers/pci/pcie/aer_cxl_rch.c @@ -42,11 +42,11 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) if (!is_cxl_mem_dev(dev) || !cxl_error_is_native(dev)) return 0; - device_lock(&dev->dev); + guard(device)(&dev->dev); err_handler = dev->driver ? dev->driver->err_handler : NULL; if (!err_handler) - goto out; + return 0; if (info->severity == AER_CORRECTABLE) { if (err_handler->cor_error_detected) @@ -57,8 +57,6 @@ static int cxl_rch_handle_error_iter(struct pci_dev *dev, void *data) else if (info->severity == AER_FATAL) err_handler->error_detected(dev, pci_channel_io_frozen); } -out: - device_unlock(&dev->dev); return 0; } From 83cba5b31e6b0aeb32f41b9c954fe97b60db2817 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Wed, 14 Jan 2026 12:20:35 -0600 Subject: [PATCH 21/59] PCI/AER: Report CXL or PCIe bus type in AER trace logging The AER service driver and aer_event tracing currently log 'PCIe Bus Type' for all errors. Update the driver and aer_event tracing to log 'CXL Bus Type' for CXL device errors. This requires that AER can identify and distinguish between PCIe errors and CXL errors. Introduce boolean 'is_cxl' to 'struct aer_err_info'. Add assignment in aer_get_device_error_info() and pci_print_aer(). Update the aer_event trace routine to accept a bus type string parameter. 
Signed-off-by: Terry Bowman Co-developed-by: Dan Williams Acked-by: Bjorn Helgaas Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-15-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pci.h | 8 +++++++- drivers/pci/pcie/aer.c | 20 +++++++++++++------- include/ras/ras_event.h | 12 ++++++++---- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 0e67014aa001..41ec38e82c08 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -738,7 +738,8 @@ struct aer_err_info { unsigned int multi_error_valid:1; unsigned int first_error:5; - unsigned int __pad2:2; + unsigned int __pad2:1; + unsigned int is_cxl:1; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ @@ -749,6 +750,11 @@ struct aer_err_info { int aer_get_device_error_info(struct aer_err_info *info, int i); void aer_print_error(struct aer_err_info *info, int i); +static inline const char *aer_err_bus(struct aer_err_info *info) +{ + return info->is_cxl ? 
"CXL" : "PCIe"; +} + int pcie_read_tlp_log(struct pci_dev *dev, int where, int where2, unsigned int tlp_len, bool flit, struct pcie_tlp_log *log); diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index ff499fd4a322..49a4bd13c2d2 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -870,6 +870,7 @@ void aer_print_error(struct aer_err_info *info, int i) struct pci_dev *dev; int layer, agent, id; const char *level = info->level; + const char *bus_type = aer_err_bus(info); if (WARN_ON_ONCE(i >= AER_MAX_MULTI_ERR_DEVICES)) return; @@ -879,22 +880,22 @@ void aer_print_error(struct aer_err_info *info, int i) pci_dev_aer_stats_incr(dev, info); trace_aer_event(pci_name(dev), (info->status & ~info->mask), - info->severity, info->tlp_header_valid, &info->tlp); + info->severity, info->tlp_header_valid, &info->tlp, bus_type); if (!info->ratelimit_print[i]) return; if (!info->status) { - pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", - aer_error_severity_string[info->severity]); + pci_err(dev, "%s Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n", + bus_type, aer_error_severity_string[info->severity]); goto out; } layer = AER_GET_LAYER_ERROR(info->severity, info->status); agent = AER_GET_AGENT(info->severity, info->status); - aer_printk(level, dev, "PCIe Bus Error: severity=%s, type=%s, (%s)\n", - aer_error_severity_string[info->severity], + aer_printk(level, dev, "%s Bus Error: severity=%s, type=%s, (%s)\n", + bus_type, aer_error_severity_string[info->severity], aer_error_layer[layer], aer_agent_string[agent]); aer_printk(level, dev, " device [%04x:%04x] error status/mask=%08x/%08x\n", @@ -928,6 +929,7 @@ EXPORT_SYMBOL_GPL(cper_severity_to_aer); void pci_print_aer(struct pci_dev *dev, int aer_severity, struct aer_capability_regs *aer) { + const char *bus_type; int layer, agent, tlp_header_valid = 0; u32 status, mask; struct aer_err_info info = { @@ -948,10 +950,13 @@ void 
pci_print_aer(struct pci_dev *dev, int aer_severity, info.status = status; info.mask = mask; + info.is_cxl = pcie_is_cxl(dev); + + bus_type = aer_err_bus(&info); pci_dev_aer_stats_incr(dev, &info); - trace_aer_event(pci_name(dev), (status & ~mask), - aer_severity, tlp_header_valid, &aer->header_log); + trace_aer_event(pci_name(dev), (status & ~mask), aer_severity, + tlp_header_valid, &aer->header_log, bus_type); if (!aer_ratelimit(dev, info.severity)) return; @@ -1306,6 +1311,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i) /* Must reset in this function */ info->status = 0; info->tlp_header_valid = 0; + info->is_cxl = pcie_is_cxl(dev); /* The device might not support AER */ if (!aer) diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index eaecc3c5f772..fdb785fa4613 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -339,9 +339,11 @@ TRACE_EVENT(aer_event, const u32 status, const u8 severity, const u8 tlp_header_valid, - struct pcie_tlp_log *tlp), + struct pcie_tlp_log *tlp, + const char *bus_type), - TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp), + + TP_ARGS(dev_name, status, severity, tlp_header_valid, tlp, bus_type), TP_STRUCT__entry( __string( dev_name, dev_name ) @@ -349,10 +351,12 @@ TRACE_EVENT(aer_event, __field( u8, severity ) __field( u8, tlp_header_valid) __array( u32, tlp_header, PCIE_STD_MAX_TLP_HEADERLOG) + __string( bus_type, bus_type ) ), TP_fast_assign( __assign_str(dev_name); + __assign_str(bus_type); __entry->status = status; __entry->severity = severity; __entry->tlp_header_valid = tlp_header_valid; @@ -364,8 +368,8 @@ TRACE_EVENT(aer_event, } ), - TP_printk("%s PCIe Bus Error: severity=%s, %s, TLP Header=%s\n", - __get_str(dev_name), + TP_printk("%s %s Bus Error: severity=%s, %s, TLP Header=%s\n", + __get_str(dev_name), __get_str(bus_type), __entry->severity == AER_CORRECTABLE ? "Corrected" : __entry->severity == AER_FATAL ? 
"Fatal" : "Uncorrected, non-fatal", From fda78d848178fb2b4eea74d96218c6c98fbe8562 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Mon, 19 Jan 2026 18:40:58 -0800 Subject: [PATCH 22/59] PCI/AER: Update struct aer_err_info with kernel-doc formatting Update the existing 'struct aer_err_info' definition to use kernel-doc formatting. Remove the inline comments to reduce noise and do not introduce functional changes. This will improve readability and maintainability. Signed-off-by: Terry Bowman Reviewed-by: Dan Williams Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-16-terry.bowman@amd.com Acked-by: Bjorn Helgaas Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/pci/pci.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 41ec38e82c08..8ccb3ba61e11 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -724,16 +724,35 @@ static inline bool pci_dev_binding_disallowed(struct pci_dev *dev) #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ +/** + * struct aer_err_info - AER Error Information + * @dev: Devices reporting error + * @ratelimit_print: Flag to log or not log the devices' error. 0=NotLog/1=Log + * @__pad1: Padding for alignment + * @error_dev_num: Number of devices reporting an error + * @level: printk level to use in logging + * @id: Value from register PCI_ERR_ROOT_ERR_SRC + * @severity: AER severity, 0-UNCOR Non-fatal, 1-UNCOR fatal, 2-COR + * @root_ratelimit_print: Flag to log or not log the root's error. 
0=NotLog/1=Log + * @multi_error_valid: If multiple errors are reported + * @first_error: First reported error + * @__pad2: Padding for alignment + * @is_cxl: Bus type error: 0-PCI Bus error, 1-CXL Bus error + * @tlp_header_valid: Indicates if TLP field contains error information + * @status: COR/UNCOR error status + * @mask: COR/UNCOR mask + * @tlp: Transaction packet information + */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int ratelimit_print[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; - const char *level; /* printk level */ + const char *level; unsigned int id:16; - unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ - unsigned int root_ratelimit_print:1; /* 0=skip, 1=print */ + unsigned int severity:2; + unsigned int root_ratelimit_print:1; unsigned int __pad1:4; unsigned int multi_error_valid:1; @@ -742,9 +761,9 @@ struct aer_err_info { unsigned int is_cxl:1; unsigned int tlp_header_valid:1; - unsigned int status; /* COR/UNCOR Error Status */ - unsigned int mask; /* COR/UNCOR Error Mask */ - struct pcie_tlp_log tlp; /* TLP Header */ + unsigned int status; + unsigned int mask; + struct pcie_tlp_log tlp; }; int aer_get_device_error_info(struct aer_err_info *info, int i); From f953b7d5e19a1310dd5d92b86bafc5957847b4d6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 14 Jan 2026 12:20:37 -0600 Subject: [PATCH 23/59] cxl/mem: Clarify @host for devm_cxl_add_nvdimm() The convention for devm_ helpers in the CXL driver is that the first argument is the @host for the operation (locked driver::probe() context). 
Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114182055.46029-17-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/pmem.c | 13 +++++++------ drivers/cxl/cxl.h | 3 ++- drivers/cxl/mem.c | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index 8853415c106a..e7b1e6fa0ea0 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -237,12 +237,13 @@ static void cxlmd_release_nvdimm(void *_cxlmd) /** * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm - * @parent_port: parent port for the (to be added) @cxlmd endpoint port - * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations + * @host: host device for devm operations + * @port: any port in the CXL topology to find the nvdimm-bridge device + * @cxlmd: parent of the to be created cxl_nvdimm device * * Return: 0 on success negative error code on failure. 
*/ -int devm_cxl_add_nvdimm(struct cxl_port *parent_port, +int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, struct cxl_memdev *cxlmd) { struct cxl_nvdimm_bridge *cxl_nvb; @@ -250,7 +251,7 @@ int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct device *dev; int rc; - cxl_nvb = cxl_find_nvdimm_bridge(parent_port); + cxl_nvb = cxl_find_nvdimm_bridge(port); if (!cxl_nvb) return -ENODEV; @@ -270,10 +271,10 @@ int devm_cxl_add_nvdimm(struct cxl_port *parent_port, if (rc) goto err; - dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); + dev_dbg(host, "register %s\n", dev_name(dev)); /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ - return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd); + return devm_add_action_or_reset(host, cxlmd_release_nvdimm, cxlmd); err: put_device(dev); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 42a76a7a088f..6f3741a57932 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -887,7 +887,8 @@ struct cxl_nvdimm_bridge *devm_cxl_add_nvdimm_bridge(struct device *host, struct cxl_port *port); struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); bool is_cxl_nvdimm(struct device *dev); -int devm_cxl_add_nvdimm(struct cxl_port *parent_port, struct cxl_memdev *cxlmd); +int devm_cxl_add_nvdimm(struct device *host, struct cxl_port *port, + struct cxl_memdev *cxlmd); struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_port *port); #ifdef CONFIG_CXL_REGION diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index 6e6777b7bafb..c2ee7f7f6320 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -153,7 +153,7 @@ static int cxl_mem_probe(struct device *dev) } if (cxl_pmem_size(cxlds) && IS_ENABLED(CONFIG_CXL_PMEM)) { - rc = devm_cxl_add_nvdimm(parent_port, cxlmd); + rc = devm_cxl_add_nvdimm(dev, parent_port, cxlmd); if (rc) { if (rc == -ENODEV) dev_info(dev, "PMEM disabled by platform\n"); From 9a8920ca8ebfb99604f639e7fbc681d0d04518a0 Mon Sep 17 00:00:00 2001 From: 
Terry Bowman Date: Wed, 14 Jan 2026 12:20:38 -0600 Subject: [PATCH 24/59] cxl: Update RAS handler interfaces to also support CXL Ports CXL PCIe Port Protocol Error handling support will be added to the CXL drivers in the future. In preparation, update the existing interfaces to support handling all CXL PCIe Port Protocol Errors. The driver's RAS support functions currently rely on a 'struct cxl_dev_state' type parameter, which is not available for CXL Port devices. However, since the same CXL RAS capability structure [1] is needed across most CXL components and devices, a common handling approach should be adopted. To accommodate this, update the cxl_handle_cor_ras() and cxl_handle_ras() functions to use a `struct device` instead of `struct cxl_dev_state`. No functional changes are introduced. [1] CXL 3.1 Spec, 8.2.4 CXL.cache and CXL.mem Registers Signed-off-by: Terry Bowman Reviewed-by: Alejandro Lucero Reviewed-by: Ira Weiny Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Ben Cheatham Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260114182055.46029-18-terry.bowman@amd.com Signed-off-by: Dan Williams Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 14 +++++--------- drivers/cxl/core/ras.c | 12 ++++++------ drivers/cxl/core/ras_rch.c | 4 ++-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 724361195057..422531799af2 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -147,8 +147,8 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, #ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); -bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); -void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base); +bool cxl_handle_ras(struct device *dev, void __iomem *ras_base); +void cxl_handle_cor_ras(struct device
*dev, void __iomem *ras_base); void cxl_dport_map_rch_aer(struct cxl_dport *dport); void cxl_disable_rch_root_ints(struct cxl_dport *dport); void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); @@ -157,16 +157,12 @@ static inline int cxl_ras_init(void) { return 0; } - -static inline void cxl_ras_exit(void) -{ -} - -static inline bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +static inline void cxl_ras_exit(void) { } +static inline bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) { return false; } -static inline void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) { } +static inline void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { } static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index b933030b8e1e..72908f3ced77 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -160,7 +160,7 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) } EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); -void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { void __iomem *addr; u32 status; @@ -172,7 +172,7 @@ void cxl_handle_cor_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) status = readl(addr); if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(cxlds->cxlmd, status); + trace_cxl_aer_correctable_error(to_cxl_memdev(dev), status); } } @@ -197,7 +197,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log) * Log the state of the RAS status registers and prepare them to log the * next error status. 
Return 1 if reset needed. */ -bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) +bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) { u32 hl[CXL_HEADERLOG_SIZE_U32]; void __iomem *addr; @@ -224,7 +224,7 @@ bool cxl_handle_ras(struct cxl_dev_state *cxlds, void __iomem *ras_base) } header_log_copy(ras_base, hl); - trace_cxl_aer_uncorrectable_error(cxlds->cxlmd, status, fe, hl); + trace_cxl_aer_uncorrectable_error(to_cxl_memdev(dev), status, fe, hl); writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true; @@ -246,7 +246,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev) if (cxlds->rcd) cxl_handle_rdport_errors(cxlds); - cxl_handle_cor_ras(cxlds, cxlds->regs.ras); + cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); @@ -275,7 +275,7 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, * chance the situation is recoverable dump the status of the RAS * capability registers and bounce the active state of the memdev. 
*/ - ue = cxl_handle_ras(cxlds, cxlds->regs.ras); + ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); } diff --git a/drivers/cxl/core/ras_rch.c b/drivers/cxl/core/ras_rch.c index ed58afd18ecc..0a8b3b9b6388 100644 --- a/drivers/cxl/core/ras_rch.c +++ b/drivers/cxl/core/ras_rch.c @@ -115,7 +115,7 @@ void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) pci_print_aer(pdev, severity, &aer_regs); if (severity == AER_CORRECTABLE) - cxl_handle_cor_ras(cxlds, dport->regs.ras); + cxl_handle_cor_ras(&cxlds->cxlmd->dev, dport->regs.ras); else - cxl_handle_ras(cxlds, dport->regs.ras); + cxl_handle_ras(&cxlds->cxlmd->dev, dport->regs.ras); } From 2489d83c22ce9e44425469960677e6dbfd68adcc Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 19 Dec 2025 12:05:37 -0500 Subject: [PATCH 25/59] Documentation/driver-api/cxl: BIOS/EFI expectation update Add a snippet about what Linux expects BIOS/EFI to do (and not to do) to the BIOS/EFI section. Suggested-by: Alejandro Lucero Palau Signed-off-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Palau Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251219170538.1675743-2-gourry@gourry.net Signed-off-by: Dave Jiang --- .../driver-api/cxl/platform/bios-and-efi.rst | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Documentation/driver-api/cxl/platform/bios-and-efi.rst b/Documentation/driver-api/cxl/platform/bios-and-efi.rst index a9aa0ccd92af..9034c206cf8e 100644 --- a/Documentation/driver-api/cxl/platform/bios-and-efi.rst +++ b/Documentation/driver-api/cxl/platform/bios-and-efi.rst @@ -29,6 +29,26 @@ at :doc:`ACPI Tables `. on physical memory region size and alignment, memory holes, HDM interleave, and what linux expects of HDM decoders trying to work with these features. 
+ +Linux Expectations of BIOS/EFI Software +======================================= +Linux expects BIOS/EFI software to construct sufficient ACPI tables (such as +CEDT, SRAT, HMAT, etc) and platform-specific configurations (such as HPA spaces +and host-bridge interleave configurations) to allow the Linux driver to +subsequently configure the devices in the CXL fabric at runtime. + +Programming of HDM decoders and switch ports is not required, and may be +deferred to the CXL driver based on admin policy (e.g. udev rules). + +Some platforms may require pre-programming HDM decoders and locking them +due to quirks (see: Zen5 address translation), but this is not the normal, +"expected" configuration path. This should be avoided if possible. + +Some platforms may wish to pre-configure these resources to bring memory +up without requiring CXL driver support. These platform vendors should +test their configurations with the existing CXL driver and provide driver +support for their auto-configurations if features like RAS are required. + UEFI Settings ============= If your platform supports it, the :code:`uefisettings` command can be used to From 7362facf6ec14f70fe28413cb484639d783b89f0 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 19 Dec 2025 12:05:38 -0500 Subject: [PATCH 26/59] Documentation/driver-api/cxl: device hotplug section Describe cxl memory device hotplug implications, in particular how the platform CEDT CFMWS must be described to support successful hot-add of memory devices. 
Reviewed-by: Jonathan Cameron Signed-off-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Alejandro Lucero Palau Link: https://patch.msgid.link/20251219170538.1675743-3-gourry@gourry.net Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/index.rst | 1 + .../driver-api/cxl/platform/bios-and-efi.rst | 3 + .../cxl/platform/device-hotplug.rst | 130 ++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 Documentation/driver-api/cxl/platform/device-hotplug.rst diff --git a/Documentation/driver-api/cxl/index.rst b/Documentation/driver-api/cxl/index.rst index c1106a68b67c..5a734988a5af 100644 --- a/Documentation/driver-api/cxl/index.rst +++ b/Documentation/driver-api/cxl/index.rst @@ -30,6 +30,7 @@ that have impacts on each other. The docs here break up configurations steps. platform/acpi platform/cdat platform/example-configs + platform/device-hotplug .. toctree:: :maxdepth: 2 diff --git a/Documentation/driver-api/cxl/platform/bios-and-efi.rst b/Documentation/driver-api/cxl/platform/bios-and-efi.rst index 9034c206cf8e..a4b44c018f09 100644 --- a/Documentation/driver-api/cxl/platform/bios-and-efi.rst +++ b/Documentation/driver-api/cxl/platform/bios-and-efi.rst @@ -49,6 +49,9 @@ up without requiring CXL driver support. These platform vendors should test their configurations with the existing CXL driver and provide driver support for their auto-configurations if features like RAS are required. +Platforms requiring boot-time programming and/or locking of CXL fabric +components may prevent features, such as device hot-plug, from working. + UEFI Settings ============= If your platform supports it, the :code:`uefisettings` command can be used to diff --git a/Documentation/driver-api/cxl/platform/device-hotplug.rst b/Documentation/driver-api/cxl/platform/device-hotplug.rst new file mode 100644 index 000000000000..e4a065fdd3ec --- /dev/null +++ b/Documentation/driver-api/cxl/platform/device-hotplug.rst @@ -0,0 +1,130 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================== +CXL Device Hotplug +================== + +Device hotplug refers to *physical* hotplug of a device (addition or removal +of a physical device from the machine). + +BIOS/EFI software is expected to configure sufficient resources **at boot +time** to allow hotplugged devices to be configured by software (such as +proximity domains, HPA regions, and host-bridge configurations). + +BIOS/EFI is not expected (**nor suggested**) to configure hotplugged +devices at hotplug time (i.e. HDM decoders should be left unprogrammed). + +This document covers some examples of those resources, but should not +be considered exhaustive. + +Hot-Remove +========== +Hot removal of a device typically requires careful removal of software +constructs (memory regions, associated drivers) which manage these devices. + +Hard-removing a CXL.mem device without carefully tearing down driver stacks +is likely to cause the system to machine-check (or at least SIGBUS if memory +access is limited to user space). + +Memory Device Hot-Add +===================== +A device present at boot may be associated with a CXL Fixed Memory Window +reported in :doc:`CEDT`. That CFMWS may match the size of the +device, but the construction of the CEDT CFMWS is platform-defined. + +Hot-adding a memory device requires this pre-defined, **static** CFMWS to +have sufficient HPA space to describe that device. + +There are a few common scenarios to consider. + +Single-Endpoint Memory Device Present at Boot +--------------------------------------------- +A device present at boot likely had its capacity reported in the +:doc:`CEDT`. If a device is removed and a new device hotplugged, +the capacity of the new device will be limited to the original CFMWS capacity. + +Adding capacity larger than the original device will cause memory region +creation to fail if the region size is greater than the CFMWS size. + +The CFMWS is **static** and cannot be adjusted. 
Platforms which may expect +different sized devices to be hotplugged must allocate sufficient CFMWS space +**at boot time** to cover all future expected devices. + +Multi-Endpoint Memory Device Present at Boot +-------------------------------------------- +Non-switch-based Multi-Endpoint devices are outside the scope of what the +CXL specification describes, but they are technically possible. We describe +them here for instructive reasons only - this does not imply Linux support. + +A hot-plug capable CXL memory device, such as one which presents multiple +expanders as a single large-capacity device, should report the **maximum +possible capacity** for the device at boot. :: + + HB0 + RP0 + | + [Multi-Endpoint Memory Device] + _____|_____ + | | + [Endpoint0] [Empty] + + +Limiting the size to the capacity preset at boot will limit hot-add support +to replacing capacity that was present at boot. + +No CXL Device Present at Boot +----------------------------- +When no CXL memory device is present on boot, some platforms omit the CFMWS +in the :doc:`CEDT`. When this occurs, hot-add is not possible. + +This describes the base case for any given device not being present at boot. +If a future possible device is not described in the CEDT at boot, hot-add +of that device is either limited or not possible. + +For a platform to support hot-add of a full memory device, it must allocate +a CEDT CFMWS region with sufficient memory capacity to cover all future +potentially added capacity (along with any relevant CEDT CHBS entry). + +To support memory hotplug directly on the host bridge/root port, or on a switch +downstream of the host bridge, a platform must construct a CEDT CFMWS at boot +with sufficient resources to support the max possible (or expected) hotplug +memory capacity. 
:: + + HB0 HB1 + RP0 RP1 RP2 + | | | + Empty Empty USP + ________|________ + | | | | + DSP DSP DSP DSP + | | | | + All Empty + +For example, a BIOS/EFI may expose an option to configure a CEDT CFMWS with +a pre-configured amount of memory capacity (per host bridge, or host bridge +interleave set), even if no device is attached to Root Ports or Downstream +Ports at boot (as depicted in the figure above). + + +Interleave Sets +=============== + +Host Bridge Interleave +---------------------- +Host-bridge interleaved memory regions are defined **statically** in the +:doc:`CEDT`. To apply cross-host-bridge interleave, a CFMWS entry +describing that interleave must have been provided **at boot**. Hotplugged +devices cannot add host-bridge interleave capabilities at hotplug time. + +See the :doc:`Flexible CEDT Configuration` +example to see how a platform can provide this kind of flexibility regarding +hotplugged memory devices. BIOS/EFI software should consider options to +present flexible CEDT configurations with hotplug support. + +HDM Interleave +-------------- +Decoder-applied interleave can flexibly handle hotplugged devices, as decoders +can be re-programmed after hotplug. + +To add or remove a device to/from an existing HDM-applied interleaved region, +that region must be torn down and re-created. From 4dd05f02f1d618da610e7d3bd479c47a96b4fc3f Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Mon, 5 Jan 2026 12:38:33 -0800 Subject: [PATCH 27/59] cxl/pci: Remove outdated FIXME comment and BUILD_BUG_ON Remove the outdated FIXME comment about switching to struct_group() and the associated BUILD_BUG_ON check. This work was already completed in commit 301e68dd9b9b ("cxl/core: Replace unions with struct_group()") which converted struct cxl_regs to use struct_group_tagged(). The BUILD_BUG_ON was checking that anonymous union layout was preserved, but since struct_group() now handles this correctly, the compile-time check is no longer necessary.
Signed-off-by: Samasth Norway Ananda Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260105203833.1604585-1-samasth.norway.ananda@oracle.com Signed-off-by: Dave Jiang --- drivers/cxl/pci.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 0be4e508affe..3b2293dffb3f 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -912,13 +912,6 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) unsigned int i; bool irq_avail; - /* - * Double check the anonymous union trickery in struct cxl_regs - * FIXME switch to struct_group() - */ - BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) != - offsetof(struct cxl_regs, device_regs.memdev)); - rc = pcim_enable_device(pdev); if (rc) return rc; From e5b1887619403c2da25a5899cad3e1ab34e7717f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 9 Jan 2026 13:29:51 +0100 Subject: [PATCH 28/59] cxl/hdm: Fix newline character in dev_err() messages The newline character is not placed at the end of the string. This causes unintended line wraps, broken log level and unterminated log messages. Fix that for all messages. Note that the messages are changed to use colons now instead of parentheses, which is more common use. 
Fixes: 24b18197184a ("cxl/hdm: Extend DVSEC range register emulation for region enumeration") Fixes: 9c57cde0dcbd ("cxl/hdm: Enumerate allocated DPA") Signed-off-by: Robert Richter Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260109122952.639231-1-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 1c5d2022c87a..6e516c69b2d2 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -966,7 +966,7 @@ static int cxl_setup_hdm_decoder_from_dvsec( rc = devm_cxl_dpa_reserve(cxled, *dpa_base, len, 0); if (rc) { dev_err(&port->dev, - "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx: %d\n", port->id, cxld->id, *dpa_base, *dpa_base + len - 1, rc); return rc; } @@ -1117,7 +1117,7 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, rc = devm_cxl_dpa_reserve(cxled, *dpa_base + skip, dpa_size, skip); if (rc) { dev_err(&port->dev, - "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx\n (%d)", + "decoder%d.%d: Failed to reserve DPA range %#llx - %#llx: %d\n", port->id, cxld->id, *dpa_base, *dpa_base + dpa_size + skip - 1, rc); return rc; From 99698e70148fbce4410799570adac8456204fa37 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Fri, 9 Jan 2026 23:40:42 +0800 Subject: [PATCH 29/59] cxl/acpi: Remove cxl_acpi_set_cache_size() cxl_acpi_set_cache_size() returns an error only when the size of the cache range does not match the CXL address range. Almost all of the cache size setup is implemented in cxl_acpi_set_cache_size(); cxl_setup_extended_linear_cache() does nothing except print a warning in the above error case, and cxl_acpi_set_cache_size() already prints a warning at the same time. So consolidate these two functions into one, keeping the function name cxl_setup_extended_linear_cache().
Signed-off-by: Li Ming Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260109154042.331296-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 77ac940e3013..e65dfae42bde 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -357,7 +357,7 @@ static int add_or_reset_cxl_resource(struct resource *parent, struct resource *r return rc; } -static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) +static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) { struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct range *hpa = &cxld->hpa_range; @@ -367,12 +367,14 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) struct resource res; int nid, rc; + /* Explicitly initialize cache size to 0 at the beginning */ + cxlrd->cache_size = 0; res = DEFINE_RES_MEM(start, size); nid = phys_to_target_node(start); rc = hmat_get_extended_linear_cache_size(&res, nid, &cache_size); if (rc) - return 0; + return; /* * The cache range is expected to be within the CFMWS. @@ -384,31 +386,10 @@ static int cxl_acpi_set_cache_size(struct cxl_root_decoder *cxlrd) dev_warn(&cxld->dev, "Extended Linear Cache size %pa != CXL size %pa. No Support!", &cache_size, &size); - return -ENXIO; + return; } cxlrd->cache_size = cache_size; - - return 0; -} - -static void cxl_setup_extended_linear_cache(struct cxl_root_decoder *cxlrd) -{ - int rc; - - rc = cxl_acpi_set_cache_size(cxlrd); - if (rc) { - /* - * Failing to retrieve extended linear cache region resize does not - * prevent the region from functioning. Only causes cxl list showing - * incorrect region size. 
- */ - dev_warn(cxlrd->cxlsd.cxld.dev.parent, - "Extended linear cache retrieval failed rc:%d\n", rc); - - /* Ignoring return code */ - cxlrd->cache_size = 0; - } } DEFINE_FREE(put_cxlrd, struct cxl_root_decoder *, From 4ed7952b9e87cf731ebc8251874416e60eb15230 Mon Sep 17 00:00:00 2001 From: "Cheatham, Benjamin" Date: Fri, 9 Jan 2026 07:57:38 -0600 Subject: [PATCH 30/59] cxl/core: Fix cxl_dport debugfs EINJ entries Protocol error injection is only valid for CXL 2.0+ root ports and CXL 1.1 memory-mapped downstream ports as per the ACPI v6.5 spec (Table 8-31). The core code currently creates an 'einj_inject' file in CXL debugfs for all CXL 1.1 downstream ports and all PCI CXL 2.0+ downstream ports. This results in debugfs EINJ files that won't work due to platform/spec restrictions. Fix by limiting 'einj_inject' file creation to only CXL 1.1 dports and CXL 2.0+ root ports. Update the comment above the check to more accurately represent the requirements expected by the EINJ module and ACPI spec. Fixes: 8039804cfa73 ("cxl/core: Add CXL EINJ debugfs files") Signed-off-by: Ben Cheatham Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Reviewed-by: Dave Jiang Link: https://patch.msgid.link/6e9fb657-8264-4028-92e2-5428e2695bf1@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fef3aa0c6680..54f72452fb06 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -822,16 +822,18 @@ DEFINE_DEBUGFS_ATTRIBUTE(cxl_einj_inject_fops, NULL, cxl_einj_inject, static void cxl_debugfs_create_dport_dir(struct cxl_dport *dport) { + struct cxl_port *parent = parent_port_of(dport->port); struct dentry *dir; if (!einj_cxl_is_initialized()) return; /* - * dport_dev needs to be a PCIe port for CXL 2.0+ ports because - * EINJ expects a dport SBDF to be specified for 2.0 error injection. 
+ * Protocol error injection is only available for CXL 2.0+ root ports + * and CXL 1.1 downstream ports */ - if (!dport->rch && !dev_is_pci(dport->dport_dev)) + if (!dport->rch && + !(dev_is_pci(dport->dport_dev) && parent && is_cxl_root(parent))) return; dir = cxl_debugfs_create_dir(dev_name(dport->dport_dev)); From e639055f1f30311db91cafb36e408cc727c7d445 Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 15 Jan 2026 20:58:36 -0800 Subject: [PATCH 31/59] cxl/region: Translate DPA->HPA in unaligned MOD3 regions The CXL driver implementation of DPA->HPA address translation depends on a region's starting address always being aligned to Host Bridge Interleave Ways * 256MB. The driver follows the decode methods defined in the CXL Spec[1] and expanded upon in the CXL Driver Writers Guide[2], which describe bit manipulations based on power-of-2 alignment to translate a DPA to an HPA. With the introduction of MOD3 interleave way support, platforms may create regions at starting addresses that are not power-of-2 aligned. This allows platforms to avoid gaps in the memory map, but addresses within those regions cannot be translated using the existing bit manipulation method. Introduce an unaligned translation method for DPA->HPA that reconstructs an HPA by restoring the address first at the port level and then at the host bridge level. 
[1] CXL Spec 4.0 8.2.4.20.13 Implementation Note Device Decoder Logic [2] CXL Type 3 Memory Software Guide 1.1 2.13.25 DPA to HPA Translation Suggested-by: Qing Huang Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Signed-off-by: Alison Schofield Link: https://patch.msgid.link/e7c53215bf69f2ff1ae7e58bcc49ca387b7b0299.1768538962.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 160 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ae899f68551f..cdfa454b940d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3112,13 +3112,146 @@ u64 cxl_calculate_hpa_offset(u64 dpa_offset, int pos, u8 eiw, u16 eig) } EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); +static int decode_pos(int region_ways, int hb_ways, int pos, int *pos_port, + int *pos_hb) +{ + int devices_per_hb; + + /* + * Decode for 3-6-12 way interleaves as defined in the CXL + * Spec 4.0 9.13.1.1 Legal Interleaving Configurations. + * Region creation should prevent invalid combinations but + * sanity check here to avoid a silent bad decode. + */ + switch (hb_ways) { + case 3: + if (region_ways != 3 && region_ways != 6 && region_ways != 12) + return -EINVAL; + break; + case 6: + if (region_ways != 6 && region_ways != 12) + return -EINVAL; + break; + case 12: + if (region_ways != 12) + return -EINVAL; + break; + default: + return -EINVAL; + } + /* + * Each host bridge contributes an equal number of endpoints + * that are laid out contiguously per host bridge. Modulo + * selects the port within a host bridge and division selects + * the host bridge position. 
+ */ + devices_per_hb = region_ways / hb_ways; + *pos_port = pos % devices_per_hb; + *pos_hb = pos / devices_per_hb; + + return 0; +} + +/* + * restore_parent() reconstruct the address in parent + * + * This math, specifically the bitmask creation 'mask = gran - 1' relies + * on the CXL Spec requirement that interleave granularity is always a + * power of two. + * + * [mask] isolate the offset with the granularity + * [addr & ~mask] remove the offset leaving the aligned portion + * [* ways] distribute across all interleave ways + * [+ (pos * gran)] add the positional offset + * [+ (addr & mask)] restore the masked offset + */ +static u64 restore_parent(u64 addr, u64 pos, u64 gran, u64 ways) +{ + u64 mask = gran - 1; + + return ((addr & ~mask) * ways) + (pos * gran) + (addr & mask); +} + +/* + * unaligned_dpa_to_hpa() translates a DPA to HPA when the region resource + * start address is not aligned at Host Bridge Interleave Ways * 256MB. + * + * Unaligned start addresses only occur with MOD3 interleaves. All power- + * of-two interleaves are guaranteed aligned. + */ +static u64 unaligned_dpa_to_hpa(struct cxl_decoder *cxld, + struct cxl_region_params *p, int pos, u64 dpa) +{ + int ways_port = p->interleave_ways / cxld->interleave_ways; + int gran_port = p->interleave_granularity; + int gran_hb = cxld->interleave_granularity; + int ways_hb = cxld->interleave_ways; + int pos_port, pos_hb, gran_shift; + u64 hpa_port = 0; + + /* Decode an endpoint 'pos' into port and host-bridge components */ + if (decode_pos(p->interleave_ways, ways_hb, pos, &pos_port, &pos_hb)) { + dev_dbg(&cxld->dev, "not supported for region ways:%d\n", + p->interleave_ways); + return ULLONG_MAX; + } + + /* Restore the port parent address if needed */ + if (gran_hb != gran_port) + hpa_port = restore_parent(dpa, pos_port, gran_port, ways_port); + else + hpa_port = dpa; + + /* + * Complete the HPA reconstruction by restoring the address as if + * each HB position is a candidate. 
Test against expected pos_hb + * to confirm match. + */ + gran_shift = ilog2(gran_hb); + for (int position = 0; position < ways_hb; position++) { + u64 shifted, hpa; + + hpa = restore_parent(hpa_port, position, gran_hb, ways_hb); + hpa += p->res->start; + + shifted = hpa >> gran_shift; + if (do_div(shifted, ways_hb) == pos_hb) + return hpa; + } + + dev_dbg(&cxld->dev, "fail dpa:%#llx region:%pr pos:%d\n", dpa, p->res, + pos); + dev_dbg(&cxld->dev, " port-w/g/p:%d/%d/%d hb-w/g/p:%d/%d/%d\n", + ways_port, gran_port, pos_port, ways_hb, gran_hb, pos_hb); + + return ULLONG_MAX; +} + +static bool region_is_unaligned_mod3(struct cxl_region *cxlr) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p = &cxlr->params; + int hbiw = cxld->interleave_ways; + u64 rem; + + if (is_power_of_2(hbiw)) + return false; + + div64_u64_rem(p->res->start, (u64)hbiw * SZ_256M, &rem); + + return (rem != 0); +} + u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; u64 dpa_offset, hpa_offset, hpa; + bool unaligned = false; u16 eig = 0; u8 eiw = 0; int pos; @@ -3132,15 +3265,32 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, if (!cxled) return ULLONG_MAX; + dpa_offset = dpa - cxl_dpa_resource_start(cxled); + + /* Unaligned calc for MOD3 interleaves not hbiw * 256MB aligned */ + unaligned = region_is_unaligned_mod3(cxlr); + if (unaligned) { + hpa = unaligned_dpa_to_hpa(cxld, p, cxled->pos, dpa_offset); + if (hpa == ULLONG_MAX) + return ULLONG_MAX; + + goto skip_aligned; + } + /* + * Aligned calc for all power-of-2 interleaves and for MOD3 + * interleaves that are aligned at hbiw * 256MB + */ pos = cxled->pos; 
ways_to_eiw(p->interleave_ways, &eiw); granularity_to_eig(p->interleave_granularity, &eig); - dpa_offset = dpa - cxl_dpa_resource_start(cxled); hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, eiw, eig); /* Apply the hpa_offset to the region base address */ - hpa = hpa_offset + p->res->start + p->cache_size; + hpa = hpa_offset + p->res->start; + +skip_aligned: + hpa += p->cache_size; /* Root decoder translation overrides typical modulo decode */ if (cxlrd->ops.hpa_to_spa) @@ -3151,9 +3301,9 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, "Addr trans fail: hpa 0x%llx not in region\n", hpa); return ULLONG_MAX; } - - /* Simple chunk check, by pos & gran, only applies to modulo decodes */ - if (!cxlrd->ops.hpa_to_spa && !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) + /* Chunk check applies to aligned modulo decodes only */ + if (!unaligned && !cxlrd->ops.hpa_to_spa && + !cxl_is_hpa_in_chunk(hpa, cxlr, pos)) return ULLONG_MAX; return hpa; From b51792fd9168e581e51be98e22df5f79454e22de Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Thu, 15 Jan 2026 20:58:37 -0800 Subject: [PATCH 32/59] cxl/region: Translate HPA to DPA and memdev in unaligned regions The CXL driver supports an expert user debugfs interface to inject and clear poison by a region offset. That feature requires translating a HPA (the region address) to a DPA and a memdev to perform the poison operation. Unaligned regions do not have an algebraically invertible mapping from HPA to DPA due to the region offset skew. The region base is not aligned to a full interleave. Add a helper to perform the unaligned translations that first calculates the DPA offset and then tests it against each candidate endpoint decoder. 
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Alison Schofield Link: https://patch.msgid.link/f338b7aff7e4574fcc525b1a0d4f09786bfb6489.1768538962.git.alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index cdfa454b940d..d5979000fba1 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3314,6 +3314,48 @@ struct dpa_result { u64 dpa; }; +static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, + u64 offset, + struct dpa_result *result) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p = &cxlr->params; + u64 interleave_width, interleave_index; + u64 gran, gran_offset, dpa_offset; + u64 hpa = p->res->start + offset; + + /* + * Unaligned addresses are not algebraically invertible. Calculate + * a dpa_offset independent of the target device and then enumerate + * and test that dpa_offset against each candidate endpoint decoder. 
+ */ + gran = cxld->interleave_granularity; + interleave_width = gran * cxld->interleave_ways; + interleave_index = div64_u64(offset, interleave_width); + gran_offset = div64_u64_rem(offset, gran, NULL); + + dpa_offset = interleave_index * gran + gran_offset; + + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + int pos = cxled->pos; + u64 test_hpa; + + test_hpa = unaligned_dpa_to_hpa(cxld, p, pos, dpa_offset); + if (test_hpa == hpa) { + result->cxlmd = cxled_to_memdev(cxled); + result->dpa = + cxl_dpa_resource_start(cxled) + dpa_offset; + return 0; + } + } + dev_err(&cxlr->dev, + "failed to resolve HPA %#llx in unaligned MOD3 region\n", hpa); + + return -ENXIO; +} + static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct dpa_result *result) { @@ -3343,6 +3385,10 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, hpa_offset = offset; } + if (region_is_unaligned_mod3(cxlr)) + return unaligned_region_offset_to_dpa_result(cxlr, offset, + result); + pos = cxl_calculate_position(hpa_offset, eiw, eig); if (pos < 0 || pos >= p->nr_targets) { dev_dbg(&cxlr->dev, "Invalid position %d for %d targets\n", From 064c098790944fa44f6aa704eb55a5c3ed65a2fa Mon Sep 17 00:00:00 2001 From: Alison Schofield Date: Fri, 16 Jan 2026 20:47:30 -0800 Subject: [PATCH 33/59] cxl/region: Use do_div() for 64-bit modulo operation div64_u64_rem() was the wrong choice for doing a modulo operation and it was used incorrectly, causing a kernel oops by passing NULL as the remainder parameter. Replace it with the do_div() helper that does the intended math (gran_offset = offset % gran) and is architecture safe. This bug appeared during testing of unaligned address translations. The visibility to userspace would be limited to folks doing poison injection or clear by HPA on unaligned regions. 
Fixes: 78b50b598462 ("cxl/region: Translate HPA to DPA and memdev in unaligned regions") Signed-off-by: Alison Schofield Link: https://patch.msgid.link/20260117044732.567831-1-alison.schofield@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index d5979000fba1..96888d87a8df 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3324,6 +3324,7 @@ static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, u64 interleave_width, interleave_index; u64 gran, gran_offset, dpa_offset; u64 hpa = p->res->start + offset; + u64 tmp = offset; /* * Unaligned addresses are not algebraically invertible. Calculate @@ -3333,7 +3334,7 @@ static int unaligned_region_offset_to_dpa_result(struct cxl_region *cxlr, gran = cxld->interleave_granularity; interleave_width = gran * cxld->interleave_ways; interleave_index = div64_u64(offset, interleave_width); - gran_offset = div64_u64_rem(offset, gran, NULL); + gran_offset = do_div(tmp, gran); dpa_offset = interleave_index * gran + gran_offset; From 7b6f9d9b1ea05c9c22570126547c780e8c6c3f62 Mon Sep 17 00:00:00 2001 From: Yuxiong Wang Date: Thu, 29 Jan 2026 14:45:52 +0800 Subject: [PATCH 34/59] cxl: Fix premature commit_end increment on decoder commit failure In cxl_decoder_commit(), commit_end is incremented before verifying whether the commit succeeded, and the CXL_DECODER_F_ENABLE bit in cxld->flags is only set after a successful commit. As a result, if the commit fails, commit_end has been incremented and cxld->reset() has no effect since the flag is not set, so commit_end remains incorrectly incremented. The inconsistency between commit_end and CXL_DECODER_F_ENABLE causes failures during subsequent commit or reset operations. Fix this by incrementing commit_end only after confirming the commit succeeded. Also, remove the ineffective cxld->reset() call.
According to CXL Spec r4.0 8.2.4.20.12 Committing Decoder Programming, since cxld_await_commit() has cleared the decoder commit bit on failure, no additional reset is required. [dj: Fixed commit log 80 char wrapping. ] [dj: Fix "Fixes" tag to correct hash length. ] [dj: Change spec to r4.0. ] Fixes: 176baefb2eb5 ("cxl/hdm: Commit decoder state to hardware") Signed-off-by: Yuxiong Wang Acked-by: Huang Ying Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Link: https://patch.msgid.link/20260129064552.31180-1-yuxiong.wang@linux.alibaba.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 6e516c69b2d2..1097de03a2bb 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -844,14 +844,13 @@ static int cxl_decoder_commit(struct cxl_decoder *cxld) scoped_guard(rwsem_read, &cxl_rwsem.dpa) setup_hw_decoder(cxld, hdm); - port->commit_end++; rc = cxld_await_commit(hdm, cxld->id); if (rc) { dev_dbg(&port->dev, "%s: error %d committing decoder\n", dev_name(&cxld->dev), rc); - cxld->reset(cxld); return rc; } + port->commit_end++; cxld->flags |= CXL_DECODER_F_ENABLE; return 0; From 47fec713d97fb6b823026f723435b58af541bd8d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:03:55 -0800 Subject: [PATCH 35/59] cxl/port: Cleanup handling of the nr_dports 0 -> 1 transition There are multiple setup actions that can occur for a switch port after it is known that it has at least one active downstream link. That work is currently split between __devm_cxl_add_dport(), the add_dport() helper, and cxl_port_add_dport() where decoder setup occurs. Clean this up by moving all @dport object setup responsibilities into add_dport() and all port effects into cxl_port_add_dport(). add_dport() handles taking a reference on @dport->dport_dev, and cxl_port_add_dport() grows the awareness to setup the port component registers. 
This removes an awkward open-coded xa_erase() from the middle of __devm_cxl_add_dport() and instead tasks cxl_port_add_dport() with calling the common @dport destruction path if anything goes wrong. After this @port->nr_dports is always the count of @dports in the @port->dports xarray, and cxl_dport_remove() is symmetric with add_dport(). With ->nr_dports now reliably tracking the number of dports the use of ida_is_empty() can be dropped. Recall that the ida is only cleared on "release" of decoder objects, and release can be arbitrarily delayed past unregistration. Lastly port->component_reg_phys is no longer reset to CXL_RESOURCE_NONE post setup, no reason is seen to carry that forward. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-2-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fef3aa0c6680..ff899c690d85 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1066,11 +1066,15 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *dport) return -EBUSY; } + /* Arrange for dport_dev to be valid through remove_dport() */ + struct device *dev __free(put_device) = get_device(dport->dport_dev); + rc = xa_insert(&port->dports, (unsigned long)dport->dport_dev, dport, GFP_KERNEL); if (rc) return rc; + retain_and_null_ptr(dev); port->nr_dports++; return 0; } @@ -1099,6 +1103,7 @@ static void cxl_dport_remove(void *data) struct cxl_dport *dport = data; struct cxl_port *port = dport->port; + port->nr_dports--; xa_erase(&port->dports, (unsigned long) dport->dport_dev); put_device(dport->dport_dev); } @@ -1181,21 +1186,6 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, if (rc) return ERR_PTR(rc); - /* - * Setup port register if this 
is the first dport showed up. Having - * a dport also means that there is at least 1 active link. - */ - if (port->nr_dports == 1 && - port->component_reg_phys != CXL_RESOURCE_NONE) { - rc = cxl_port_setup_regs(port, port->component_reg_phys); - if (rc) { - xa_erase(&port->dports, (unsigned long)dport->dport_dev); - return ERR_PTR(rc); - } - port->component_reg_phys = CXL_RESOURCE_NONE; - } - - get_device(dport_dev); rc = devm_add_action_or_reset(host, cxl_dport_remove, dport); if (rc) return ERR_PTR(rc); @@ -1622,7 +1612,16 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, cxl_switch_parse_cdat(new_dport); - if (ida_is_empty(&port->decoder_ida)) { + if (port->nr_dports == 1) { + /* + * Some host bridges are known to not have component regsisters + * available until a root port has trained CXL. Perform that + * setup now. + */ + rc = cxl_port_setup_regs(port, port->component_reg_phys); + if (rc) + return ERR_PTR(rc); + rc = devm_cxl_switch_port_decoders_setup(port); if (rc) return ERR_PTR(rc); From 83ccbaf1a1075ded82329d27de01d3b2681986ec Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:03:56 -0800 Subject: [PATCH 36/59] cxl/port: Reduce number of @dport variables in cxl_port_add_dport() In preparation for refactoring cxl_port_add_dport() to add RAS register setup, cleanup the number of dport variables with a dport_exists() helper. Kill the @dport needed to check for duplicates, rename @new_dport to @dport. 
Reported-by: Jonathan Cameron Closes: http://lore.kernel.org/20260116150119.00003bbd@huawei.com Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-3-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index ff899c690d85..d7b6f52d0adc 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1587,30 +1587,38 @@ static int update_decoder_targets(struct device *dev, void *data) return 0; } +static bool dport_exists(struct cxl_port *port, struct device *dport_dev) +{ + struct cxl_dport *dport = cxl_find_dport_by_dev(port, dport_dev); + + if (dport) { + dev_dbg(&port->dev, "dport%d:%s already exists\n", + dport->port_id, dev_name(dport_dev)); + return true; + } + + return false; +} + DEFINE_FREE(del_cxl_dport, struct cxl_dport *, if (!IS_ERR_OR_NULL(_T)) del_dport(_T)) static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, struct device *dport_dev) { - struct cxl_dport *dport; int rc; device_lock_assert(&port->dev); if (!port->dev.driver) return ERR_PTR(-ENXIO); - dport = cxl_find_dport_by_dev(port, dport_dev); - if (dport) { - dev_dbg(&port->dev, "dport%d:%s already exists\n", - dport->port_id, dev_name(dport_dev)); + if (dport_exists(port, dport_dev)) return ERR_PTR(-EBUSY); - } - struct cxl_dport *new_dport __free(del_cxl_dport) = + struct cxl_dport *dport __free(del_cxl_dport) = devm_cxl_add_dport_by_dev(port, dport_dev); - if (IS_ERR(new_dport)) - return new_dport; + if (IS_ERR(dport)) + return dport; - cxl_switch_parse_cdat(new_dport); + cxl_switch_parse_cdat(dport); if (port->nr_dports == 1) { /* @@ -1626,17 +1634,17 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, if (rc) return ERR_PTR(rc); dev_dbg(&port->dev, "first dport%d:%s 
added with decoders\n", - new_dport->port_id, dev_name(dport_dev)); - return no_free_ptr(new_dport); + dport->port_id, dev_name(dport_dev)); + return no_free_ptr(dport); } /* New dport added, update the decoder targets */ - device_for_each_child(&port->dev, new_dport, update_decoder_targets); + device_for_each_child(&port->dev, dport, update_decoder_targets); - dev_dbg(&port->dev, "dport%d:%s added\n", new_dport->port_id, + dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id, dev_name(dport_dev)); - return no_free_ptr(new_dport); + return no_free_ptr(dport); } static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev, From afa2bdba1ee28e21f30fe5391b0273b58b32e0d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:03:57 -0800 Subject: [PATCH 37/59] cxl/port: Cleanup dport removal with a devres group In preparation for adding more setup actions like RAS register mapping, introduce a devres group to collect all the dport creation / registration actions. This replaces the maintenance tedium of open coding several devm_release_action() calls in del_dport(). 
Tested-by: Terry Bowman Signed-off-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260131000403.2135324-4-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 71 +++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index d7b6f52d0adc..99bbcf9cf236 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1118,6 +1118,57 @@ static void cxl_dport_unlink(void *data) sysfs_remove_link(&port->dev.kobj, link_name); } +static struct device *dport_to_host(struct cxl_dport *dport) +{ + struct cxl_port *port = dport->port; + + if (is_cxl_root(port)) + return port->uport_dev; + return &port->dev; +} + +static void free_dport(void *dport) +{ + kfree(dport); +} + +/* + * Upon return either a group is established with one action (free_dport()), or + * no group established and @dport is freed. + */ +static void *cxl_dport_open_dr_group_or_free(struct cxl_dport *dport) +{ + int rc; + struct device *host = dport_to_host(dport); + void *group = devres_open_group(host, dport, GFP_KERNEL); + + if (!group) { + kfree(dport); + return NULL; + } + + rc = devm_add_action_or_reset(host, free_dport, dport); + if (rc) { + devres_release_group(host, group); + return NULL; + } + + return group; +} + +static void cxl_dport_close_dr_group(struct cxl_dport *dport, void *group) +{ + devres_close_group(dport_to_host(dport), group); +} + +static void del_dport(struct cxl_dport *dport) +{ + devres_release_group(dport_to_host(dport), dport); +} + +/* The dport group id is the dport */ +DEFINE_FREE(cxl_dport_release_dr_group, void *, if (_T) del_dport(_T)) + static struct cxl_dport * __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, int port_id, resource_size_t component_reg_phys, @@ -1143,14 +1194,20 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, CXL_TARGET_STRLEN) return 
ERR_PTR(-EINVAL); - dport = devm_kzalloc(host, sizeof(*dport), GFP_KERNEL); + dport = kzalloc(sizeof(*dport), GFP_KERNEL); if (!dport) return ERR_PTR(-ENOMEM); + /* Just enough init to manage the devres group */ dport->dport_dev = dport_dev; dport->port_id = port_id; dport->port = port; + void *dport_dr_group __free(cxl_dport_release_dr_group) = + cxl_dport_open_dr_group_or_free(dport); + if (!dport_dr_group) + return ERR_PTR(-ENOMEM); + if (rcrb == CXL_RESOURCE_NONE) { rc = cxl_dport_setup_regs(&port->dev, dport, component_reg_phys); @@ -1203,6 +1260,9 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, cxl_debugfs_create_dport_dir(dport); + /* keep the group, and mark the end of devm actions */ + cxl_dport_close_dr_group(dport, no_free_ptr(dport_dr_group)); + return dport; } @@ -1429,15 +1489,6 @@ static void delete_switch_port(struct cxl_port *port) devm_release_action(port->dev.parent, unregister_port, port); } -static void del_dport(struct cxl_dport *dport) -{ - struct cxl_port *port = dport->port; - - devm_release_action(&port->dev, cxl_dport_unlink, dport); - devm_release_action(&port->dev, cxl_dport_remove, dport); - devm_kfree(&port->dev, dport); -} - static void del_dports(struct cxl_port *port) { struct cxl_dport *dport; From 86e756715db22cd79a9726c22644415c46b6b149 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:03:58 -0800 Subject: [PATCH 38/59] cxl/port: Move decoder setup before dport creation There are port setup actions that run on first dport arrival, and there are setup actions that run per dport. RAS register setup is a future additional setup action to run per-port (once the first dport arrives), and each dport also has RAS registers to map. Before adding that, flip the order of "first dport" and "per-dport" actions. This makes allocation symmetric with teardown, "first dport" actions unwind after last dport removed. 
It also allows for using a devres group to collect the unrelated decoder, RAS, and dport setup actions into one group release action. The new cxl_port_open_group() collects "first dport" and "per-dport" into one group that can be released on any failure. This group's lifetime only needs to span the short duration of cxl_port_add_dport() to cleanup all potential damage from failing to add a dport. Contrast that to the "dport" devres group that is called upon to destruct fully formed dport objects. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-5-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 99bbcf9cf236..6a554d0466a1 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1651,10 +1651,14 @@ static bool dport_exists(struct cxl_port *port, struct device *dport_dev) return false; } -DEFINE_FREE(del_cxl_dport, struct cxl_dport *, if (!IS_ERR_OR_NULL(_T)) del_dport(_T)) +/* note this implicitly casts the group back to its @port */ +DEFINE_FREE(cxl_port_release_dr_group, struct cxl_port *, + if (_T) devres_release_group(&_T->dev, _T)) + static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, struct device *dport_dev) { + struct cxl_dport *dport; int rc; device_lock_assert(&port->dev); @@ -1664,14 +1668,13 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, if (dport_exists(port, dport_dev)) return ERR_PTR(-EBUSY); - struct cxl_dport *dport __free(del_cxl_dport) = - devm_cxl_add_dport_by_dev(port, dport_dev); - if (IS_ERR(dport)) - return dport; + /* Temp group for all "first dport" and "per dport" setup actions */ + void *port_dr_group __free(cxl_port_release_dr_group) = + devres_open_group(&port->dev, port, GFP_KERNEL); + if 
(!port_dr_group) + return ERR_PTR(-ENOMEM); - cxl_switch_parse_cdat(dport); - - if (port->nr_dports == 1) { + if (port->nr_dports == 0) { /* * Some host bridges are known to not have component regsisters * available until a root port has trained CXL. Perform that @@ -1684,18 +1687,24 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, rc = devm_cxl_switch_port_decoders_setup(port); if (rc) return ERR_PTR(rc); - dev_dbg(&port->dev, "first dport%d:%s added with decoders\n", - dport->port_id, dev_name(dport_dev)); - return no_free_ptr(dport); } + dport = devm_cxl_add_dport_by_dev(port, dport_dev); + if (IS_ERR(dport)) + return dport; + + /* This group was only needed for early exit above */ + devres_remove_group(&port->dev, no_free_ptr(port_dr_group)); + + cxl_switch_parse_cdat(dport); + /* New dport added, update the decoder targets */ device_for_each_child(&port->dev, dport, update_decoder_targets); dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id, dev_name(dport_dev)); - return no_free_ptr(dport); + return dport; } static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev, From 3864cb60dad5a6c1bd9f444740cf541a1d8cda99 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:03:59 -0800 Subject: [PATCH 39/59] cxl/port: Move dport probe operations to a driver event In preparation for adding more register setup to the cxl_port_add_dport() path (for RAS register mapping), move the dport creation event to a driver callback. This achieves two goals, it puts driver operations logically where they belong, in a driver, and it obviates the gymnastics of DECLARE_TESTABLE() which just makes a mess of grepping for CXL symbols. In other words, a driver callback is less of an ongoing maintenance burden than this DECLARE_TESTABLE arrangement that does not scale and diminishes the grep-ability of the codebase. cxl_port_add_dport() moves mostly unmodified from drivers/cxl/core/port.c. 
The only deliberate change is that it now assumes that the device_lock is held on entry and the driver is attached (just like cxl_port_probe()). Reviewed-by: Terry Bowman Tested-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-6-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/hdm.c | 6 +-- drivers/cxl/core/pci.c | 8 ++-- drivers/cxl/core/port.c | 68 ++++++++-------------------- drivers/cxl/cxl.h | 31 +++++-------- drivers/cxl/port.c | 50 ++++++++++++++++++++ tools/testing/cxl/Kbuild | 2 + tools/testing/cxl/cxl_core_exports.c | 22 --------- tools/testing/cxl/exports.h | 13 ------ tools/testing/cxl/test/mock.c | 24 +++------- 9 files changed, 98 insertions(+), 126 deletions(-) delete mode 100644 tools/testing/cxl/exports.h diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 1c5d2022c87a..365b02b7a241 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -1219,12 +1219,12 @@ static int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm, } /** - * __devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders + * devm_cxl_switch_port_decoders_setup - allocate and setup switch decoders * @port: CXL port context * * Return 0 or -errno on error */ -int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { struct cxl_hdm *cxlhdm; @@ -1248,7 +1248,7 @@ int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port) dev_err(&port->dev, "HDM decoder capability not found\n"); return -ENXIO; } -EXPORT_SYMBOL_NS_GPL(__devm_cxl_switch_port_decoders_setup, "CXL"); +EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); /** * devm_cxl_endpoint_decoders_setup - allocate and setup endpoint decoders diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index b838c59d7a3c..f96ce884a213 100644 --- a/drivers/cxl/core/pci.c +++ 
b/drivers/cxl/core/pci.c @@ -41,14 +41,14 @@ static int pci_get_port_num(struct pci_dev *pdev) } /** - * __devm_cxl_add_dport_by_dev - allocate a dport by dport device + * devm_cxl_add_dport_by_dev - allocate a dport by dport device * @port: cxl_port that hosts the dport * @dport_dev: 'struct device' of the dport * * Returns the allocated dport on success or ERR_PTR() of -errno on error */ -struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) +struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) { struct cxl_register_map map; struct pci_dev *pdev; @@ -69,7 +69,7 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, device_lock_assert(&port->dev); return devm_cxl_add_dport(port, dport_dev, port_num, map.resource); } -EXPORT_SYMBOL_NS_GPL(__devm_cxl_add_dport_by_dev, "CXL"); +EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL"); static int cxl_dvsec_mem_range_valid(struct cxl_dev_state *cxlds, int id) { diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 6a554d0466a1..7356e1725db8 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -778,7 +778,7 @@ static int cxl_setup_comp_regs(struct device *host, struct cxl_register_map *map return cxl_setup_regs(map); } -static int cxl_port_setup_regs(struct cxl_port *port, +int cxl_port_setup_regs(struct cxl_port *port, resource_size_t component_reg_phys) { if (dev_is_platform(port->uport_dev)) @@ -786,6 +786,7 @@ static int cxl_port_setup_regs(struct cxl_port *port, return cxl_setup_comp_regs(&port->dev, &port->reg_map, component_reg_phys); } +EXPORT_SYMBOL_NS_GPL(cxl_port_setup_regs, "CXL"); static int cxl_dport_setup_regs(struct device *host, struct cxl_dport *dport, resource_size_t component_reg_phys) @@ -1638,6 +1639,13 @@ static int update_decoder_targets(struct device *dev, void *data) return 0; } +void cxl_port_update_decoder_targets(struct cxl_port *port, + struct cxl_dport *dport) 
+{ + device_for_each_child(&port->dev, dport, update_decoder_targets); +} +EXPORT_SYMBOL_NS_GPL(cxl_port_update_decoder_targets, "CXL"); + static bool dport_exists(struct cxl_port *port, struct device *dport_dev) { struct cxl_dport *dport = cxl_find_dport_by_dev(port, dport_dev); @@ -1651,15 +1659,10 @@ static bool dport_exists(struct cxl_port *port, struct device *dport_dev) return false; } -/* note this implicitly casts the group back to its @port */ -DEFINE_FREE(cxl_port_release_dr_group, struct cxl_port *, - if (_T) devres_release_group(&_T->dev, _T)) - -static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, - struct device *dport_dev) +static struct cxl_dport *probe_dport(struct cxl_port *port, + struct device *dport_dev) { - struct cxl_dport *dport; - int rc; + struct cxl_driver *drv; device_lock_assert(&port->dev); if (!port->dev.driver) @@ -1668,43 +1671,12 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, if (dport_exists(port, dport_dev)) return ERR_PTR(-EBUSY); - /* Temp group for all "first dport" and "per dport" setup actions */ - void *port_dr_group __free(cxl_port_release_dr_group) = - devres_open_group(&port->dev, port, GFP_KERNEL); - if (!port_dr_group) - return ERR_PTR(-ENOMEM); + drv = container_of(port->dev.driver, struct cxl_driver, drv); + if (!drv->add_dport) + return ERR_PTR(-ENXIO); - if (port->nr_dports == 0) { - /* - * Some host bridges are known to not have component regsisters - * available until a root port has trained CXL. Perform that - * setup now. 
- */ - rc = cxl_port_setup_regs(port, port->component_reg_phys); - if (rc) - return ERR_PTR(rc); - - rc = devm_cxl_switch_port_decoders_setup(port); - if (rc) - return ERR_PTR(rc); - } - - dport = devm_cxl_add_dport_by_dev(port, dport_dev); - if (IS_ERR(dport)) - return dport; - - /* This group was only needed for early exit above */ - devres_remove_group(&port->dev, no_free_ptr(port_dr_group)); - - cxl_switch_parse_cdat(dport); - - /* New dport added, update the decoder targets */ - device_for_each_child(&port->dev, dport, update_decoder_targets); - - dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id, - dev_name(dport_dev)); - - return dport; + /* see cxl_port_add_dport() */ + return drv->add_dport(port, dport_dev); } static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev, @@ -1751,7 +1723,7 @@ static struct cxl_dport *devm_cxl_create_port(struct device *ep_dev, } guard(device)(&port->dev); - return cxl_port_add_dport(port, dport_dev); + return probe_dport(port, dport_dev); } static int add_port_attach_ep(struct cxl_memdev *cxlmd, @@ -1783,7 +1755,7 @@ static int add_port_attach_ep(struct cxl_memdev *cxlmd, scoped_guard(device, &parent_port->dev) { parent_dport = cxl_find_dport_by_dev(parent_port, dparent); if (!parent_dport) { - parent_dport = cxl_port_add_dport(parent_port, dparent); + parent_dport = probe_dport(parent_port, dparent); if (IS_ERR(parent_dport)) return PTR_ERR(parent_dport); } @@ -1819,7 +1791,7 @@ static struct cxl_dport *find_or_add_dport(struct cxl_port *port, device_lock_assert(&port->dev); dport = cxl_find_dport_by_dev(port, dport_dev); if (!dport) { - dport = cxl_port_add_dport(port, dport_dev); + dport = probe_dport(port, dport_dev); if (IS_ERR(dport)) return dport; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 6f3741a57932..4479d632a687 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -840,8 +840,11 @@ struct cxl_endpoint_dvsec_info { }; int devm_cxl_switch_port_decoders_setup(struct cxl_port *port); 
-int __devm_cxl_switch_port_decoders_setup(struct cxl_port *port); int devm_cxl_endpoint_decoders_setup(struct cxl_port *port); +void cxl_port_update_decoder_targets(struct cxl_port *port, + struct cxl_dport *dport); +int cxl_port_setup_regs(struct cxl_port *port, + resource_size_t component_reg_phys); struct cxl_dev_state; int cxl_dvsec_rr_decode(struct cxl_dev_state *cxlds, @@ -851,10 +854,18 @@ bool is_cxl_region(struct device *dev); extern const struct bus_type cxl_bus_type; +/* + * Note, add_dport() is expressly for the cxl_port driver. TODO: investigate a + * type-safe driver model where probe()/remove() take the type of object implied + * by @id and the add_dport() op only defined for the CXL_DEVICE_PORT driver + * template. + */ struct cxl_driver { const char *name; int (*probe)(struct device *dev); void (*remove)(struct device *dev); + struct cxl_dport *(*add_dport)(struct cxl_port *port, + struct device *dport_dev); struct device_driver drv; int id; }; @@ -939,8 +950,6 @@ void cxl_coordinates_combine(struct access_coordinate *out, bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev); -struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev); /* * Unit test builds overrides this to __weak, find the 'strong' version @@ -952,20 +961,4 @@ struct cxl_dport *__devm_cxl_add_dport_by_dev(struct cxl_port *port, u16 cxl_gpf_get_dvsec(struct device *dev); -/* - * Declaration for functions that are mocked by cxl_test that are called by - * cxl_core. The respective functions are defined as __foo() and called by - * cxl_core as foo(). The macros below ensures that those functions would - * exist as foo(). See tools/testing/cxl/cxl_core_exports.c and - * tools/testing/cxl/exports.h for setting up the mock functions. 
The dance - * is done to avoid a circular dependency where cxl_core calls a function that - * ends up being a mock function and goes to * cxl_test where it calls a - * cxl_core function. - */ -#ifndef CXL_TEST_ENABLE -#define DECLARE_TESTABLE(x) __##x -#define devm_cxl_add_dport_by_dev DECLARE_TESTABLE(devm_cxl_add_dport_by_dev) -#define devm_cxl_switch_port_decoders_setup DECLARE_TESTABLE(devm_cxl_switch_port_decoders_setup) -#endif - #endif /* __CXL_H__ */ diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 51c8f2f84717..913c469e067a 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -151,9 +151,59 @@ static const struct attribute_group *cxl_port_attribute_groups[] = { NULL, }; +/* note this implicitly casts the group back to its @port */ +DEFINE_FREE(cxl_port_release_dr_group, struct cxl_port *, + if (_T) devres_release_group(&_T->dev, _T)) + +static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, + struct device *dport_dev) +{ + struct cxl_dport *dport; + int rc; + + /* Temp group for all "first dport" and "per dport" setup actions */ + void *port_dr_group __free(cxl_port_release_dr_group) = + devres_open_group(&port->dev, port, GFP_KERNEL); + if (!port_dr_group) + return ERR_PTR(-ENOMEM); + + if (port->nr_dports == 0) { + /* + * Some host bridges are known to not have component regsisters + * available until a root port has trained CXL. Perform that + * setup now. 
+ */ + rc = cxl_port_setup_regs(port, port->component_reg_phys); + if (rc) + return ERR_PTR(rc); + + rc = devm_cxl_switch_port_decoders_setup(port); + if (rc) + return ERR_PTR(rc); + } + + dport = devm_cxl_add_dport_by_dev(port, dport_dev); + if (IS_ERR(dport)) + return dport; + + /* This group was only needed for early exit above */ + devres_remove_group(&port->dev, no_free_ptr(port_dr_group)); + + cxl_switch_parse_cdat(dport); + + /* New dport added, update the decoder targets */ + cxl_port_update_decoder_targets(port, dport); + + dev_dbg(&port->dev, "dport%d:%s added\n", dport->port_id, + dev_name(dport_dev)); + + return dport; +} + static struct cxl_driver cxl_port_driver = { .name = "cxl_port", .probe = cxl_port_probe, + .add_dport = cxl_port_add_dport, .id = CXL_DEVICE_PORT, .drv = { .dev_groups = cxl_port_attribute_groups, diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 6eceefefb0e0..9b2d514a867e 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -10,6 +10,8 @@ ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup ldflags-y += --wrap=hmat_get_extended_linear_cache_size +ldflags-y += --wrap=devm_cxl_add_dport_by_dev +ldflags-y += --wrap=devm_cxl_switch_port_decoders_setup DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/cxl_core_exports.c b/tools/testing/cxl/cxl_core_exports.c index 6754de35598d..f088792a8925 100644 --- a/tools/testing/cxl/cxl_core_exports.c +++ b/tools/testing/cxl/cxl_core_exports.c @@ -2,28 +2,6 @@ /* Copyright(c) 2022 Intel Corporation. All rights reserved. 
*/ #include "cxl.h" -#include "exports.h" /* Exporting of cxl_core symbols that are only used by cxl_test */ EXPORT_SYMBOL_NS_GPL(cxl_num_decoders_committed, "CXL"); - -cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev = - __devm_cxl_add_dport_by_dev; -EXPORT_SYMBOL_NS_GPL(_devm_cxl_add_dport_by_dev, "CXL"); - -struct cxl_dport *devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) -{ - return _devm_cxl_add_dport_by_dev(port, dport_dev); -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_add_dport_by_dev, "CXL"); - -cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup = - __devm_cxl_switch_port_decoders_setup; -EXPORT_SYMBOL_NS_GPL(_devm_cxl_switch_port_decoders_setup, "CXL"); - -int devm_cxl_switch_port_decoders_setup(struct cxl_port *port) -{ - return _devm_cxl_switch_port_decoders_setup(port); -} -EXPORT_SYMBOL_NS_GPL(devm_cxl_switch_port_decoders_setup, "CXL"); diff --git a/tools/testing/cxl/exports.h b/tools/testing/cxl/exports.h deleted file mode 100644 index 7ebee7c0bd67..000000000000 --- a/tools/testing/cxl/exports.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2025 Intel Corporation */ -#ifndef __MOCK_CXL_EXPORTS_H_ -#define __MOCK_CXL_EXPORTS_H_ - -typedef struct cxl_dport *(*cxl_add_dport_by_dev_fn)(struct cxl_port *port, - struct device *dport_dev); -extern cxl_add_dport_by_dev_fn _devm_cxl_add_dport_by_dev; - -typedef int(*cxl_switch_decoders_setup_fn)(struct cxl_port *port); -extern cxl_switch_decoders_setup_fn _devm_cxl_switch_port_decoders_setup; - -#endif diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 44bce80ef3ff..f307c5b39184 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -10,21 +10,12 @@ #include #include #include "mock.h" -#include "../exports.h" static LIST_HEAD(mock); -static struct cxl_dport * -redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev); -static int 
redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port); - void register_cxl_mock_ops(struct cxl_mock_ops *ops) { list_add_rcu(&ops->list, &mock); - _devm_cxl_add_dport_by_dev = redirect_devm_cxl_add_dport_by_dev; - _devm_cxl_switch_port_decoders_setup = - redirect_devm_cxl_switch_port_decoders_setup; } EXPORT_SYMBOL_GPL(register_cxl_mock_ops); @@ -32,9 +23,6 @@ DEFINE_STATIC_SRCU(cxl_mock_srcu); void unregister_cxl_mock_ops(struct cxl_mock_ops *ops) { - _devm_cxl_switch_port_decoders_setup = - __devm_cxl_switch_port_decoders_setup; - _devm_cxl_add_dport_by_dev = __devm_cxl_add_dport_by_dev; list_del_rcu(&ops->list); synchronize_srcu(&cxl_mock_srcu); } @@ -163,7 +151,7 @@ __wrap_nvdimm_bus_register(struct device *dev, } EXPORT_SYMBOL_GPL(__wrap_nvdimm_bus_register); -int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) +int __wrap_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) { int rc, index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); @@ -171,11 +159,12 @@ int redirect_devm_cxl_switch_port_decoders_setup(struct cxl_port *port) if (ops && ops->is_mock_port(port->uport_dev)) rc = ops->devm_cxl_switch_port_decoders_setup(port); else - rc = __devm_cxl_switch_port_decoders_setup(port); + rc = devm_cxl_switch_port_decoders_setup(port); put_cxl_mock_ops(index); return rc; } +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_switch_port_decoders_setup, "CXL"); int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) { @@ -257,8 +246,8 @@ void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL"); -struct cxl_dport *redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, - struct device *dport_dev) +struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port, + struct device *dport_dev) { int index; struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); @@ -267,11 +256,12 @@ struct cxl_dport 
*redirect_devm_cxl_add_dport_by_dev(struct cxl_port *port, if (ops && ops->is_mock_port(port->uport_dev)) dport = ops->devm_cxl_add_dport_by_dev(port, dport_dev); else - dport = __devm_cxl_add_dport_by_dev(port, dport_dev); + dport = devm_cxl_add_dport_by_dev(port, dport_dev); put_cxl_mock_ops(index); return dport; } +EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_dport_by_dev, "CXL"); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("cxl_test: emulation module"); From 7f5ff740ce0bcde242dafcc3f9bb3cbe6b5b8f3a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:04:00 -0800 Subject: [PATCH 40/59] cxl/port: Move dport RAS setup to dport add time Towards the end goal of making all CXL RAS capability handling uniform across host bridge ports, upstream switch ports, and endpoint ports, move dport RAS setup. Move it to cxl_switch_port_probe() context for switch / VH dports (via cxl_port_add_dport()) and cxl_endpoint_port_probe() context for an RCH dport. Rename the RAS setup helper to devm_cxl_dport_ras_setup() for symmetry with devm_cxl_switch_port_decoders_setup(). Only the RCH version needs to be exported and the cxl_test mocking can be deleted with a dev_is_pci() check on the dport_dev. 
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-7-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 10 ++++++++++ drivers/cxl/core/port.c | 12 +++--------- drivers/cxl/core/ras.c | 36 ++++++++++++++++++++--------------- drivers/cxl/cxlpci.h | 7 ++++--- drivers/cxl/mem.c | 2 -- drivers/cxl/port.c | 12 ++++++++++++ tools/testing/cxl/Kbuild | 1 - tools/testing/cxl/test/mock.c | 12 ------------ 8 files changed, 50 insertions(+), 42 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 422531799af2..be3c7b137115 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -144,6 +144,14 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, struct access_coordinate *c); +static inline struct device *dport_to_host(struct cxl_dport *dport) +{ + struct cxl_port *port = dport->port; + + if (is_cxl_root(port)) + return port->uport_dev; + return &port->dev; +} #ifdef CONFIG_CXL_RAS int cxl_ras_init(void); void cxl_ras_exit(void); @@ -152,6 +160,7 @@ void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base); void cxl_dport_map_rch_aer(struct cxl_dport *dport); void cxl_disable_rch_root_ints(struct cxl_dport *dport); void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds); +void devm_cxl_dport_ras_setup(struct cxl_dport *dport); #else static inline int cxl_ras_init(void) { @@ -166,6 +175,7 @@ static inline void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base static inline void cxl_dport_map_rch_aer(struct cxl_dport *dport) { } static inline void cxl_disable_rch_root_ints(struct cxl_dport *dport) { } static inline void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +static inline void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { } #endif /* CONFIG_CXL_RAS */ int 
cxl_gpf_port_setup(struct cxl_dport *dport); diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 7356e1725db8..9f56f7e75e81 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1119,15 +1119,6 @@ static void cxl_dport_unlink(void *data) sysfs_remove_link(&port->dev.kobj, link_name); } -static struct device *dport_to_host(struct cxl_dport *dport) -{ - struct cxl_port *port = dport->port; - - if (is_cxl_root(port)) - return port->uport_dev; - return &port->dev; -} - static void free_dport(void *dport) { kfree(dport); @@ -1261,6 +1252,9 @@ __devm_cxl_add_dport(struct cxl_port *port, struct device *dport_dev, cxl_debugfs_create_dport_dir(dport); + if (!dport->rch) + devm_cxl_dport_ras_setup(dport); + /* keep the group, and mark the end of devm actions */ cxl_dport_close_dr_group(dport, no_free_ptr(dport_dr_group)); diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index 72908f3ced77..e90b7a91bf5d 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -139,26 +139,32 @@ static void cxl_dport_map_ras(struct cxl_dport *dport) } /** - * cxl_dport_init_ras_reporting - Setup CXL RAS report on this dport + * devm_cxl_dport_ras_setup - Setup CXL RAS report on this dport * @dport: the cxl_dport that needs to be initialized - * @host: host device for devm operations */ -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) +void devm_cxl_dport_ras_setup(struct cxl_dport *dport) { - dport->reg_map.host = host; + dport->reg_map.host = dport_to_host(dport); cxl_dport_map_ras(dport); - - if (dport->rch) { - struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport->dport_dev); - - if (!host_bridge->native_aer) - return; - - cxl_dport_map_rch_aer(dport); - cxl_disable_rch_root_ints(dport); - } } -EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL"); + +void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport) +{ + struct pci_host_bridge *host_bridge; + + if (!dev_is_pci(dport->dport_dev)) 
+ return; + + devm_cxl_dport_ras_setup(dport); + + host_bridge = to_pci_host_bridge(dport->dport_dev); + if (!host_bridge->native_aer) + return; + + cxl_dport_map_rch_aer(dport); + cxl_disable_rch_root_ints(dport); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL"); void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 6f9c78886fd9..65575371a35c 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -81,7 +81,7 @@ void read_cdat_data(struct cxl_port *port); void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); -void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host); +void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport); #else static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } @@ -91,8 +91,9 @@ static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_NONE; } -static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, - struct device *host) { } +static inline void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport) +{ +} #endif #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c index c2ee7f7f6320..e25c33f8c6cf 100644 --- a/drivers/cxl/mem.c +++ b/drivers/cxl/mem.c @@ -166,8 +166,6 @@ static int cxl_mem_probe(struct device *dev) else endpoint_parent = &parent_port->dev; - cxl_dport_init_ras_reporting(dport, dev); - scoped_guard(device, endpoint_parent) { if (!endpoint_parent->driver) { dev_err(dev, "CXL port topology %s not enabled\n", diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 913c469e067a..929f7e259f0d 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -71,6 +71,7 @@ static int cxl_switch_port_probe(struct cxl_port *port) static int cxl_endpoint_port_probe(struct cxl_port *port) { struct cxl_memdev *cxlmd = 
to_cxl_memdev(port->uport_dev); + struct cxl_dport *dport = port->parent_dport; int rc; /* Cache the data early to ensure is_visible() works */ @@ -86,6 +87,17 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) if (rc) return rc; + /* + * With VH (CXL Virtual Host) topology the cxl_port::add_dport() method + * handles RAS setup for downstream ports. With RCH (CXL Restricted CXL + * Host) topologies the downstream port is enumerated early by platform + * firmware, but the RCRB (root complex register block) is not mapped + * until after the cxl_pci driver attaches to the RCIeP (root complex + * integrated endpoint). + */ + if (dport->rch) + devm_cxl_dport_rch_ras_setup(dport); + /* * Now that all endpoint decoders are successfully enumerated, try to * assemble regions from committed decoders diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 9b2d514a867e..982e8ea28b92 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -7,7 +7,6 @@ ldflags-y += --wrap=nvdimm_bus_register ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport ldflags-y += --wrap=cxl_endpoint_parse_cdat -ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup ldflags-y += --wrap=hmat_get_extended_linear_cache_size ldflags-y += --wrap=devm_cxl_add_dport_by_dev diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index f307c5b39184..b8fcb50c1027 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -234,18 +234,6 @@ void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_cxl_endpoint_parse_cdat, "CXL"); -void __wrap_cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) -{ - int index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (!ops || !ops->is_mock_port(dport->dport_dev)) - cxl_dport_init_ras_reporting(dport, host); - - put_cxl_mock_ops(index); -} 
-EXPORT_SYMBOL_NS_GPL(__wrap_cxl_dport_init_ras_reporting, "CXL"); - struct cxl_dport *__wrap_devm_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev) { From ef1df6cf69785ec6c949ecfa92c49cfc5e237576 Mon Sep 17 00:00:00 2001 From: Terry Bowman Date: Fri, 30 Jan 2026 16:04:01 -0800 Subject: [PATCH 41/59] cxl/port: Map Port RAS registers In preparation for CXL VH (Virtual Host) topology protocol error handling, add RAS capability registered mapping for all ports in a CXL VH topology. This includes the RAS capabilities of Switch Upstream Ports, Switch Downstream Ports, Host Bridge Ports ("upstream"), and Root Ports ("downstream") Update cxl_port_add_dport() to map the upstream RAS capability on first 'dport' attach. Signed-off-by: Terry Bowman Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Co-developed-by: Dan Williams Signed-off-by: Dan Williams Tested-by: Terry Bowman Link: https://patch.msgid.link/20260131000403.2135324-8-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/ras.c | 16 ++++++++++++++++ drivers/cxl/cxl.h | 2 ++ drivers/cxl/cxlpci.h | 5 +++++ drivers/cxl/port.c | 6 ++++++ 4 files changed, 29 insertions(+) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index e90b7a91bf5d..b4be9c5715a6 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -166,6 +166,22 @@ void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport) } EXPORT_SYMBOL_NS_GPL(devm_cxl_dport_rch_ras_setup, "CXL"); +void devm_cxl_port_ras_setup(struct cxl_port *port) +{ + struct cxl_register_map *map = &port->reg_map; + + if (!map->component_map.ras.valid) { + dev_dbg(&port->dev, "RAS registers not found\n"); + return; + } + + map->host = &port->dev; + if (cxl_map_component_regs(map, &port->regs, + BIT(CXL_CM_CAP_CAP_ID_RAS))) + dev_dbg(&port->dev, "Failed to map RAS capability\n"); +} +EXPORT_SYMBOL_NS_GPL(devm_cxl_port_ras_setup, "CXL"); + void cxl_handle_cor_ras(struct device *dev, void __iomem *ras_base) { void __iomem 
*addr; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 4479d632a687..626a37b72fc3 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -607,6 +607,7 @@ struct cxl_dax_region { * @parent_dport: dport that points to this port in the parent * @decoder_ida: allocator for decoder ids * @reg_map: component and ras register mapping parameters + * @regs: mapped component registers * @nr_dports: number of entries in @dports * @hdm_end: track last allocated HDM decoder instance for allocation ordering * @commit_end: cursor to track highest committed decoder for commit ordering @@ -628,6 +629,7 @@ struct cxl_port { struct cxl_dport *parent_dport; struct ida decoder_ida; struct cxl_register_map reg_map; + struct cxl_component_regs regs; int nr_dports; int hdm_end; int commit_end; diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index 65575371a35c..0cf64218aa16 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -82,6 +82,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev); pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, pci_channel_state_t state); void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport); +void devm_cxl_port_ras_setup(struct cxl_port *port); #else static inline void cxl_cor_error_detected(struct pci_dev *pdev) { } @@ -94,6 +95,10 @@ static inline pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, static inline void devm_cxl_dport_rch_ras_setup(struct cxl_dport *dport) { } + +static inline void devm_cxl_port_ras_setup(struct cxl_port *port) +{ +} #endif #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 929f7e259f0d..6ebd665fb347 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -192,6 +192,12 @@ static struct cxl_dport *cxl_port_add_dport(struct cxl_port *port, rc = devm_cxl_switch_port_decoders_setup(port); if (rc) return ERR_PTR(rc); + + /* + * RAS setup is optional, either driver operation can continue + * on failure, or the device does not implement RAS 
registers. + */ + devm_cxl_port_ras_setup(port); } dport = devm_cxl_add_dport_by_dev(port, dport_dev); From dab7162d0ae782295c2c2cff4bb386ee6ae5d566 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:04:02 -0800 Subject: [PATCH 42/59] cxl/port: Move endpoint component register management to cxl_port In preparation for generic protocol error handling across CXL endpoints, whether they be memory expander class devices or accelerators, drop the endpoint component management from cxl_dev_state. Organize all CXL port component management through the common cxl_port driver. Note that the end game is that drivers/cxl/core/ras.c loses all dependencies on a 'struct cxl_dev_state' parameter and operates only on port resources. The removal of component register mapping from cxl_pci is an incremental step towards that. Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-9-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/ras.c | 6 ++-- drivers/cxl/cxlmem.h | 4 +-- drivers/cxl/pci.c | 63 +----------------------------------------- drivers/cxl/port.c | 54 ++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 67 deletions(-) diff --git a/drivers/cxl/core/ras.c b/drivers/cxl/core/ras.c index b4be9c5715a6..f6a8f4a355f1 100644 --- a/drivers/cxl/core/ras.c +++ b/drivers/cxl/core/ras.c @@ -255,6 +255,7 @@ bool cxl_handle_ras(struct device *dev, void __iomem *ras_base) void cxl_cor_error_detected(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + struct cxl_memdev *cxlmd = cxlds->cxlmd; struct device *dev = &cxlds->cxlmd->dev; scoped_guard(device, dev) { @@ -268,7 +269,7 @@ void cxl_cor_error_detected(struct pci_dev *pdev) if (cxlds->rcd) cxl_handle_rdport_errors(cxlds); - cxl_handle_cor_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); + cxl_handle_cor_ras(&cxlds->cxlmd->dev, 
cxlmd->endpoint->regs.ras); } } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL"); @@ -297,10 +298,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, * chance the situation is recoverable dump the status of the RAS * capability registers and bounce the active state of the memdev. */ - ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlds->regs.ras); + ue = cxl_handle_ras(&cxlds->cxlmd->dev, cxlmd->endpoint->regs.ras); } - switch (state) { case pci_channel_io_normal: if (ue) { diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index 434031a0c1f7..ab7201ef3ea6 100644 --- a/drivers/cxl/cxlmem.h +++ b/drivers/cxl/cxlmem.h @@ -415,7 +415,7 @@ struct cxl_dpa_partition { * @dev: The device associated with this CXL state * @cxlmd: The device representing the CXL.mem capabilities of @dev * @reg_map: component and ras register mapping parameters - * @regs: Parsed register blocks + * @regs: Class device "Device" registers * @cxl_dvsec: Offset to the PCIe device DVSEC * @rcd: operating in RCD mode (CXL 3.0 9.11.8 CXL Devices Attached to an RCH) * @media_ready: Indicate whether the device media is usable @@ -431,7 +431,7 @@ struct cxl_dev_state { struct device *dev; struct cxl_memdev *cxlmd; struct cxl_register_map reg_map; - struct cxl_regs regs; + struct cxl_device_regs regs; int cxl_dvsec; bool rcd; bool media_ready; diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index b7f694bda913..acb0eb2a13c3 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -535,52 +535,6 @@ static int cxl_pci_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type, return cxl_setup_regs(map); } -static int cxl_pci_ras_unmask(struct pci_dev *pdev) -{ - struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); - void __iomem *addr; - u32 orig_val, val, mask; - u16 cap; - int rc; - - if (!cxlds->regs.ras) { - dev_dbg(&pdev->dev, "No RAS registers.\n"); - return 0; - } - - /* BIOS has PCIe AER error control */ - if (!pcie_aer_is_native(pdev)) - return 0; - - rc = 
pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap); - if (rc) - return rc; - - if (cap & PCI_EXP_DEVCTL_URRE) { - addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET; - orig_val = readl(addr); - - mask = CXL_RAS_UNCORRECTABLE_MASK_MASK | - CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK; - val = orig_val & ~mask; - writel(val, addr); - dev_dbg(&pdev->dev, - "Uncorrectable RAS Errors Mask: %#x -> %#x\n", - orig_val, val); - } - - if (cap & PCI_EXP_DEVCTL_CERE) { - addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET; - orig_val = readl(addr); - val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK; - writel(val, addr); - dev_dbg(&pdev->dev, "Correctable RAS Errors Mask: %#x -> %#x\n", - orig_val, val); - } - - return 0; -} - static void free_event_buf(void *buf) { kvfree(buf); @@ -912,13 +866,6 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) unsigned int i; bool irq_avail; - /* - * Double check the anonymous union trickery in struct cxl_regs - * FIXME switch to struct_group() - */ - BUILD_BUG_ON(offsetof(struct cxl_regs, memdev) != - offsetof(struct cxl_regs, device_regs.memdev)); - rc = pcim_enable_device(pdev); if (rc) return rc; @@ -942,7 +889,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = cxl_map_device_regs(&map, &cxlds->regs.device_regs); + rc = cxl_map_device_regs(&map, &cxlds->regs); if (rc) return rc; @@ -957,11 +904,6 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) else if (!cxlds->reg_map.component_map.ras.valid) dev_dbg(&pdev->dev, "RAS registers not found\n"); - rc = cxl_map_component_regs(&cxlds->reg_map, &cxlds->regs.component, - BIT(CXL_CM_CAP_CAP_ID_RAS)); - if (rc) - dev_dbg(&pdev->dev, "Failed to map RAS capability.\n"); - rc = cxl_pci_type3_init_mailbox(cxlds); if (rc) return rc; @@ -1052,9 +994,6 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - if 
(cxl_pci_ras_unmask(pdev)) - dev_dbg(&pdev->dev, "No RAS reporting unmasked\n"); - pci_save_state(pdev); return rc; diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 6ebd665fb347..0ae78469207a 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include #include #include #include @@ -68,6 +69,55 @@ static int cxl_switch_port_probe(struct cxl_port *port) return 0; } +static int cxl_ras_unmask(struct cxl_port *port) +{ + struct pci_dev *pdev; + void __iomem *addr; + u32 orig_val, val, mask; + u16 cap; + int rc; + + if (!dev_is_pci(port->uport_dev)) + return 0; + pdev = to_pci_dev(port->uport_dev); + + if (!port->regs.ras) { + pci_dbg(pdev, "No RAS registers.\n"); + return 0; + } + + /* BIOS has PCIe AER error control */ + if (!pcie_aer_is_native(pdev)) + return 0; + + rc = pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &cap); + if (rc) + return rc; + + if (cap & PCI_EXP_DEVCTL_URRE) { + addr = port->regs.ras + CXL_RAS_UNCORRECTABLE_MASK_OFFSET; + orig_val = readl(addr); + + mask = CXL_RAS_UNCORRECTABLE_MASK_MASK | + CXL_RAS_UNCORRECTABLE_MASK_F256B_MASK; + val = orig_val & ~mask; + writel(val, addr); + pci_dbg(pdev, "Uncorrectable RAS Errors Mask: %#x -> %#x\n", + orig_val, val); + } + + if (cap & PCI_EXP_DEVCTL_CERE) { + addr = port->regs.ras + CXL_RAS_CORRECTABLE_MASK_OFFSET; + orig_val = readl(addr); + val = orig_val & ~CXL_RAS_CORRECTABLE_MASK_MASK; + writel(val, addr); + pci_dbg(pdev, "Correctable RAS Errors Mask: %#x -> %#x\n", + orig_val, val); + } + + return 0; +} + static int cxl_endpoint_port_probe(struct cxl_port *port) { struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev); @@ -98,6 +148,10 @@ static int cxl_endpoint_port_probe(struct cxl_port *port) if (dport->rch) devm_cxl_dport_rch_ras_setup(dport); + devm_cxl_port_ras_setup(port); + if (cxl_ras_unmask(port)) + dev_dbg(&port->dev, "failed to unmask RAS 
interrupts\n"); + /* * Now that all endpoint decoders are successfully enumerated, try to * assemble regions from committed decoders From 2d2b3fe002797c8de2c71236662593bf36de834d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jan 2026 16:04:03 -0800 Subject: [PATCH 43/59] cxl/port: Unify endpoint and switch port lookup In support of generic CXL protocol error handling across various 'struct cxl_port' types, update find_cxl_port_by_uport() to retrieve endpoint CXL port companions from endpoint PCIe device instances. The end result is that upstream switch ports and endpoint ports can share error handling and eventually delete the misplaced cxl_error_handlers from the cxl_pci class driver. Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Terry Bowman Signed-off-by: Dan Williams Link: https://patch.msgid.link/20260131000403.2135324-10-dan.j.williams@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/port.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9f56f7e75e81..ee7d14528867 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1590,10 +1590,20 @@ static int match_port_by_uport(struct device *dev, const void *data) return 0; port = to_cxl_port(dev); + /* Endpoint ports are hosted by memdevs */ + if (is_cxl_memdev(port->uport_dev)) + return uport_dev == port->uport_dev->parent; return uport_dev == port->uport_dev; } -/* +/** + * find_cxl_port_by_uport - Find a CXL port device companion + * @uport_dev: Device that acts as a switch or endpoint in the CXL hierarchy + * + * In the case of endpoint ports recall that port->uport_dev points to a 'struct + * cxl_memdev' device. So, the @uport_dev argument is the parent device of the + * 'struct cxl_memdev' in that case. + * * Function takes a device reference on the port device. Caller should do a * put_device() when done. 
*/ From 72971184a1eed005b48babe226673d5496bcd959 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 3 Feb 2026 18:35:58 +0100 Subject: [PATCH 44/59] cxl, doc: Remove isonum.txt inclusion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes the line to include:: <isonum.txt>. From Jon: "This include has been cargo-culted around the docs...the only real use of it is to write |copy| rather than ©, but these docs don't even do that. It can be taken out." Cc: Jonathan Corbet Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260203173604.1440334-1-rrichter@amd.com Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/conventions.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/driver-api/cxl/conventions.rst b/Documentation/driver-api/cxl/conventions.rst index e37336d7b116..ed4237583d36 100644 --- a/Documentation/driver-api/cxl/conventions.rst +++ b/Documentation/driver-api/cxl/conventions.rst @@ -1,5 +1,4 @@ .. SPDX-License-Identifier: GPL-2.0 -.. include:: <isonum.txt> ======================================= Compute Express Link: Linux Conventions From e6efbd2995c1c14fbf53e2b63056eeeb30b034b1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 3 Feb 2026 18:35:59 +0100 Subject: [PATCH 45/59] cxl, doc: Moving conventions in separate files Moving conventions in separate files. 
Cc: Jonathan Corbet Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260203173604.1440334-2-rrichter@amd.com Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/conventions.rst | 176 +----------------- .../driver-api/cxl/conventions/cxl-lmh.rst | 135 ++++++++++++++ .../driver-api/cxl/conventions/template.rst | 37 ++++ 3 files changed, 178 insertions(+), 170 deletions(-) create mode 100644 Documentation/driver-api/cxl/conventions/cxl-lmh.rst create mode 100644 Documentation/driver-api/cxl/conventions/template.rst diff --git a/Documentation/driver-api/cxl/conventions.rst b/Documentation/driver-api/cxl/conventions.rst index ed4237583d36..9267a697b2fe 100644 --- a/Documentation/driver-api/cxl/conventions.rst +++ b/Documentation/driver-api/cxl/conventions.rst @@ -1,8 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 -======================================= Compute Express Link: Linux Conventions -======================================= +####################################### There exists shipping platforms that bend or break CXL specification expectations. Record the details and the rationale for those deviations. @@ -10,172 +9,9 @@ Borrow the ACPI Code First template format to capture the assumptions and tradeoffs such that multiple platform implementations can follow the same convention. -<(template) Title> -================== +.. 
toctree:: + :maxdepth: 1 + :caption: Contents -Document --------- -CXL Revision , Version - -License -------- -SPDX-License Identifier: CC-BY-4.0 - -Creator/Contributors --------------------- - -Summary of the Change ---------------------- - - - - -Benefits of the Change ----------------------- - - - -References ----------- - -Detailed Description of the Change ----------------------------------- - - - - -Resolve conflict between CFMWS, Platform Memory Holes, and Endpoint Decoders -============================================================================ - -Document --------- - -CXL Revision 3.2, Version 1.0 - -License -------- - -SPDX-License Identifier: CC-BY-4.0 - -Creator/Contributors --------------------- - -- Fabio M. De Francesco, Intel -- Dan J. Williams, Intel -- Mahesh Natu, Intel - -Summary of the Change ---------------------- - -According to the current Compute Express Link (CXL) Specifications (Revision -3.2, Version 1.0), the CXL Fixed Memory Window Structure (CFMWS) describes zero -or more Host Physical Address (HPA) windows associated with each CXL Host -Bridge. Each window represents a contiguous HPA range that may be interleaved -across one or more targets, including CXL Host Bridges. Each window has a set -of restrictions that govern its usage. It is the Operating System-directed -configuration and Power Management (OSPM) responsibility to utilize each window -for the specified use. - -Table 9-22 of the current CXL Specifications states that the Window Size field -contains the total number of consecutive bytes of HPA this window describes. -This value must be a multiple of the Number of Interleave Ways (NIW) * 256 MB. - -Platform Firmware (BIOS) might reserve physical addresses below 4 GB where a -memory gap such as the Low Memory Hole for PCIe MMIO may exist. In such cases, -the CFMWS Range Size may not adhere to the NIW * 256 MB rule. 
- -The HPA represents the actual physical memory address space that the CXL devices -can decode and respond to, while the System Physical Address (SPA), a related -but distinct concept, represents the system-visible address space that users can -direct transaction to and so it excludes reserved regions. - -BIOS publishes CFMWS to communicate the active SPA ranges that, on platforms -with LMH's, map to a strict subset of the HPA. The SPA range trims out the hole, -resulting in lost capacity in the Endpoints with no SPA to map to that part of -the HPA range that intersects the hole. - -E.g, an x86 platform with two CFMWS and an LMH starting at 2 GB: - - +--------+------------+-------------------+------------------+-------------------+------+ - | Window | CFMWS Base | CFMWS Size | HDM Decoder Base | HDM Decoder Size | Ways | - +========+============+===================+==================+===================+======+ - |  0 | 0 GB | 2 GB | 0 GB | 3 GB | 12 | - +--------+------------+-------------------+------------------+-------------------+------+ - |  1 | 4 GB | NIW*256MB Aligned | 4 GB | NIW*256MB Aligned | 12 | - +--------+------------+-------------------+------------------+-------------------+------+ - -HDM decoder base and HDM decoder size represent all the 12 Endpoint Decoders of -a 12 ways region and all the intermediate Switch Decoders. They are configured -by the BIOS according to the NIW * 256MB rule, resulting in a HPA range size of -3GB. Instead, the CFMWS Base and CFMWS Size are used to configure the Root -Decoder HPA range that results smaller (2GB) than that of the Switch and -Endpoint Decoders in the hierarchy (3GB). - -This creates 2 issues which lead to a failure to construct a region: - -1) A mismatch in region size between root and any HDM decoder. The root decoders - will always be smaller due to the trim. - -2) The trim causes the root decoder to violate the (NIW * 256MB) rule. 
- -This change allows a region with a base address of 0GB to bypass these checks to -allow for region creation with the trimmed root decoder address range. - -This change does not allow for any other arbitrary region to violate these -checks - it is intended exclusively to enable x86 platforms which map CXL memory -under 4GB. - -Despite the HDM decoders covering the PCIE hole HPA region, it is expected that -the platform will never route address accesses to the CXL complex because the -root decoder only covers the trimmed region (which excludes this). This is -outside the ability of Linux to enforce. - -On the example platform, only the first 2GB will be potentially usable, but -Linux, aiming to adhere to the current specifications, fails to construct -Regions and attach Endpoint and intermediate Switch Decoders to them. - -There are several points of failure that due to the expectation that the Root -Decoder HPA size, that is equal to the CFMWS from which it is configured, has -to be greater or equal to the matching Switch and Endpoint HDM Decoders. - -In order to succeed with construction and attachment, Linux must construct a -Region with Root Decoder HPA range size, and then attach to that all the -intermediate Switch Decoders and Endpoint Decoders that belong to the hierarchy -regardless of their range sizes. - -Benefits of the Change ----------------------- - -Without the change, the OSPM wouldn't match intermediate Switch and Endpoint -Decoders with Root Decoders configured with CFMWS HPA sizes that don't align -with the NIW * 256MB constraint, and so it leads to lost memdev capacity. - -This change allows the OSPM to construct Regions and attach intermediate Switch -and Endpoint Decoders to them, so that the addressable part of the memory -devices total capacity is made available to the users. 
- -References ----------- - -Compute Express Link Specification Revision 3.2, Version 1.0 - - -Detailed Description of the Change ----------------------------------- - -The description of the Window Size field in table 9-22 needs to account for -platforms with Low Memory Holes, where SPA ranges might be subsets of the -endpoints HPA. Therefore, it has to be changed to the following: - -"The total number of consecutive bytes of HPA this window represents. This value -shall be a multiple of NIW * 256 MB. - -On platforms that reserve physical addresses below 4 GB, such as the Low Memory -Hole for PCIe MMIO on x86, an instance of CFMWS whose Base HPA range is 0 might -have a size that doesn't align with the NIW * 256 MB constraint. - -Note that the matching intermediate Switch Decoders and the Endpoint Decoders -HPA range sizes must still align to the above-mentioned rule, but the memory -capacity that exceeds the CFMWS window size won't be accessible.". + conventions/cxl-lmh.rst + conventions/template.rst diff --git a/Documentation/driver-api/cxl/conventions/cxl-lmh.rst b/Documentation/driver-api/cxl/conventions/cxl-lmh.rst new file mode 100644 index 000000000000..baece5c35345 --- /dev/null +++ b/Documentation/driver-api/cxl/conventions/cxl-lmh.rst @@ -0,0 +1,135 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Resolve conflict between CFMWS, Platform Memory Holes, and Endpoint Decoders +============================================================================ + +Document +-------- + +CXL Revision 3.2, Version 1.0 + +License +------- + +SPDX-License Identifier: CC-BY-4.0 + +Creator/Contributors +-------------------- + +- Fabio M. De Francesco, Intel +- Dan J. 
Williams, Intel +- Mahesh Natu, Intel + +Summary of the Change +--------------------- + +According to the current Compute Express Link (CXL) Specifications (Revision +3.2, Version 1.0), the CXL Fixed Memory Window Structure (CFMWS) describes zero +or more Host Physical Address (HPA) windows associated with each CXL Host +Bridge. Each window represents a contiguous HPA range that may be interleaved +across one or more targets, including CXL Host Bridges. Each window has a set +of restrictions that govern its usage. It is the Operating System-directed +configuration and Power Management (OSPM) responsibility to utilize each window +for the specified use. + +Table 9-22 of the current CXL Specifications states that the Window Size field +contains the total number of consecutive bytes of HPA this window describes. +This value must be a multiple of the Number of Interleave Ways (NIW) * 256 MB. + +Platform Firmware (BIOS) might reserve physical addresses below 4 GB where a +memory gap such as the Low Memory Hole for PCIe MMIO may exist. In such cases, +the CFMWS Range Size may not adhere to the NIW * 256 MB rule. + +The HPA represents the actual physical memory address space that the CXL devices +can decode and respond to, while the System Physical Address (SPA), a related +but distinct concept, represents the system-visible address space that users can +direct transactions to and so it excludes reserved regions. + +BIOS publishes CFMWS to communicate the active SPA ranges that, on platforms +with LMH's, map to a strict subset of the HPA. The SPA range trims out the hole, +resulting in lost capacity in the Endpoints with no SPA to map to that part of +the HPA range that intersects the hole.
+ +E.g, an x86 platform with two CFMWS and an LMH starting at 2 GB: + + +--------+------------+-------------------+------------------+-------------------+------+ + | Window | CFMWS Base | CFMWS Size | HDM Decoder Base | HDM Decoder Size | Ways | + +========+============+===================+==================+===================+======+ + |  0 | 0 GB | 2 GB | 0 GB | 3 GB | 12 | + +--------+------------+-------------------+------------------+-------------------+------+ + |  1 | 4 GB | NIW*256MB Aligned | 4 GB | NIW*256MB Aligned | 12 | + +--------+------------+-------------------+------------------+-------------------+------+ + +HDM decoder base and HDM decoder size represent all the 12 Endpoint Decoders of +a 12 ways region and all the intermediate Switch Decoders. They are configured +by the BIOS according to the NIW * 256MB rule, resulting in a HPA range size of +3GB. Instead, the CFMWS Base and CFMWS Size are used to configure the Root +Decoder HPA range that results smaller (2GB) than that of the Switch and +Endpoint Decoders in the hierarchy (3GB). + +This creates 2 issues which lead to a failure to construct a region: + +1) A mismatch in region size between root and any HDM decoder. The root decoders + will always be smaller due to the trim. + +2) The trim causes the root decoder to violate the (NIW * 256MB) rule. + +This change allows a region with a base address of 0GB to bypass these checks to +allow for region creation with the trimmed root decoder address range. + +This change does not allow for any other arbitrary region to violate these +checks - it is intended exclusively to enable x86 platforms which map CXL memory +under 4GB. + +Despite the HDM decoders covering the PCIE hole HPA region, it is expected that +the platform will never route address accesses to the CXL complex because the +root decoder only covers the trimmed region (which excludes this). This is +outside the ability of Linux to enforce. 
+ +On the example platform, only the first 2GB will be potentially usable, but +Linux, aiming to adhere to the current specifications, fails to construct +Regions and attach Endpoint and intermediate Switch Decoders to them. + +There are several points of failure that are due to the expectation that the Root +Decoder HPA size, that is equal to the CFMWS from which it is configured, has +to be greater than or equal to the matching Switch and Endpoint HDM Decoders. + +In order to succeed with construction and attachment, Linux must construct a +Region with Root Decoder HPA range size, and then attach to that all the +intermediate Switch Decoders and Endpoint Decoders that belong to the hierarchy +regardless of their range sizes. + +Benefits of the Change +---------------------- + +Without the change, the OSPM wouldn't match intermediate Switch and Endpoint +Decoders with Root Decoders configured with CFMWS HPA sizes that don't align +with the NIW * 256MB constraint, and so it leads to lost memdev capacity. + +This change allows the OSPM to construct Regions and attach intermediate Switch +and Endpoint Decoders to them, so that the addressable part of the memory +devices total capacity is made available to the users. + +References +---------- + +Compute Express Link Specification Revision 3.2, Version 1.0 + + +Detailed Description of the Change +---------------------------------- + +The description of the Window Size field in table 9-22 needs to account for +platforms with Low Memory Holes, where SPA ranges might be subsets of the +endpoints HPA. Therefore, it has to be changed to the following: + +"The total number of consecutive bytes of HPA this window represents. This value +shall be a multiple of NIW * 256 MB. + +On platforms that reserve physical addresses below 4 GB, such as the Low Memory +Hole for PCIe MMIO on x86, an instance of CFMWS whose Base HPA range is 0 might +have a size that doesn't align with the NIW * 256 MB constraint.
+ +Note that the matching intermediate Switch Decoders and the Endpoint Decoders +HPA range sizes must still align to the above-mentioned rule, but the memory +capacity that exceeds the CFMWS window size won't be accessible.". diff --git a/Documentation/driver-api/cxl/conventions/template.rst b/Documentation/driver-api/cxl/conventions/template.rst new file mode 100644 index 000000000000..ff2fcf1b5e24 --- /dev/null +++ b/Documentation/driver-api/cxl/conventions/template.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. :: Template Title here: + +Template File +============= + +Document +-------- +CXL Revision , Version + +License +------- +SPDX-License Identifier: CC-BY-4.0 + +Creator/Contributors +-------------------- + +Summary of the Change +--------------------- + + + +Benefits of the Change +---------------------- + + + +References +---------- + +Detailed Description of the Change +---------------------------------- + + From 0692afe940e0959dd2fa74539622f16cf3709433 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 3 Feb 2026 18:36:00 +0100 Subject: [PATCH 46/59] Documentation/driver-api/cxl: ACPI PRM Address Translation Support and AMD Zen5 enablement This adds a convention document for the following patch series: cxl: ACPI PRM Address Translation Support and AMD Zen5 enablement Version 7 and later: https://lore.kernel.org/linux-cxl/20251114213931.30754-1-rrichter@amd.com/ Link: https://lore.kernel.org/linux-cxl/20251114213931.30754-1-rrichter@amd.com/ Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Acked-by: Dan Williams Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260203173604.1440334-3-rrichter@amd.com Signed-off-by: Dave Jiang --- Documentation/driver-api/cxl/conventions.rst | 1 + .../driver-api/cxl/conventions/cxl-atl.rst | 304 ++++++++++++++++++ 2 files changed, 305 insertions(+) create mode 100644 
Documentation/driver-api/cxl/conventions/cxl-atl.rst diff --git a/Documentation/driver-api/cxl/conventions.rst b/Documentation/driver-api/cxl/conventions.rst index 9267a697b2fe..0d2e07279ad9 100644 --- a/Documentation/driver-api/cxl/conventions.rst +++ b/Documentation/driver-api/cxl/conventions.rst @@ -14,4 +14,5 @@ same convention. :caption: Contents conventions/cxl-lmh.rst + conventions/cxl-atl.rst conventions/template.rst diff --git a/Documentation/driver-api/cxl/conventions/cxl-atl.rst b/Documentation/driver-api/cxl/conventions/cxl-atl.rst new file mode 100644 index 000000000000..3a36a84743d0 --- /dev/null +++ b/Documentation/driver-api/cxl/conventions/cxl-atl.rst @@ -0,0 +1,304 @@ +.. SPDX-License-Identifier: GPL-2.0 + +ACPI PRM CXL Address Translation +================================ + +Document +-------- + +CXL Revision 3.2, Version 1.0 + +License +------- + +SPDX-License Identifier: CC-BY-4.0 + +Creator/Contributors +-------------------- + +- Robert Richter, AMD et al. + +Summary of the Change +--------------------- + +The CXL Fixed Memory Window Structures (CFMWS) describe zero or more Host +Physical Address (HPA) windows associated with one or more CXL Host Bridges. +Each HPA range of a CXL Host Bridge is represented by a CFMWS entry. An HPA +range may include addresses currently assigned to CXL.mem devices, or an OS may +assign ranges from an address window to a device. + +Host-managed Device Memory is Device-attached memory that is mapped to system +coherent address space and accessible to the Host using standard write-back +semantics. The managed address range is configured in the CXL HDM Decoder +registers of the device. An HDM Decoder in a device is responsible for +converting HPA into DPA by stripping off specific address bits. + +CXL devices and CXL bridges use the same HPA space. It is common across all +components that belong to the same host domain. 
The view of the address region +must be consistent on the CXL.mem path between the Host and the Device. + +This is described in the *CXL 3.2 specification* (Table 1-1, 3.3.1, +8.2.4.20, 9.13.1, 9.18.1.3). [#cxl-spec-3.2]_ + +Depending on the interconnect architecture of the platform, components attached +to a host may not share the same host physical address space. Those platforms +need address translation to convert an HPA between the host and the attached +component, such as a CXL device. The translation mechanism is host-specific and +implementation dependent. + +For example, x86 AMD platforms use a Data Fabric that manages access to physical +memory. Devices have their own memory space and can be configured to use +'Normalized addresses' different from System Physical Addresses (SPA). Address +translation is then needed. For details, see +:doc:`x86 AMD Address Translation `. + +Those AMD platforms provide PRM [#prm-spec]_ handlers in firmware to perform +various types of address translation, including for CXL endpoints. AMD Zen5 +systems implement the ACPI PRM CXL Address Translation firmware call. The ACPI +PRM handler has a specific GUID to uniquely identify platforms with support for +Normalized addressing. This is documented in the *ACPI v6.5 Porting Guide* +(Address Translation - CXL DPA to System Physical Address). [#amd-ppr-58088]_ + +When in Normalized address mode, HDM decoder address ranges must be configured +and handled differently. Hardware addresses used in the HDM decoder +configurations of an endpoint are not SPA and need to be translated from the +address range of the endpoint to that of the CXL host bridge. This is especially +important for finding an endpoint's associated CXL Host Bridge and HPA window +described in the CFMWS. Additionally, the interleave decoding is done by the +Data Fabric and the endpoint does not perform decoding when converting HPA to +DPA. Instead, interleaving is switched off for the endpoint (1-way). 
Finally, +address translation might also be needed to inspect the endpoint's hardware +addresses, such as during profiling, tracing, or error handling. + +For example, with Normalized addressing the HDM decoders could look as follows:: + + ------------------------------- + | Root Decoder (CFMWS) | + | SPA Range: 0x850000000 | + | Size: 0x8000000000 (512 GB) | + | Interleave Ways: 1 | + ------------------------------- + | + v + ------------------------------- + | Host Bridge Decoder (HDM) | + | SPA Range: 0x850000000 | + | Size: 0x8000000000 (512 GB) | + | Interleave Ways: 4 | + | Targets: endpoint5,8,11,13 | + | Granularity: 256 | + ------------------------------- + | + -----------------------------+------------------------------ + | | | | + v v v v + ------------------- ------------------- ------------------- ------------------- + | endpoint5 | | endpoint8 | | endpoint11 | | endpoint13 | + | decoder5.0 | | decoder8.0 | | decoder11.0 | | decoder13.0 | + | PCIe: | | PCIe: | | PCIe: | | PCIe: | + | 0000:e2:00.0 | | 0000:e3:00.0 | | 0000:e4:00.0 | | 0000:e1:00.0 | + | DPA: | | DPA: | | DPA: | | DPA: | + | Start: 0x0 | | Start: 0x0 | | Start: 0x0 | | Start: 0x0 | + | Size: | | Size: | | Size: | | Size: | + | 0x2000000000 | | 0x2000000000 | | 0x2000000000 | | 0x2000000000 | + | (128 GB) | | (128 GB) | | (128 GB) | | (128 GB) | + | Interleaving: | | Interleaving: | | Interleaving: | | Interleaving: | + | Ways: 1 | | Ways: 1 | | Ways: 1 | | Ways: 1 | + | Gran: 256 | | Gran: 256 | | Gran: 256 | | Gran: 256 | + ------------------- ------------------- ------------------- ------------------- + | | | | + v v v v + DPA DPA DPA DPA + +This shows the representation in sysfs: + +.. 
code-block:: none + + /sys/bus/cxl/devices/endpoint5/decoder5.0/interleave_granularity:256 + /sys/bus/cxl/devices/endpoint5/decoder5.0/interleave_ways:1 + /sys/bus/cxl/devices/endpoint5/decoder5.0/size:0x2000000000 + /sys/bus/cxl/devices/endpoint5/decoder5.0/start:0x0 + /sys/bus/cxl/devices/endpoint8/decoder8.0/interleave_granularity:256 + /sys/bus/cxl/devices/endpoint8/decoder8.0/interleave_ways:1 + /sys/bus/cxl/devices/endpoint8/decoder8.0/size:0x2000000000 + /sys/bus/cxl/devices/endpoint8/decoder8.0/start:0x0 + /sys/bus/cxl/devices/endpoint11/decoder11.0/interleave_granularity:256 + /sys/bus/cxl/devices/endpoint11/decoder11.0/interleave_ways:1 + /sys/bus/cxl/devices/endpoint11/decoder11.0/size:0x2000000000 + /sys/bus/cxl/devices/endpoint11/decoder11.0/start:0x0 + /sys/bus/cxl/devices/endpoint13/decoder13.0/interleave_granularity:256 + /sys/bus/cxl/devices/endpoint13/decoder13.0/interleave_ways:1 + /sys/bus/cxl/devices/endpoint13/decoder13.0/size:0x2000000000 + /sys/bus/cxl/devices/endpoint13/decoder13.0/start:0x0 + +Note that the endpoint interleaving configurations use direct mapping (1-way). + +With PRM calls, the kernel can determine the following mappings: + +.. code-block:: none + + cxl decoder5.0: address mapping found for 0000:e2:00.0 (hpa -> spa): + 0x0+0x2000000000 -> 0x850000000+0x8000000000 ways:4 granularity:256 + cxl decoder8.0: address mapping found for 0000:e3:00.0 (hpa -> spa): + 0x0+0x2000000000 -> 0x850000000+0x8000000000 ways:4 granularity:256 + cxl decoder11.0: address mapping found for 0000:e4:00.0 (hpa -> spa): + 0x0+0x2000000000 -> 0x850000000+0x8000000000 ways:4 granularity:256 + cxl decoder13.0: address mapping found for 0000:e1:00.0 (hpa -> spa): + 0x0+0x2000000000 -> 0x850000000+0x8000000000 ways:4 granularity:256 + +The corresponding CXL host bridge (HDM) decoders and root decoder (CFMWS) match +the calculated endpoint mappings shown: + +.. 
code-block:: none + + /sys/bus/cxl/devices/port1/decoder1.0/interleave_granularity:256 + /sys/bus/cxl/devices/port1/decoder1.0/interleave_ways:4 + /sys/bus/cxl/devices/port1/decoder1.0/size:0x8000000000 + /sys/bus/cxl/devices/port1/decoder1.0/start:0x850000000 + /sys/bus/cxl/devices/port1/decoder1.0/target_list:0,1,2,3 + /sys/bus/cxl/devices/port1/decoder1.0/target_type:expander + /sys/bus/cxl/devices/root0/decoder0.0/interleave_granularity:256 + /sys/bus/cxl/devices/root0/decoder0.0/interleave_ways:1 + /sys/bus/cxl/devices/root0/decoder0.0/size:0x8000000000 + /sys/bus/cxl/devices/root0/decoder0.0/start:0x850000000 + /sys/bus/cxl/devices/root0/decoder0.0/target_list:7 + +The following changes to the specification are needed: + +* Allow a CXL device to be in an HPA space other than the host's address space. + +* Allow the platform to use implementation-specific address translation when + crossing memory domains on the CXL.mem path between the host and the device. + +* Define a PRM handler method for converting device addresses to SPAs. + +* Specify that the platform shall provide the PRM handler method to the + Operating System to detect Normalized addressing and for determining Endpoint + SPA ranges and interleaving configurations. + +* Add reference to: + + | Platform Runtime Mechanism Specification, Version 1.1 – November 2020 + | https://uefi.org/sites/default/files/resources/PRM_Platform_Runtime_Mechanism_1_1_release_candidate.pdf + +Benefits of the Change +---------------------- + +Without the change, the Operating System may be unable to determine the memory +region and Root Decoder for an Endpoint and its corresponding HDM decoder. +Region creation would fail. Platforms with a different interconnect architecture +would fail to set up and use CXL. + +References +---------- + +.. [#cxl-spec-3.2] Compute Express Link Specification, Revision 3.2, Version 1.0, + https://www.computeexpresslink.org/ + +.. 
[#amd-ppr-58088] AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh, + ACPI v6.5 Porting Guide, Publication # 58088, + https://www.amd.com/en/search/documentation/hub.html + +.. [#prm-spec] Platform Runtime Mechanism, Version: 1.1, + https://uefi.org/sites/default/files/resources/PRM_Platform_Runtime_Mechanism_1_1_release_candidate.pdf + +Detailed Description of the Change +---------------------------------- + +The following describes the necessary changes to the *CXL 3.2 specification* +[#cxl-spec-3.2]_: + +Add the following reference to the table: + +Table 1-2. Reference Documents + ++----------------------------+-------------------+---------------------------+ +| Document | Chapter Reference | Document No./Location | ++============================+===================+===========================+ +| Platform Runtime Mechanism | Chapter 8, 9 | https://www.uefi.org/acpi | +| Version: 1.1 | | | ++----------------------------+-------------------+---------------------------+ + +Add the following paragraphs to the end of the section: + +**8.2.4.20 CXL HDM Decoder Capability Structure** + +"A device may use an HPA space that is not common to other components of the +host domain. The platform is responsible for address translation when crossing +HPA spaces. The Operating System must determine the interleaving configuration +and perform address translation to the HPA ranges of the HDM decoders as needed. +The translation mechanism is host-specific and implementation dependent. + +The platform indicates support of independent HPA spaces and the need for +address translation by providing a Platform Runtime Mechanism (PRM) handler. The +OS shall use that handler to perform the necessary translations from the DPA +space to the HPA space. The handler is defined in Section 9.18.4 *PRM Handler +for CXL DPA to System Physical Address Translation*." 
+ +Add the following section and sub-section including tables: + +**9.18.4 PRM Handler for CXL DPA to System Physical Address Translation** + +"A platform may be configured to use 'Normalized addresses'. Host physical +address (HPA) spaces are component-specific and differ from system physical +addresses (SPAs). The endpoint has its own physical address space. All requests +presented to the device already use Device Physical Addresses (DPAs). The CXL +endpoint decoders have interleaving disabled (1-way interleaving) and the device +does not perform HPA decoding to determine a DPA. + +The platform provides a PRM handler for CXL DPA to System Physical Address +Translation. The PRM handler translates a Device Physical Address (DPA) to a +System Physical Address (SPA) for a specified CXL endpoint. In the address space +of the host, SPA and HPA are equivalent, and the OS shall use this handler to +determine the HPA that corresponds to a device address, for example when +configuring HDM decoders on platforms with Normalized addressing. The GUID and +the parameter buffer format of the handler are specified in section 9.18.4.1. If +the OS identifies the PRM handler, the platform supports Normalized addressing +and the OS must perform DPA address translation as needed." + +**9.18.4.1 PRM Handler Invocation** + +"The OS calls the PRM handler for CXL DPA to System Physical Address Translation +using the direct invocation mechanism. Details of calling a PRM handler are +described in the Platform Runtime Mechanism (PRM) specification. + +The PRM handler is identified by the following GUID: + + EE41B397-25D4-452C-AD54-48C6E3480B94 + +The caller allocates and prepares a Parameter Buffer, then passes the PRM +handler GUID and a pointer to the Parameter Buffer to invoke the handler. The +Parameter Buffer is described in Table 9-32." + +**Table 9-32. 
PRM Parameter Buffer used for CXL DPA to System Physical Address Translation** + ++-------------+-----------+------------------------------------------------------------------------+ +| Byte Offset | Length in | Description | +| | Bytes | | ++=============+===========+========================================================================+ +| 00h | 8 | **CXL Device Physical Address (DPA)**: CXL DPA (e.g., from | +| | | CXL Component Event Log) | ++-------------+-----------+------------------------------------------------------------------------+ +| 08h | 4 | **CXL Endpoint SBDF**: | +| | | | +| | | - Byte 3 - PCIe Segment | +| | | - Byte 2 - Bus Number | +| | | - Byte 1: | +| | | - Device Number Bits[7:3] | +| | | - Function Number Bits[2:0] | +| | | - Byte 0 - RESERVED (MBZ) | +| | | | ++-------------+-----------+------------------------------------------------------------------------+ +| 0Ch | 8 | **Output Buffer**: Virtual Address Pointer to the buffer, | +| | | as defined in Table 9-33. | ++-------------+-----------+------------------------------------------------------------------------+ + +**Table 9-33. PRM Output Buffer used for CXL DPA to System Physical Address Translation** + ++-------------+-----------+------------------------------------------------------------------------+ +| Byte Offset | Length in | Description | +| | Bytes | | ++=============+===========+========================================================================+ +| 00h | 8 | **System Physical Address (SPA)**: The SPA converted | +| | | from the CXL DPA. | ++-------------+-----------+------------------------------------------------------------------------+ From df8b57c34b47e0acbe1133ca58ac75ec3c56771f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:17 +0100 Subject: [PATCH 47/59] cxl/region: Rename misleading variable name @hpa to @hpa_range @hpa is actually a @hpa_range, rename variables accordingly. 
Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-2-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ae899f68551f..51f1a5545324 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3474,9 +3474,9 @@ static int match_decoder_by_range(struct device *dev, const void *data) } static struct cxl_decoder * -cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa) +cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa_range) { - struct device *cxld_dev = device_find_child(&port->dev, hpa, + struct device *cxld_dev = device_find_child(&port->dev, hpa_range, match_decoder_by_range); return cxld_dev ? to_cxl_decoder(cxld_dev) : NULL; @@ -3489,14 +3489,14 @@ cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled) struct cxl_port *port = cxled_to_port(cxled); struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); struct cxl_decoder *root, *cxld = &cxled->cxld; - struct range *hpa = &cxld->hpa_range; + struct range *hpa_range = &cxld->hpa_range; - root = cxl_port_find_switch_decoder(&cxl_root->port, hpa); + root = cxl_port_find_switch_decoder(&cxl_root->port, hpa_range); if (!root) { dev_err(cxlmd->dev.parent, "%s:%s no CXL window for range %#llx:%#llx\n", dev_name(&cxlmd->dev), dev_name(&cxld->dev), - cxld->hpa_range.start, cxld->hpa_range.end); + hpa_range->start, hpa_range->end); return NULL; } @@ -3562,7 +3562,7 @@ static int __construct_region(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct range *hpa = &cxled->cxld.hpa_range; + struct range *hpa_range = &cxled->cxld.hpa_range; struct cxl_region_params *p; struct 
resource *res; int rc; @@ -3583,7 +3583,7 @@ static int __construct_region(struct cxl_region *cxlr, if (!res) return -ENOMEM; - *res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa), + *res = DEFINE_RES_MEM_NAMED(hpa_range->start, range_len(hpa_range), dev_name(&cxlr->dev)); rc = cxl_extended_linear_cache_resize(cxlr, res); @@ -3666,11 +3666,12 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, } static struct cxl_region * -cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa) +cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, + struct range *hpa_range) { struct device *region_dev; - region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa, + region_dev = device_find_child(&cxlrd->cxlsd.cxld.dev, hpa_range, match_region_by_range); if (!region_dev) return NULL; @@ -3680,7 +3681,7 @@ cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, struct range *hpa) int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) { - struct range *hpa = &cxled->cxld.hpa_range; + struct range *hpa_range = &cxled->cxld.hpa_range; struct cxl_region_params *p; bool attach = false; int rc; @@ -3691,12 +3692,13 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) return -ENXIO; /* - * Ensure that if multiple threads race to construct_region() for @hpa - * one does the construction and the others add to that. + * Ensure that, if multiple threads race to construct_region() + * for the HPA range, one does the construction and the others + * add to that. 
*/ mutex_lock(&cxlrd->range_lock); struct cxl_region *cxlr __free(put_cxl_region) = - cxl_find_region_by_range(cxlrd, hpa); + cxl_find_region_by_range(cxlrd, hpa_range); if (!cxlr) cxlr = construct_region(cxlrd, cxled); mutex_unlock(&cxlrd->range_lock); From 4fe82279580d10ba63c1461ff404f2c6c82ff1d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:18 +0100 Subject: [PATCH 48/59] cxl/region: Store root decoder in struct cxl_region A region is always bound to a root decoder. The region's associated root decoder is often needed. Add it to struct cxl_region. This simplifies the code by removing dynamic lookups and the root decoder argument from the function argument list where possible. Patch is a prerequisite to implement address translation which uses struct cxl_region to store all relevant region and interleaving parameters. It changes the argument list of __construct_region() in preparation of adding a context argument. Additionally the arg list of cxl_region_attach_position() is simplified and the use of to_cxl_root_decoder() removed, which always reconstructs and checks the pointer. The pointer never changes and is frequently used. Code becomes more readable as this emphasizes the binding between both objects.
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-3-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 37 +++++++++++++++++++------------------ drivers/cxl/cxl.h | 2 ++ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 51f1a5545324..22bd8ff37cef 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -489,9 +489,9 @@ static ssize_t interleave_ways_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); - struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; unsigned int val, save; int rc; @@ -552,9 +552,9 @@ static ssize_t interleave_granularity_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); - struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region_params *p = &cxlr->params; int rc, val; u16 ig; @@ -628,7 +628,7 @@ static DEVICE_ATTR_RO(mode); static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_region_params *p = &cxlr->params; struct resource *res; u64 remainder = 0; @@ -1373,7 +1373,7 @@ static int cxl_port_setup_targets(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) { - struct 
cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; int parent_iw, parent_ig, ig, iw, rc, pos = cxled->pos; struct cxl_port *parent_port = to_cxl_port(port->dev.parent); struct cxl_region_ref *cxl_rr = cxl_rr_load(port, cxlr); @@ -1731,10 +1731,10 @@ static int cxl_region_validate_position(struct cxl_region *cxlr, } static int cxl_region_attach_position(struct cxl_region *cxlr, - struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder *cxled, const struct cxl_dport *dport, int pos) { + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_switch_decoder *cxlsd = &cxlrd->cxlsd; struct cxl_decoder *cxld = &cxlsd->cxld; @@ -1971,7 +1971,7 @@ static int cxl_region_sort_targets(struct cxl_region *cxlr) static int cxl_region_attach(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled, int pos) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_region_params *p = &cxlr->params; @@ -2076,8 +2076,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, ep_port = cxled_to_port(cxled); dport = cxl_find_dport_by_dev(root_port, ep_port->host_bridge); - rc = cxl_region_attach_position(cxlr, cxlrd, cxled, - dport, i); + rc = cxl_region_attach_position(cxlr, cxled, dport, i); if (rc) return rc; } @@ -2100,7 +2099,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, if (rc) return rc; - rc = cxl_region_attach_position(cxlr, cxlrd, cxled, dport, pos); + rc = cxl_region_attach_position(cxlr, cxled, dport, pos); if (rc) return rc; @@ -2396,8 +2395,8 @@ static const struct attribute_group *region_groups[] = { static void cxl_region_release(struct device *dev) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); struct cxl_region *cxlr = to_cxl_region(dev); + 
struct cxl_root_decoder *cxlrd = cxlr->cxlrd; int id = atomic_read(&cxlrd->region_id); /* @@ -2480,10 +2479,12 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i * region id allocations */ get_device(dev->parent); + cxlr->cxlrd = cxlrd; + cxlr->id = id; + device_set_pm_not_required(dev); dev->bus = &cxl_bus_type; dev->type = &cxl_region_type; - cxlr->id = id; cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld); return cxlr; @@ -3115,7 +3116,7 @@ EXPORT_SYMBOL_FOR_MODULES(cxl_calculate_hpa_offset, "cxl_translate"); u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u64 dpa) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_region_params *p = &cxlr->params; struct cxl_endpoint_decoder *cxled = NULL; u64 dpa_offset, hpa_offset, hpa; @@ -3168,7 +3169,7 @@ static int region_offset_to_dpa_result(struct cxl_region *cxlr, u64 offset, struct dpa_result *result) { struct cxl_region_params *p = &cxlr->params; - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_endpoint_decoder *cxled; u64 hpa, hpa_offset, dpa_offset; u16 eig = 0; @@ -3522,7 +3523,7 @@ static int match_region_by_range(struct device *dev, const void *data) static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, struct resource *res) { - struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_region_params *p = &cxlr->params; resource_size_t size = resource_size(res); resource_size_t cache_size, start; @@ -3558,9 +3559,9 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, } static int __construct_region(struct cxl_region *cxlr, - struct cxl_root_decoder *cxlrd, struct cxl_endpoint_decoder *cxled) { + struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_memdev *cxlmd = 
cxled_to_memdev(cxled); struct range *hpa_range = &cxled->cxld.hpa_range; struct cxl_region_params *p; @@ -3656,7 +3657,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, return cxlr; } - rc = __construct_region(cxlr, cxlrd, cxled); + rc = __construct_region(cxlr, cxled); if (rc) { devm_release_action(port->uport_dev, unregister_region, cxlr); return ERR_PTR(rc); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ba17fa86d249..10ce9c3a8a55 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -529,6 +529,7 @@ enum cxl_partition_mode { * struct cxl_region - CXL region * @dev: This region's device * @id: This region's id. Id is globally unique across all regions + * @cxlrd: Region's root decoder * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown @@ -542,6 +543,7 @@ enum cxl_partition_mode { struct cxl_region { struct device dev; int id; + struct cxl_root_decoder *cxlrd; enum cxl_partition_mode mode; enum cxl_decoder_type type; struct cxl_nvdimm_bridge *cxl_nvb; From 98ceb1a42dab91c6dcf95d1d424cba61b0f9bc5c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:19 +0100 Subject: [PATCH 49/59] cxl/region: Store HPA range in struct cxl_region Each region has a known host physical address (HPA) range it is assigned to. Endpoint decoders assigned to a region share the same HPA range. The region's address range is the system's physical address (SPA) range. Endpoint decoders in systems that need address translation use HPAs which are not SPAs. To make the SPA range accessible to the endpoint decoders, store and track the region's SPA range in struct cxl_region. Introduce the @hpa_range member to the struct. Now, the SPA range of an endpoint decoder can be determined based on its assigned region. 
Patch is a prerequisite to implement address translation which uses struct cxl_region to store all relevant region and interleaving parameters. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Gregory Price Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-4-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 7 +++++++ drivers/cxl/cxl.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 22bd8ff37cef..04c3ff66ec81 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -664,6 +664,8 @@ static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) return PTR_ERR(res); } + cxlr->hpa_range = DEFINE_RANGE(res->start, res->end); + p->res = res; p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; @@ -700,6 +702,8 @@ static int free_hpa(struct cxl_region *cxlr) if (p->state >= CXL_CONFIG_ACTIVE) return -EBUSY; + cxlr->hpa_range = DEFINE_RANGE(0, -1); + cxl_region_iomem_release(cxlr); p->state = CXL_CONFIG_IDLE; return 0; @@ -2453,6 +2457,8 @@ static void unregister_region(void *_cxlr) for (i = 0; i < p->interleave_ways; i++) detach_target(cxlr, i); + cxlr->hpa_range = DEFINE_RANGE(0, -1); + cxl_region_iomem_release(cxlr); put_device(&cxlr->dev); } @@ -3579,6 +3585,7 @@ static int __construct_region(struct cxl_region *cxlr, } set_bit(CXL_REGION_F_AUTO, &cxlr->flags); + cxlr->hpa_range = *hpa_range; res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 10ce9c3a8a55..3a5ca1936ed1 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -530,6 +530,7 @@ enum cxl_partition_mode { * @dev: This region's device * @id: This region's id. 
Id is globally unique across all regions * @cxlrd: Region's root decoder + * @hpa_range: Address range occupied by the region * @mode: Operational mode of the mapped capacity * @type: Endpoint decoder target type * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem setup / shutdown @@ -544,6 +545,7 @@ struct cxl_region { struct device dev; int id; struct cxl_root_decoder *cxlrd; + struct range hpa_range; enum cxl_partition_mode mode; enum cxl_decoder_type type; struct cxl_nvdimm_bridge *cxl_nvb; From 3e422caa40d0d4bf25ece6e82418ce642d56524a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:20 +0100 Subject: [PATCH 50/59] cxl: Simplify cxl_root_ops allocation and handling A root port's callback handlers are collected in struct cxl_root_ops. The structure is dynamically allocated, though it contains only a single pointer in it. This also requires checking two pointers to check for the existence of a callback. Simplify the allocation, release and handler check by embedding the ops statically in struct cxl_root.
Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-5-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 7 ++----- drivers/cxl/core/cdat.c | 8 ++++---- drivers/cxl/core/port.c | 8 ++------ drivers/cxl/cxl.h | 19 ++++++++++--------- 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index 77ac940e3013..b4bed40ef7c0 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -318,10 +318,6 @@ static int cxl_acpi_qos_class(struct cxl_root *cxl_root, return cxl_acpi_evaluate_qtg_dsm(handle, coord, entries, qos_class); } -static const struct cxl_root_ops acpi_root_ops = { - .qos_class = cxl_acpi_qos_class, -}; - static void del_cxl_resource(struct resource *res) { if (!res) @@ -923,9 +919,10 @@ static int cxl_acpi_probe(struct platform_device *pdev) cxl_res->end = -1; cxl_res->flags = IORESOURCE_MEM; - cxl_root = devm_cxl_add_root(host, &acpi_root_ops); + cxl_root = devm_cxl_add_root(host); if (IS_ERR(cxl_root)) return PTR_ERR(cxl_root); + cxl_root->ops.qos_class = cxl_acpi_qos_class; root_port = &cxl_root->port; rc = bus_for_each_dev(adev->dev.bus, NULL, root_port, diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c index 7120b5f2e31f..18f0f2a25113 100644 --- a/drivers/cxl/core/cdat.c +++ b/drivers/cxl/core/cdat.c @@ -213,7 +213,7 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, if (!cxl_root) return -ENODEV; - if (!cxl_root->ops || !cxl_root->ops->qos_class) + if (!cxl_root->ops.qos_class) return -EOPNOTSUPP; xa_for_each(dsmas_xa, index, dent) { @@ -221,9 +221,9 @@ static int cxl_port_perf_data_calculate(struct cxl_port *port, cxl_coordinates_combine(dent->coord, dent->cdat_coord, ep_c); dent->entries = 1; - rc = cxl_root->ops->qos_class(cxl_root, - &dent->coord[ACCESS_COORDINATE_CPU], - 1, &qos_class); + rc = cxl_root->ops.qos_class(cxl_root, + 
&dent->coord[ACCESS_COORDINATE_CPU], + 1, &qos_class); if (rc != 1) continue; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index fef3aa0c6680..2338d146577c 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -954,19 +954,15 @@ struct cxl_port *devm_cxl_add_port(struct device *host, } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_port, "CXL"); -struct cxl_root *devm_cxl_add_root(struct device *host, - const struct cxl_root_ops *ops) +struct cxl_root *devm_cxl_add_root(struct device *host) { - struct cxl_root *cxl_root; struct cxl_port *port; port = devm_cxl_add_port(host, host, CXL_RESOURCE_NONE, NULL); if (IS_ERR(port)) return ERR_CAST(port); - cxl_root = to_cxl_root(port); - cxl_root->ops = ops; - return cxl_root; + return to_cxl_root(port); } EXPORT_SYMBOL_NS_GPL(devm_cxl_add_root, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 3a5ca1936ed1..0e15dc6e169f 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -646,6 +646,14 @@ struct cxl_port { resource_size_t component_reg_phys; }; +struct cxl_root; + +struct cxl_root_ops { + int (*qos_class)(struct cxl_root *cxl_root, + struct access_coordinate *coord, int entries, + int *qos_class); +}; + /** * struct cxl_root - logical collection of root cxl_port items * @@ -654,7 +662,7 @@ struct cxl_port { */ struct cxl_root { struct cxl_port port; - const struct cxl_root_ops *ops; + struct cxl_root_ops ops; }; static inline struct cxl_root * @@ -663,12 +671,6 @@ to_cxl_root(const struct cxl_port *port) return container_of(port, struct cxl_root, port); } -struct cxl_root_ops { - int (*qos_class)(struct cxl_root *cxl_root, - struct access_coordinate *coord, int entries, - int *qos_class); -}; - static inline struct cxl_dport * cxl_find_dport_by_dev(struct cxl_port *port, const struct device *dport_dev) { @@ -782,8 +784,7 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport_dev, resource_size_t component_reg_phys, struct cxl_dport *parent_dport); -struct 
cxl_root *devm_cxl_add_root(struct device *host, - const struct cxl_root_ops *ops); +struct cxl_root *devm_cxl_add_root(struct device *host); struct cxl_root *find_cxl_root(struct cxl_port *port); DEFINE_FREE(put_cxl_root, struct cxl_root *, if (_T) put_device(&_T->port.dev)) From bc01fd5019faa14f4253de6f6abcae6d957c3a12 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:21 +0100 Subject: [PATCH 51/59] cxl/region: Separate region parameter setup and region construction To construct a region, the region parameters such as address range and interleaving config need to be determined. This is done while constructing the region by inspecting the endpoint decoder configuration. The endpoint decoder is passed as a function argument. With address translation the endpoint decoder data is no longer sufficient to extract the region parameters as some of the information is obtained using other methods such as using firmware calls. In a first step, separate code to determine the region parameters from the region construction. Temporarily store all the data to create the region in the new struct cxl_region_context. Once the region data is determined and struct cxl_region_context is filled, construct the region. Patch is a prerequisite to implement address translation. The code separation helps to later extend it to determine region parameters using other methods as needed, esp. to support address translation. 
Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Gregory Price Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-6-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 8 ++++++++ drivers/cxl/core/region.c | 27 ++++++++++++++++++--------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 1fb66132b777..ae9e1bb51562 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -19,6 +19,14 @@ enum cxl_detach_mode { }; #ifdef CONFIG_CXL_REGION + +struct cxl_region_context { + struct cxl_endpoint_decoder *cxled; + struct range hpa_range; + int interleave_ways; + int interleave_granularity; +}; + extern struct device_attribute dev_attr_create_pmem_region; extern struct device_attribute dev_attr_create_ram_region; extern struct device_attribute dev_attr_delete_region; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 04c3ff66ec81..5ae77e9feb4d 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3565,11 +3565,12 @@ static int cxl_extended_linear_cache_resize(struct cxl_region *cxlr, } static int __construct_region(struct cxl_region *cxlr, - struct cxl_endpoint_decoder *cxled) + struct cxl_region_context *ctx) { + struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_root_decoder *cxlrd = cxlr->cxlrd; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - struct range *hpa_range = &cxled->cxld.hpa_range; + struct range *hpa_range = &ctx->hpa_range; struct cxl_region_params *p; struct resource *res; int rc; @@ -3622,8 +3623,8 @@ static int __construct_region(struct cxl_region *cxlr, } p->res = res; - p->interleave_ways = cxled->cxld.interleave_ways; - p->interleave_granularity = cxled->cxld.interleave_granularity; + p->interleave_ways = ctx->interleave_ways; + p->interleave_granularity = ctx->interleave_granularity; p->state 
= CXL_CONFIG_INTERLEAVE_ACTIVE; rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); @@ -3643,8 +3644,9 @@ static int __construct_region(struct cxl_region *cxlr, /* Establish an empty region covering the given HPA range */ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, - struct cxl_endpoint_decoder *cxled) + struct cxl_region_context *ctx) { + struct cxl_endpoint_decoder *cxled = ctx->cxled; struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_port *port = cxlrd_to_port(cxlrd); struct cxl_dev_state *cxlds = cxlmd->cxlds; @@ -3664,7 +3666,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd, return cxlr; } - rc = __construct_region(cxlr, cxled); + rc = __construct_region(cxlr, ctx); if (rc) { devm_release_action(port->uport_dev, unregister_region, cxlr); return ERR_PTR(rc); @@ -3689,11 +3691,18 @@ cxl_find_region_by_range(struct cxl_root_decoder *cxlrd, int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) { - struct range *hpa_range = &cxled->cxld.hpa_range; + struct cxl_region_context ctx; struct cxl_region_params *p; bool attach = false; int rc; + ctx = (struct cxl_region_context) { + .cxled = cxled, + .hpa_range = cxled->cxld.hpa_range, + .interleave_ways = cxled->cxld.interleave_ways, + .interleave_granularity = cxled->cxld.interleave_granularity, + }; + struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) = cxl_find_root_decoder(cxled); if (!cxlrd) @@ -3706,9 +3715,9 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) */ mutex_lock(&cxlrd->range_lock); struct cxl_region *cxlr __free(put_cxl_region) = - cxl_find_region_by_range(cxlrd, hpa_range); + cxl_find_region_by_range(cxlrd, &ctx.hpa_range); if (!cxlr) - cxlr = construct_region(cxlrd, cxled); + cxlr = construct_region(cxlrd, &ctx); mutex_unlock(&cxlrd->range_lock); rc = PTR_ERR_OR_ZERO(cxlr); From 1fd6c38fc5e18a9904bc1bd447bb4c2708f0292d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 
17:48:22 +0100 Subject: [PATCH 52/59] cxl/region: Add @hpa_range argument to function cxl_calc_interleave_pos() cxl_calc_interleave_pos() uses the endpoint decoder's HPA range to determine its interleaving position. This requires the endpoint decoders to be an SPA, which is not the case for systems that need address translation. Add a separate @hpa_range argument to function cxl_calc_interleave_pos() to specify the address range. Now it is possible to pass the SPA translated address range of an endpoint decoder to function cxl_calc_interleave_pos(). Refactor only, no functional changes. Patch is a prerequisite to implement address translation. Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Gregory Price Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-7-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 5ae77e9feb4d..60d2d1dae2aa 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1878,6 +1878,7 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range, /** * cxl_calc_interleave_pos() - calculate an endpoint position in a region * @cxled: endpoint decoder member of given region + * @hpa_range: translated HPA range of the endpoint * * The endpoint position is calculated by traversing the topology from * the endpoint to the root decoder and iteratively applying this @@ -1890,11 +1891,11 @@ static int find_pos_and_ways(struct cxl_port *port, struct range *range, * Return: position >= 0 on success * -ENXIO on failure */ -static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled) +static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled, + struct range *hpa_range) { struct cxl_port *iter, *port = cxled_to_port(cxled); struct cxl_memdev 
*cxlmd = cxled_to_memdev(cxled); - struct range *range = &cxled->cxld.hpa_range; int parent_ways = 0, parent_pos = 0, pos = 0; int rc; @@ -1932,7 +1933,8 @@ static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled) if (is_cxl_root(iter)) break; - rc = find_pos_and_ways(iter, range, &parent_pos, &parent_ways); + rc = find_pos_and_ways(iter, hpa_range, &parent_pos, + &parent_ways); if (rc) return rc; @@ -1942,7 +1944,7 @@ static int cxl_calc_interleave_pos(struct cxl_endpoint_decoder *cxled) dev_dbg(&cxlmd->dev, "decoder:%s parent:%s port:%s range:%#llx-%#llx pos:%d\n", dev_name(&cxled->cxld.dev), dev_name(cxlmd->dev.parent), - dev_name(&port->dev), range->start, range->end, pos); + dev_name(&port->dev), hpa_range->start, hpa_range->end, pos); return pos; } @@ -1955,7 +1957,7 @@ static int cxl_region_sort_targets(struct cxl_region *cxlr) for (i = 0; i < p->nr_targets; i++) { struct cxl_endpoint_decoder *cxled = p->targets[i]; - cxled->pos = cxl_calc_interleave_pos(cxled); + cxled->pos = cxl_calc_interleave_pos(cxled, &cxlr->hpa_range); /* * Record that sorting failed, but still continue to calc * cxled->pos so that follow-on code paths can reliably @@ -2139,7 +2141,7 @@ static int cxl_region_attach(struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled = p->targets[i]; int test_pos; - test_pos = cxl_calc_interleave_pos(cxled); + test_pos = cxl_calc_interleave_pos(cxled, &cxlr->hpa_range); dev_dbg(&cxled->cxld.dev, "Test cxl_calc_interleave_pos(): %s test_pos:%d cxled->pos:%d\n", (test_pos == cxled->pos) ? "success" : "fail", From d01149bbe76d81d360ed24853d5247fcaad873e4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:23 +0100 Subject: [PATCH 53/59] cxl/region: Use region data to get the root decoder To find a region's root decoder, the endpoint's HPA range is used to search the matching decoder by its range. 
With address translation the endpoint decoder's range is in a different address space and thus cannot be used to determine the root decoder. The region parameters are encapsulated within struct cxl_region_context and may include the translated Host Physical Address (HPA) range. Use this context to identify the root decoder rather than relying on the endpoint. Modify cxl_find_root_decoder() and add the region context as parameter. Rename this function to get_cxl_root_decoder() as a counterpart to put_cxl_root_decoder(). Simplify the implementation by removing function cxl_port_find_switch_decoder(). The function is unnecessary because it is not referenced or utilized elsewhere in the code. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Gregory Price Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-8-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 50 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 60d2d1dae2aa..912796fd708e 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3469,47 +3469,44 @@ err: return rc; } -static int match_decoder_by_range(struct device *dev, const void *data) +static int match_root_decoder(struct device *dev, const void *data) { const struct range *r1, *r2 = data; - struct cxl_decoder *cxld; + struct cxl_root_decoder *cxlrd; - if (!is_switch_decoder(dev)) + if (!is_root_decoder(dev)) return 0; - cxld = to_cxl_decoder(dev); - r1 = &cxld->hpa_range; + cxlrd = to_cxl_root_decoder(dev); + r1 = &cxlrd->cxlsd.cxld.hpa_range; + return range_contains(r1, r2); } -static struct cxl_decoder * -cxl_port_find_switch_decoder(struct cxl_port *port, struct range *hpa_range) -{ - struct device *cxld_dev = device_find_child(&port->dev, hpa_range, - match_decoder_by_range); - - return cxld_dev ?
to_cxl_decoder(cxld_dev) : NULL; -} - +/* + * Note, when finished with the device, drop the reference with + * put_device() or use the put_cxl_root_decoder helper. + */ static struct cxl_root_decoder * -cxl_find_root_decoder(struct cxl_endpoint_decoder *cxled) +get_cxl_root_decoder(struct cxl_endpoint_decoder *cxled, + struct cxl_region_context *ctx) { struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); struct cxl_port *port = cxled_to_port(cxled); struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); - struct cxl_decoder *root, *cxld = &cxled->cxld; - struct range *hpa_range = &cxld->hpa_range; + struct device *cxlrd_dev; - root = cxl_port_find_switch_decoder(&cxl_root->port, hpa_range); - if (!root) { + cxlrd_dev = device_find_child(&cxl_root->port.dev, &ctx->hpa_range, + match_root_decoder); + if (!cxlrd_dev) { dev_err(cxlmd->dev.parent, "%s:%s no CXL window for range %#llx:%#llx\n", - dev_name(&cxlmd->dev), dev_name(&cxld->dev), - hpa_range->start, hpa_range->end); - return NULL; + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + ctx->hpa_range.start, ctx->hpa_range.end); + return ERR_PTR(-ENXIO); } - return to_cxl_root_decoder(&root->dev); + return to_cxl_root_decoder(cxlrd_dev); } static int match_region_by_range(struct device *dev, const void *data) @@ -3706,9 +3703,10 @@ int cxl_add_to_region(struct cxl_endpoint_decoder *cxled) }; struct cxl_root_decoder *cxlrd __free(put_cxl_root_decoder) = - cxl_find_root_decoder(cxled); - if (!cxlrd) - return -ENXIO; + get_cxl_root_decoder(cxled, &ctx); + + if (IS_ERR(cxlrd)) + return PTR_ERR(cxlrd); /* * Ensure that, if multiple threads race to construct_region() From a31af41115b0f7021a86f5439cb8720b93314f91 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:24 +0100 Subject: [PATCH 54/59] cxl: Introduce callback for HPA address ranges translation Introduce a callback to translate an endpoint's HPA range to the address range of the root port which is the System Physical Address 
(SPA) range used by a region. The callback can be set if a platform needs to handle address translation. The callback is attached to the root port. An endpoint's root port can easily be determined in the PCI hierarchy without any CXL specific knowledge. This allows the early use of address translation for CXL enumeration. Address translation is esp. needed for the detection of the root decoders. Thus, the callback is embedded in struct cxl_root_ops instead of struct cxl_rd_ops. Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Alison Schofield Tested-by: Gregory Price Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-9-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 24 ++++++++++++++++++++++++ drivers/cxl/cxl.h | 1 + 2 files changed, 25 insertions(+) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 912796fd708e..ed8469fa55a9 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3483,6 +3483,15 @@ static int match_root_decoder(struct device *dev, const void *data) return range_contains(r1, r2); } +static int cxl_root_setup_translation(struct cxl_root *cxl_root, + struct cxl_region_context *ctx) +{ + if (!cxl_root->ops.translation_setup_root) + return 0; + + return cxl_root->ops.translation_setup_root(cxl_root, ctx); +} + /* * Note, when finished with the device, drop the reference with * put_device() or use the put_cxl_root_decoder helper. @@ -3495,6 +3504,21 @@ get_cxl_root_decoder(struct cxl_endpoint_decoder *cxled, struct cxl_port *port = cxled_to_port(cxled); struct cxl_root *cxl_root __free(put_cxl_root) = find_cxl_root(port); struct device *cxlrd_dev; + int rc; + + /* + * Adjust the endpoint's HPA range and interleaving + * configuration to the root decoder’s memory space before + * setting up the root decoder. 
+ */ + rc = cxl_root_setup_translation(cxl_root, ctx); + if (rc) { + dev_err(cxlmd->dev.parent, + "%s:%s Failed to setup translation for address range %#llx:%#llx\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + ctx->hpa_range.start, ctx->hpa_range.end); + return ERR_PTR(rc); + } cxlrd_dev = device_find_child(&cxl_root->port.dev, &ctx->hpa_range, match_root_decoder); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 0e15dc6e169f..8ea334d81edf 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -652,6 +652,7 @@ struct cxl_root_ops { int (*qos_class)(struct cxl_root *cxl_root, struct access_coordinate *coord, int entries, int *qos_class); + int (*translation_setup_root)(struct cxl_root *cxl_root, void *data); }; /** From 7be03eae1fdb690dff8f102a7306ca61b55a810c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:25 +0100 Subject: [PATCH 55/59] cxl/acpi: Prepare use of EFI runtime services In order to use EFI runtime services, esp. ACPI PRM which uses the efi_rts_wq workqueue, initialize EFI before CXL ACPI. There is a subsys_initcall order dependency if driver is builtin: subsys_initcall(cxl_acpi_init); subsys_initcall(efisubsys_init); Prevent the efi_rts_wq workqueue being used by cxl_acpi_init() before its allocation. Use subsys_initcall_sync(cxl_acpi_init) to always run efisubsys_init() first. 
Reported-by: Gregory Price Tested-by: Joshua Hahn Reviewed-by: Joshua Hahn Reviewed-by: Gregory Price Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Gregory Price Signed-off-by: Robert Richter Reviewed-by: Dave Jiang > --- Link: https://patch.msgid.link/20260114164837.1076338-10-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/acpi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index b4bed40ef7c0..a31d0f97f916 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -1005,8 +1005,12 @@ static void __exit cxl_acpi_exit(void) cxl_bus_drain(); } -/* load before dax_hmem sees 'Soft Reserved' CXL ranges */ -subsys_initcall(cxl_acpi_init); +/* + * Load before dax_hmem sees 'Soft Reserved' CXL ranges. Use + * subsys_initcall_sync() since there is an order dependency with + * subsys_initcall(efisubsys_init), which must run first. + */ +subsys_initcall_sync(cxl_acpi_init); /* * Arrange for host-bridge ports to be active synchronous with From af74daf91652f15b82560bb93850d2ec8bbfa976 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 27 Jan 2026 11:12:31 -0700 Subject: [PATCH 56/59] cxl: Enable AMD Zen5 address translation using ACPI PRMT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add AMD Zen5 support for address translation. Zen5 systems may be configured to use 'Normalized addresses'. Then, host physical addresses (HPA) are different from their system physical addresses (SPA). The endpoint has its own physical address space and an incoming HPA is already converted to the device's physical address (DPA). Thus it has interleaving disabled and CXL endpoints are programmed passthrough (DPA == HPA). Host Physical Addresses (HPAs) need to be translated from the endpoint to its CXL host bridge, esp. to identify the endpoint's root decoder and region's address range. 
ACPI Platform Runtime Mechanism (PRM) provides a handler to translate the DPA to its SPA. This is documented in: AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh ACPI v6.5 Porting Guide, Publication # 58088 https://www.amd.com/en/search/documentation/hub.html With Normalized Addressing this PRM handler must be used to translate an HPA of an endpoint to its SPA. Do the following to implement AMD Zen5 address translation: Introduce a new file core/atl.c to handle ACPI PRM specific address translation code. Naming is loosely related to the kernel's AMD Address Translation Library (CONFIG_AMD_ATL) but implementation does not depend on it, nor is it vendor specific. Use Kbuild and Kconfig options respectively to enable the code depending on architecture and platform options. AMD Zen5 systems support the ACPI PRM CXL Address Translation firmware call (see ACPI v6.5 Porting Guide, Address Translation - CXL DPA to System Physical Address). Firmware enables the PRM handler if the platform has address translation implemented. Check firmware and kernel support of ACPI PRM using the specific GUID. On success enable address translation by setting up the earlier introduced root port callback, see function cxl_prm_setup_translation(). Setup is done in cxl_setup_prm_address_translation(); it is the only function that needs to be exported. For low level PRM firmware calls, use the ACPI framework. Identify the region's interleaving ways by inspecting the address ranges. Also determine the interleaving granularity using the address translation callback. Note that the position of the chunk from one interleaving block to the next may vary and thus cannot be considered constant. Address offsets larger than the interleaving block size cannot be used to calculate the granularity. Thus, probe the granularity using address translation for various HPAs in the same interleaving block.
[ dj: Add atl.o build to cxl_test ] Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Tested-by: Gregory Price Signed-off-by: Robert Richter Link: https://patch.msgid.link/20260114164837.1076338-11-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/Kconfig | 5 + drivers/cxl/acpi.c | 2 + drivers/cxl/core/Makefile | 1 + drivers/cxl/core/atl.c | 190 ++++++++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 7 ++ tools/testing/cxl/Kbuild | 1 + 6 files changed, 206 insertions(+) create mode 100644 drivers/cxl/core/atl.c diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 48b7314afdb8..103950a9b73e 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -233,4 +233,9 @@ config CXL_MCE def_bool y depends on X86_MCE && MEMORY_FAILURE +config CXL_ATL + def_bool y + depends on CXL_REGION + depends on ACPI_PRMT && AMD_NB + endif diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c index a31d0f97f916..50c2987e0459 100644 --- a/drivers/cxl/acpi.c +++ b/drivers/cxl/acpi.c @@ -925,6 +925,8 @@ static int cxl_acpi_probe(struct platform_device *pdev) cxl_root->ops.qos_class = cxl_acpi_qos_class; root_port = &cxl_root->port; + cxl_setup_prm_address_translation(cxl_root); + rc = bus_for_each_dev(adev->dev.bus, NULL, root_port, add_host_bridge_dport); if (rc < 0) diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 5ad8fef210b5..11fe272a6e29 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -20,3 +20,4 @@ cxl_core-$(CONFIG_CXL_REGION) += region.o cxl_core-$(CONFIG_CXL_MCE) += mce.o cxl_core-$(CONFIG_CXL_FEATURES) += features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += edac.o +cxl_core-$(CONFIG_CXL_ATL) += atl.o diff --git a/drivers/cxl/core/atl.c b/drivers/cxl/core/atl.c new file mode 100644 index 000000000000..c36984686fb0 --- /dev/null +++ b/drivers/cxl/core/atl.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include + +#include +#include "core.h" + +/* + * PRM Address Translation - CXL DPA to System Physical Address + * + * Reference: + * + * AMD Family 1Ah Models 00h–0Fh and Models 10h–1Fh + * ACPI v6.5 Porting Guide, Publication # 58088 + */ + +static const guid_t prm_cxl_dpa_spa_guid = + GUID_INIT(0xee41b397, 0x25d4, 0x452c, 0xad, 0x54, 0x48, 0xc6, 0xe3, + 0x48, 0x0b, 0x94); + +struct prm_cxl_dpa_spa_data { + u64 dpa; + u8 reserved; + u8 devfn; + u8 bus; + u8 segment; + u64 *spa; +} __packed; + +static u64 prm_cxl_dpa_spa(struct pci_dev *pci_dev, u64 dpa) +{ + struct prm_cxl_dpa_spa_data data; + u64 spa; + int rc; + + data = (struct prm_cxl_dpa_spa_data) { + .dpa = dpa, + .devfn = pci_dev->devfn, + .bus = pci_dev->bus->number, + .segment = pci_domain_nr(pci_dev->bus), + .spa = &spa, + }; + + rc = acpi_call_prm_handler(prm_cxl_dpa_spa_guid, &data); + if (rc) { + pci_dbg(pci_dev, "failed to get SPA for %#llx: %d\n", dpa, rc); + return ULLONG_MAX; + } + + pci_dbg(pci_dev, "PRM address translation: DPA -> SPA: %#llx -> %#llx\n", dpa, spa); + + return spa; +} + +static int cxl_prm_setup_root(struct cxl_root *cxl_root, void *data) +{ + struct cxl_region_context *ctx = data; + struct cxl_endpoint_decoder *cxled = ctx->cxled; + struct cxl_decoder *cxld = &cxled->cxld; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct range hpa_range = ctx->hpa_range; + struct pci_dev *pci_dev; + u64 spa_len, len; + u64 addr, base_spa, base; + int ways, gran; + + /* + * When Normalized Addressing is enabled, the endpoint maintains a 1:1 + * mapping between HPA and DPA. If disabled, skip address translation + * and perform only a range check. + */ + if (hpa_range.start != cxled->dpa_res->start) + return 0; + + /* + * Endpoints are programmed passthrough in Normalized Addressing mode. 
+ */ + if (ctx->interleave_ways != 1) { + dev_dbg(&cxld->dev, "unexpected interleaving config: ways: %d granularity: %d\n", + ctx->interleave_ways, ctx->interleave_granularity); + return -ENXIO; + } + + if (!cxlmd || !dev_is_pci(cxlmd->dev.parent)) { + dev_dbg(&cxld->dev, "No endpoint found: %s, range %#llx-%#llx\n", + dev_name(cxld->dev.parent), hpa_range.start, + hpa_range.end); + return -ENXIO; + } + + pci_dev = to_pci_dev(cxlmd->dev.parent); + + /* Translate HPA range to SPA. */ + base = hpa_range.start; + hpa_range.start = prm_cxl_dpa_spa(pci_dev, hpa_range.start); + hpa_range.end = prm_cxl_dpa_spa(pci_dev, hpa_range.end); + base_spa = hpa_range.start; + + if (hpa_range.start == ULLONG_MAX || hpa_range.end == ULLONG_MAX) { + dev_dbg(cxld->dev.parent, + "CXL address translation: Failed to translate HPA range: %#llx-%#llx:%#llx-%#llx(%s)\n", + hpa_range.start, hpa_range.end, ctx->hpa_range.start, + ctx->hpa_range.end, dev_name(&cxld->dev)); + return -ENXIO; + } + + /* + * Since translated addresses include the interleaving offsets, align + * the range to 256 MB. + */ + hpa_range.start = ALIGN_DOWN(hpa_range.start, SZ_256M); + hpa_range.end = ALIGN(hpa_range.end, SZ_256M) - 1; + + len = range_len(&ctx->hpa_range); + spa_len = range_len(&hpa_range); + if (!len || !spa_len || spa_len % len) { + dev_dbg(cxld->dev.parent, + "CXL address translation: HPA range not contiguous: %#llx-%#llx:%#llx-%#llx(%s)\n", + hpa_range.start, hpa_range.end, ctx->hpa_range.start, + ctx->hpa_range.end, dev_name(&cxld->dev)); + return -ENXIO; + } + + ways = spa_len / len; + gran = SZ_256; + + /* + * Determine interleave granularity + * + * Note: The position of the chunk from one interleaving block to the + * next may vary and thus cannot be considered constant. Address offsets + * larger than the interleaving block size cannot be used to calculate + * the granularity. 
+ */ + if (ways > 1) { + while (gran <= SZ_16M) { + addr = prm_cxl_dpa_spa(pci_dev, base + gran); + if (addr != base_spa + gran) + break; + gran <<= 1; + } + } + + if (gran > SZ_16M) { + dev_dbg(cxld->dev.parent, + "CXL address translation: Cannot determine granularity: %#llx-%#llx:%#llx-%#llx(%s)\n", + hpa_range.start, hpa_range.end, ctx->hpa_range.start, + ctx->hpa_range.end, dev_name(&cxld->dev)); + return -ENXIO; + } + + ctx->hpa_range = hpa_range; + ctx->interleave_ways = ways; + ctx->interleave_granularity = gran; + + dev_dbg(&cxld->dev, + "address mapping found for %s (hpa -> spa): %#llx+%#llx -> %#llx+%#llx ways:%d granularity:%d\n", + dev_name(cxlmd->dev.parent), base, len, hpa_range.start, + spa_len, ways, gran); + + return 0; +} + +void cxl_setup_prm_address_translation(struct cxl_root *cxl_root) +{ + struct device *host = cxl_root->port.uport_dev; + u64 spa; + struct prm_cxl_dpa_spa_data data = { .spa = &spa }; + int rc; + + /* + * Applies only to PCIe Host Bridges which are children of the CXL Root + * Device (HID=“ACPI0017”). Check this and drop cxl_test instances. 
+ */ + if (!acpi_match_device(host->driver->acpi_match_table, host)) + return; + + /* Check kernel (-EOPNOTSUPP) and firmware support (-ENODEV) */ + rc = acpi_call_prm_handler(prm_cxl_dpa_spa_guid, &data); + if (rc == -EOPNOTSUPP || rc == -ENODEV) + return; + + cxl_root->ops.translation_setup_root = cxl_prm_setup_root; +} +EXPORT_SYMBOL_NS_GPL(cxl_setup_prm_address_translation, "CXL"); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 8ea334d81edf..20b0fd43fa7b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -817,6 +817,13 @@ static inline void cxl_dport_init_ras_reporting(struct cxl_dport *dport, struct device *host) { } #endif +#ifdef CONFIG_CXL_ATL +void cxl_setup_prm_address_translation(struct cxl_root *cxl_root); +#else +static inline +void cxl_setup_prm_address_translation(struct cxl_root *cxl_root) {} +#endif + struct cxl_decoder *to_cxl_decoder(struct device *dev); struct cxl_root_decoder *to_cxl_root_decoder(struct device *dev); struct cxl_switch_decoder *to_cxl_switch_decoder(struct device *dev); diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0e151d0572d1..612d8edbfc6f 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -63,6 +63,7 @@ cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-$(CONFIG_CXL_MCE) += $(CXL_CORE_SRC)/mce.o cxl_core-$(CONFIG_CXL_FEATURES) += $(CXL_CORE_SRC)/features.o cxl_core-$(CONFIG_CXL_EDAC_MEM_FEATURES) += $(CXL_CORE_SRC)/edac.o +cxl_core-$(CONFIG_CXL_ATL) += $(CXL_CORE_SRC)/atl.o cxl_core-y += config_check.o cxl_core-y += cxl_core_test.o cxl_core-y += cxl_core_exports.o From a2e794895089c1356b7687e8df1fa7d224d40bb6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:27 +0100 Subject: [PATCH 57/59] cxl/atl: Lock decoders that need address translation The current kernel implementation does not support endpoint setup with Normalized Addressing. It only translates an endpoint's DPA to the SPA range of the host bridge. 
Therefore, the endpoint address range cannot be determined, making a non-auto setup impossible. If a decoder requires address translation, reprogramming should be disabled and the decoder locked. The BIOS, however, provides all the necessary address translation data, which the kernel can use to reconfigure endpoint decoders with normalized addresses. Locking the decoders in the BIOS would prevent a capable kernel (or other operating systems) from shutting down auto-generated regions and managing resources dynamically. Reviewed-by: Gregory Price Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Tested-by: Gregory Price Signed-off-by: Robert Richter Reviewed-by: Dave Jiang --- Link: https://patch.msgid.link/20260114164837.1076338-12-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/atl.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/cxl/core/atl.c b/drivers/cxl/core/atl.c index c36984686fb0..09d0ea1792d9 100644 --- a/drivers/cxl/core/atl.c +++ b/drivers/cxl/core/atl.c @@ -154,6 +154,24 @@ static int cxl_prm_setup_root(struct cxl_root *cxl_root, void *data) return -ENXIO; } + /* + * The current kernel implementation does not support endpoint + * setup with Normalized Addressing. It only translates an + * endpoint's DPA to the SPA range of the host bridge. + * Therefore, the endpoint address range cannot be determined, + * making a non-auto setup impossible. If a decoder requires + * address translation, reprogramming should be disabled and + * the decoder locked. + * + * The BIOS, however, provides all the necessary address + * translation data, which the kernel can use to reconfigure + * endpoint decoders with normalized addresses. Locking the + * decoders in the BIOS would prevent a capable kernel (or + * other operating systems) from shutting down auto-generated + * regions and managing resources dynamically.
+ */ + cxld->flags |= CXL_DECODER_F_LOCK; + ctx->hpa_range = hpa_range; ctx->interleave_ways = ways; ctx->interleave_granularity = gran; From d1c9ba46d6c36ff8d5b5f83ae28eae4132e46988 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:28 +0100 Subject: [PATCH 58/59] cxl/region: Factor out code into cxl_region_setup_poison() Poison injection setup code is embedded in cxl_region_probe(). For improved encapsulation, readability, and maintainability, factor out code into function cxl_region_setup_poison(). This patch is a prerequisite to disable poison by region offset for Normalized Addressing. No functional changes. Reviewed-by: Dave Jiang Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20260114164837.1076338-13-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/region.c | 53 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index ed8469fa55a9..80cd77f0842e 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -3916,6 +3916,31 @@ static int cxl_region_debugfs_poison_clear(void *data, u64 offset) DEFINE_DEBUGFS_ATTRIBUTE(cxl_poison_clear_fops, NULL, cxl_region_debugfs_poison_clear, "%llx\n"); +static int cxl_region_setup_poison(struct cxl_region *cxlr) +{ + struct device *dev = &cxlr->dev; + struct cxl_region_params *p = &cxlr->params; + struct dentry *dentry; + + /* Create poison attributes if all memdevs support the capabilities */ + for (int i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) || + !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) + return 0; + } + + dentry = cxl_debugfs_create_dir(dev_name(dev)); + debugfs_create_file("inject_poison", 0200, dentry, 
cxlr, + &cxl_poison_inject_fops); + debugfs_create_file("clear_poison", 0200, dentry, cxlr, + &cxl_poison_clear_fops); + + return devm_add_action_or_reset(dev, remove_debugfs, dentry); +} + static int cxl_region_can_probe(struct cxl_region *cxlr) { struct cxl_region_params *p = &cxlr->params; @@ -3945,7 +3970,6 @@ static int cxl_region_probe(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_region_params *p = &cxlr->params; - bool poison_supported = true; int rc; rc = cxl_region_can_probe(cxlr); @@ -3969,30 +3993,9 @@ static int cxl_region_probe(struct device *dev) if (rc) return rc; - /* Create poison attributes if all memdevs support the capabilities */ - for (int i = 0; i < p->nr_targets; i++) { - struct cxl_endpoint_decoder *cxled = p->targets[i]; - struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); - - if (!cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_INJECT) || - !cxl_memdev_has_poison_cmd(cxlmd, CXL_POISON_ENABLED_CLEAR)) { - poison_supported = false; - break; - } - } - - if (poison_supported) { - struct dentry *dentry; - - dentry = cxl_debugfs_create_dir(dev_name(dev)); - debugfs_create_file("inject_poison", 0200, dentry, cxlr, - &cxl_poison_inject_fops); - debugfs_create_file("clear_poison", 0200, dentry, cxlr, - &cxl_poison_clear_fops); - rc = devm_add_action_or_reset(dev, remove_debugfs, dentry); - if (rc) - return rc; - } + rc = cxl_region_setup_poison(cxlr); + if (rc) + return rc; switch (cxlr->mode) { case CXL_PARTMODE_PMEM: From 208f432406b7ed446c061d68cc73efd85b575d3f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 14 Jan 2026 17:48:29 +0100 Subject: [PATCH 59/59] cxl: Disable HPA/SPA translation handlers for Normalized Addressing The root decoder provides the callbacks hpa_to_spa and spa_to_hpa to perform Host Physical Address (HPA) and System Physical Address translations, respectively. The callbacks are required to convert addresses when HPA != SPA. 
XOR interleaving depends on this mechanism, and the necessary handlers are implemented. The translation handlers are used for poison injection (trace_cxl_poison, cxl_poison_inject_fops) and error handling (cxl_event_trace_record). In AMD Zen5 systems with Normalized Addressing, endpoint addresses are not SPAs, and translation handlers are required for these features to function correctly. Now, as ACPI PRM translation could be expensive in tracing or error handling code paths, do not yet enable translations to avoid its intensive use. Instead, disable those features which are used only for debugging and enhanced logging. Introduce the flag CXL_REGION_F_NORMALIZED_ADDRESSING that indicates Normalized Addressing for a region and use it to disable poison injection and DPA to HPA conversion. Note: Dropped unused CXL_DECODER_F_MASK macro. [dj: Fix commit log CXL_REGION_F_NORM_ADDR to CXL_REGION_F_NORMALIZED_ADDRESSING ] Reviewed-by: Alison Schofield Signed-off-by: Robert Richter Reviewed-by: Jonathan Cameron Reviewed-by: Dave Jiang Link: https://patch.msgid.link/20260114164837.1076338-14-rrichter@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/atl.c | 3 +++ drivers/cxl/core/region.c | 33 +++++++++++++++++++++++++-------- drivers/cxl/cxl.h | 9 ++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/drivers/cxl/core/atl.c b/drivers/cxl/core/atl.c index 09d0ea1792d9..310668786189 100644 --- a/drivers/cxl/core/atl.c +++ b/drivers/cxl/core/atl.c @@ -169,8 +169,11 @@ static int cxl_prm_setup_root(struct cxl_root *cxl_root, void *data) * decoders in the BIOS would prevent a capable kernel (or * other operating systems) from shutting down auto-generated * regions and managing resources dynamically. + * + * Indicate that Normalized Addressing is enabled. 
*/ cxld->flags |= CXL_DECODER_F_LOCK; + cxld->flags |= CXL_DECODER_F_NORMALIZED_ADDRESSING; ctx->hpa_range = hpa_range; ctx->interleave_ways = ways; diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 80cd77f0842e..8e92b491d686 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -1097,14 +1097,16 @@ static int cxl_rr_assign_decoder(struct cxl_port *port, struct cxl_region *cxlr, return 0; } -static void cxl_region_set_lock(struct cxl_region *cxlr, - struct cxl_decoder *cxld) +static void cxl_region_setup_flags(struct cxl_region *cxlr, + struct cxl_decoder *cxld) { - if (!test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) - return; + if (test_bit(CXL_DECODER_F_LOCK, &cxld->flags)) { + set_bit(CXL_REGION_F_LOCK, &cxlr->flags); + clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); + } - set_bit(CXL_REGION_F_LOCK, &cxlr->flags); - clear_bit(CXL_REGION_F_NEEDS_RESET, &cxlr->flags); + if (test_bit(CXL_DECODER_F_NORMALIZED_ADDRESSING, &cxld->flags)) + set_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags); } /** @@ -1218,7 +1220,7 @@ static int cxl_port_attach_region(struct cxl_port *port, } } - cxl_region_set_lock(cxlr, cxld); + cxl_region_setup_flags(cxlr, cxld); rc = cxl_rr_ep_add(cxl_rr, cxled); if (rc) { @@ -2493,7 +2495,7 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i device_set_pm_not_required(dev); dev->bus = &cxl_bus_type; dev->type = &cxl_region_type; - cxl_region_set_lock(cxlr, &cxlrd->cxlsd.cxld); + cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld); return cxlr; } @@ -3132,6 +3134,13 @@ u64 cxl_dpa_to_hpa(struct cxl_region *cxlr, const struct cxl_memdev *cxlmd, u8 eiw = 0; int pos; + /* + * Conversion between SPA and DPA is not supported in + * Normalized Address mode. 
+ */ + if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags)) + return ULLONG_MAX; + for (int i = 0; i < p->nr_targets; i++) { if (cxlmd == cxled_to_memdev(p->targets[i])) { cxled = p->targets[i]; @@ -3922,6 +3931,14 @@ static int cxl_region_setup_poison(struct cxl_region *cxlr) struct cxl_region_params *p = &cxlr->params; struct dentry *dentry; + /* + * Do not enable poison injection in Normalized Address mode. + * Conversion between SPA and DPA is required for this, but it is + * not supported in this mode. + */ + if (test_bit(CXL_REGION_F_NORMALIZED_ADDRESSING, &cxlr->flags)) + return 0; + /* Create poison attributes if all memdevs support the capabilities */ for (int i = 0; i < p->nr_targets; i++) { struct cxl_endpoint_decoder *cxled = p->targets[i]; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 20b0fd43fa7b..de30a87600be 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -332,7 +332,7 @@ int cxl_dport_map_rcd_linkcap(struct pci_dev *pdev, struct cxl_dport *dport); #define CXL_DECODER_F_TYPE3 BIT(3) #define CXL_DECODER_F_LOCK BIT(4) #define CXL_DECODER_F_ENABLE BIT(5) -#define CXL_DECODER_F_MASK GENMASK(5, 0) +#define CXL_DECODER_F_NORMALIZED_ADDRESSING BIT(6) enum cxl_decoder_type { CXL_DECODER_DEVMEM = 2, @@ -525,6 +525,13 @@ enum cxl_partition_mode { */ #define CXL_REGION_F_LOCK 2 +/* + * Indicate Normalized Addressing. Use it to disable SPA conversion if + * HPA != SPA and an address translation callback handler does not + * exist. Flag is needed by AMD Zen5 platforms. + */ +#define CXL_REGION_F_NORMALIZED_ADDRESSING 3 + /** * struct cxl_region - CXL region * @dev: This region's device