drm/xe/xe_hw_error: Add fault injection to trigger csc error handler

Add a debugfs fault-injection attribute that triggers the CSC error
handler, which wedges the device and enables runtime survivability mode.

v2: add debugfs only for bmg (Umesh)
v3: do not use csc_fault attribute if debugfs is not enabled
v4: rebase

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Riana Tauro <riana.tauro@intel.com>
Reviewed-by: Raag Jadav <raag.jadav@intel.com>
Link: https://lore.kernel.org/r/20250826063419.3022216-11-riana.tauro@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
This commit is contained in:
Riana Tauro 2025-08-26 12:04:17 +05:30 committed by Rodrigo Vivi
parent a7df563b45
commit d1f51a4f95
No known key found for this signature in database
GPG key ID: FA625F640EEB13CA
2 changed files with 16 additions and 1 deletions

View file

@ -35,6 +35,7 @@
#endif
/*
 * Fault-injection attributes. inject_csc_hw_error is exposed through
 * debugfs by xe_debugfs_register().
 */
DECLARE_FAULT_ATTR(gt_reset_failure);
DECLARE_FAULT_ATTR(inject_csc_hw_error);
static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio,
u32 offset, char *name, struct drm_printer *p)
@ -361,10 +362,13 @@ void xe_debugfs_register(struct xe_device *xe)
ARRAY_SIZE(debugfs_list),
root, minor);
if (xe->info.platform == XE_BATTLEMAGE)
if (xe->info.platform == XE_BATTLEMAGE) {
drm_debugfs_create_files(debugfs_residencies,
ARRAY_SIZE(debugfs_residencies),
root, minor);
fault_create_debugfs_attr("inject_csc_hw_error", root,
&inject_csc_hw_error);
}
debugfs_create_file("forcewake_all", 0400, root, xe,
&forcewake_all_fops);

View file

@ -3,6 +3,8 @@
* Copyright © 2025 Intel Corporation
*/
#include <linux/fault-inject.h>
#include "regs/xe_gsc_regs.h"
#include "regs/xe_hw_error_regs.h"
#include "regs/xe_irq_regs.h"
@ -13,6 +15,7 @@
#include "xe_survivability_mode.h"
#define HEC_UNCORR_FW_ERR_BITS 4
/*
 * Fault attribute defined elsewhere with DECLARE_FAULT_ATTR(); consulted by
 * fault_inject_csc_hw_error() below.
 */
extern struct fault_attr inject_csc_hw_error;
/* Error categories reported by hardware */
enum hardware_error {
@ -43,6 +46,11 @@ static const char *hw_error_to_str(const enum hardware_error hw_err)
}
}
/*
 * Tell the caller whether a CSC hardware error should be injected now.
 *
 * Returns false without consulting the fault attribute when CONFIG_DEBUG_FS
 * is not enabled; otherwise returns the result of should_fail() on
 * inject_csc_hw_error.
 */
static bool fault_inject_csc_hw_error(void)
{
	/* Skip the fault attribute entirely when debugfs is compiled out. */
	if (!IS_ENABLED(CONFIG_DEBUG_FS))
		return false;

	return should_fail(&inject_csc_hw_error, 1);
}
static void csc_hw_error_work(struct work_struct *work)
{
struct xe_tile *tile = container_of(work, typeof(*tile), csc_hw_error_work);
@ -130,6 +138,9 @@ void xe_hw_error_irq_handler(struct xe_tile *tile, const u32 master_ctl)
{
enum hardware_error hw_err;
if (fault_inject_csc_hw_error())
schedule_work(&tile->csc_hw_error_work);
for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++)
if (master_ctl & ERROR_IRQ(hw_err))
hw_error_source_handler(tile, hw_err);