mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:44:45 +01:00
accel/amdxdna: Support getting last hardware error
Add a new parameter, DRM_AMDXDNA_HW_LAST_ASYNC_ERR, to the get-array IOCTL. When the hardware reports an error, the driver saves the error information and a timestamp. The new get-array parameter retrieves the last error. Reviewed-by: Mario Limonciello (AMD) <superm1@kernel.org> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com> Link: https://lore.kernel.org/r/20251014234119.628453-1-lizhi.hou@amd.com
This commit is contained in:
parent
83f81f5499
commit
b291e4f1a4
6 changed files with 159 additions and 19 deletions
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include "aie2_msg_priv.h"
|
||||
#include "aie2_pci.h"
|
||||
#include "amdxdna_error.h"
|
||||
#include "amdxdna_mailbox.h"
|
||||
#include "amdxdna_pci_drv.h"
|
||||
|
||||
|
|
@ -46,6 +47,7 @@ enum aie_module_type {
|
|||
AIE_MEM_MOD = 0,
|
||||
AIE_CORE_MOD,
|
||||
AIE_PL_MOD,
|
||||
AIE_UNKNOWN_MOD,
|
||||
};
|
||||
|
||||
enum aie_error_category {
|
||||
|
|
@ -143,6 +145,31 @@ static const struct aie_event_category aie_ml_shim_tile_event_cat[] = {
|
|||
EVENT_CATEGORY(74U, AIE_ERROR_LOCK),
|
||||
};
|
||||
|
||||
/*
 * Maps a driver-internal AIE error category (enum aie_error_category, used
 * as the array index) to the amdxdna_error_num value that is placed in the
 * encoded error code. The static_assert after the table checks that it has
 * exactly AIE_ERROR_UNKNOWN + 1 entries.
 */
static const enum amdxdna_error_num aie_cat_err_num_map[] = {
|
||||
[AIE_ERROR_SATURATION] = AMDXDNA_ERROR_NUM_AIE_SATURATION,
|
||||
[AIE_ERROR_FP] = AMDXDNA_ERROR_NUM_AIE_FP,
|
||||
[AIE_ERROR_STREAM] = AMDXDNA_ERROR_NUM_AIE_STREAM,
|
||||
[AIE_ERROR_ACCESS] = AMDXDNA_ERROR_NUM_AIE_ACCESS,
|
||||
[AIE_ERROR_BUS] = AMDXDNA_ERROR_NUM_AIE_BUS,
|
||||
[AIE_ERROR_INSTRUCTION] = AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
|
||||
[AIE_ERROR_ECC] = AMDXDNA_ERROR_NUM_AIE_ECC,
|
||||
[AIE_ERROR_LOCK] = AMDXDNA_ERROR_NUM_AIE_LOCK,
|
||||
[AIE_ERROR_DMA] = AMDXDNA_ERROR_NUM_AIE_DMA,
|
||||
[AIE_ERROR_MEM_PARITY] = AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
|
||||
/* Fallback entry; also used when the reported module type is out of range. */
[AIE_ERROR_UNKNOWN] = AMDXDNA_ERROR_NUM_UNKNOWN,
|
||||
};
|
||||
|
||||
static_assert(ARRAY_SIZE(aie_cat_err_num_map) == AIE_ERROR_UNKNOWN + 1);
|
||||
|
||||
/*
 * Maps a driver-internal module type (enum aie_module_type, used as the
 * array index) to the amdxdna_error_module value that is placed in the
 * encoded error code. The static_assert after the table checks that it has
 * exactly AIE_UNKNOWN_MOD + 1 entries.
 */
static const enum amdxdna_error_module aie_err_mod_map[] = {
|
||||
[AIE_MEM_MOD] = AMDXDNA_ERROR_MODULE_AIE_MEMORY,
|
||||
[AIE_CORE_MOD] = AMDXDNA_ERROR_MODULE_AIE_CORE,
|
||||
[AIE_PL_MOD] = AMDXDNA_ERROR_MODULE_AIE_PL,
|
||||
[AIE_UNKNOWN_MOD] = AMDXDNA_ERROR_MODULE_UNKNOWN,
|
||||
};
|
||||
|
||||
static_assert(ARRAY_SIZE(aie_err_mod_map) == AIE_UNKNOWN_MOD + 1);
|
||||
|
||||
static enum aie_error_category
|
||||
aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
|
||||
{
|
||||
|
|
@ -176,12 +203,40 @@ aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type)
|
|||
if (event_id != lut[i].event_id)
|
||||
continue;
|
||||
|
||||
if (lut[i].category > AIE_ERROR_UNKNOWN)
|
||||
return AIE_ERROR_UNKNOWN;
|
||||
|
||||
return lut[i].category;
|
||||
}
|
||||
|
||||
return AIE_ERROR_UNKNOWN;
|
||||
}
|
||||
|
||||
/*
 * Record the most recent hardware error in ndev->last_async_err.
 *
 * @ndev:     device handle whose last_async_err record is overwritten
 * @err_info: array of struct aie_error entries
 * @num_err:  number of entries in @err_info
 *
 * Only the final entry of @err_info is recorded. Its category and module are
 * translated through aie_cat_err_num_map / aie_err_mod_map; an out-of-range
 * module type is reported as "unknown" for both error number and module.
 * The record also gets a timestamp from ktime_get_real(), in microseconds,
 * and the row/column of the entry.
 *
 * The caller visible in this file invokes this with xdna->dev_lock held.
 */
static void aie2_update_last_async_error(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
{
	struct aie_error *errs = err_info;
	enum amdxdna_error_module err_mod;
	enum aie_error_category aie_err;
	enum amdxdna_error_num err_num;
	struct aie_error *last_err;

	/*
	 * num_err is unsigned: with an empty report, num_err - 1 would wrap
	 * and index far outside the buffer. Keep the previous record instead.
	 */
	if (!errs || !num_err)
		return;

	last_err = &errs[num_err - 1];
	if (last_err->mod_type >= AIE_UNKNOWN_MOD) {
		/* Module type cannot be trusted, so do not try to look up a category. */
		err_num = aie_cat_err_num_map[AIE_ERROR_UNKNOWN];
		err_mod = aie_err_mod_map[AIE_UNKNOWN_MOD];
	} else {
		aie_err = aie_get_error_category(last_err->row,
						 last_err->event_id,
						 last_err->mod_type);
		err_num = aie_cat_err_num_map[aie_err];
		err_mod = aie_err_mod_map[last_err->mod_type];
	}

	ndev->last_async_err.err_code = AMDXDNA_ERROR_ENCODE(err_num, err_mod);
	ndev->last_async_err.ts_us = ktime_to_us(ktime_get_real());
	ndev->last_async_err.ex_err_code = AMDXDNA_EXTRA_ERR_ENCODE(last_err->row, last_err->col);
}
|
||||
|
||||
static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err)
|
||||
{
|
||||
struct aie_error *errs = err_info;
|
||||
|
|
@ -264,29 +319,14 @@ static void aie2_error_worker(struct work_struct *err_work)
|
|||
}
|
||||
|
||||
mutex_lock(&xdna->dev_lock);
|
||||
aie2_update_last_async_error(e->ndev, info->payload, info->err_cnt);
|
||||
|
||||
/* Re-send this event to firmware */
|
||||
if (aie2_error_event_send(e))
|
||||
XDNA_WARN(xdna, "Unable to register async event");
|
||||
mutex_unlock(&xdna->dev_lock);
|
||||
}
|
||||
|
||||
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
struct async_event *e;
|
||||
int i, ret;
|
||||
|
||||
drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
|
||||
for (i = 0; i < ndev->async_events->event_cnt; i++) {
|
||||
e = &ndev->async_events->event[i];
|
||||
ret = aie2_error_event_send(e);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
|
|
@ -341,6 +381,10 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
|
|||
e->size = ASYNC_BUF_SIZE;
|
||||
e->resp.status = MAX_AIE2_STATUS_CODE;
|
||||
INIT_WORK(&e->work, aie2_error_worker);
|
||||
|
||||
ret = aie2_error_event_send(e);
|
||||
if (ret)
|
||||
goto free_wq;
|
||||
}
|
||||
|
||||
ndev->async_events = events;
|
||||
|
|
@ -349,6 +393,8 @@ int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev)
|
|||
events->event_cnt, events->size);
|
||||
return 0;
|
||||
|
||||
free_wq:
|
||||
destroy_workqueue(events->wq);
|
||||
free_buf:
|
||||
dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf,
|
||||
events->addr, DMA_FROM_DEVICE);
|
||||
|
|
@ -356,3 +402,18 @@ free_events:
|
|||
kfree(events);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev, struct amdxdna_drm_get_array *args)
|
||||
{
|
||||
struct amdxdna_dev *xdna = ndev->xdna;
|
||||
|
||||
drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
|
||||
|
||||
args->num_element = 1;
|
||||
args->element_size = sizeof(ndev->last_async_err);
|
||||
if (copy_to_user(u64_to_user_ptr(args->buffer),
|
||||
&ndev->last_async_err, args->element_size))
|
||||
return -EFAULT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -924,6 +924,9 @@ static int aie2_get_array(struct amdxdna_client *client,
|
|||
case DRM_AMDXDNA_HW_CONTEXT_ALL:
|
||||
ret = aie2_query_ctx_status_array(client, args);
|
||||
break;
|
||||
case DRM_AMDXDNA_HW_LAST_ASYNC_ERR:
|
||||
ret = aie2_get_array_async_error(xdna->dev_handle, args);
|
||||
break;
|
||||
default:
|
||||
XDNA_ERR(xdna, "Not supported request parameter %u", args->param);
|
||||
ret = -EOPNOTSUPP;
|
||||
|
|
|
|||
|
|
@ -190,6 +190,8 @@ struct amdxdna_dev_hdl {
|
|||
|
||||
enum aie2_dev_status dev_status;
|
||||
u32 hwctx_num;
|
||||
|
||||
struct amdxdna_async_error last_async_err;
|
||||
};
|
||||
|
||||
#define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \
|
||||
|
|
@ -253,8 +255,9 @@ void aie2_psp_stop(struct psp_device *psp);
|
|||
/* aie2_error.c */
|
||||
int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev);
|
||||
void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev);
|
||||
int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev);
|
||||
int aie2_error_async_msg_thread(void *data);
|
||||
int aie2_get_array_async_error(struct amdxdna_dev_hdl *ndev,
|
||||
struct amdxdna_drm_get_array *args);
|
||||
|
||||
/* aie2_message.c */
|
||||
int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev);
|
||||
|
|
|
|||
59
drivers/accel/amdxdna/amdxdna_error.h
Normal file
59
drivers/accel/amdxdna/amdxdna_error.h
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright (C) 2025, Advanced Micro Devices, Inc.
|
||||
*/
|
||||
|
||||
#ifndef _AMDXDNA_ERROR_H_
|
||||
#define _AMDXDNA_ERROR_H_
|
||||
|
||||
#include <linux/bitfield.h>
|
||||
#include <linux/bits.h>
|
||||
|
||||
/* Fixed driver, severity and class values that AMDXDNA_ERROR_ENCODE() puts in every error code. */
#define AMDXDNA_ERR_DRV_AIE 4
|
||||
#define AMDXDNA_ERR_SEV_CRITICAL 3
|
||||
#define AMDXDNA_ERR_CLASS_AIE 2
|
||||
|
||||
/*
 * Bit layout of the 64-bit error code:
 *   15:0  error number, 23:16 driver, 31:24 severity,
 *   39:32 module,       47:40 class.
 */
#define AMDXDNA_ERR_NUM_MASK GENMASK_U64(15, 0)
|
||||
#define AMDXDNA_ERR_DRV_MASK GENMASK_U64(23, 16)
|
||||
#define AMDXDNA_ERR_SEV_MASK GENMASK_U64(31, 24)
|
||||
#define AMDXDNA_ERR_MOD_MASK GENMASK_U64(39, 32)
|
||||
#define AMDXDNA_ERR_CLASS_MASK GENMASK_U64(47, 40)
|
||||
|
||||
/*
 * Error numbers stored in the AMDXDNA_ERR_NUM_MASK field (bits 15:0) of the
 * encoded error code. Values are explicit where the numbering is not
 * contiguous: the AIE entries start at 3 and UNKNOWN is pinned to 15.
 */
enum amdxdna_error_num {
|
||||
AMDXDNA_ERROR_NUM_AIE_SATURATION = 3,
|
||||
AMDXDNA_ERROR_NUM_AIE_FP,
|
||||
AMDXDNA_ERROR_NUM_AIE_STREAM,
|
||||
AMDXDNA_ERROR_NUM_AIE_ACCESS,
|
||||
AMDXDNA_ERROR_NUM_AIE_BUS,
|
||||
AMDXDNA_ERROR_NUM_AIE_INSTRUCTION,
|
||||
AMDXDNA_ERROR_NUM_AIE_ECC,
|
||||
AMDXDNA_ERROR_NUM_AIE_LOCK,
|
||||
AMDXDNA_ERROR_NUM_AIE_DMA,
|
||||
AMDXDNA_ERROR_NUM_AIE_MEM_PARITY,
|
||||
AMDXDNA_ERROR_NUM_UNKNOWN = 15,
|
||||
};
|
||||
|
||||
/*
 * Module identifiers stored in the AMDXDNA_ERR_MOD_MASK field (bits 39:32)
 * of the encoded error code. The AIE entries start at 3 and UNKNOWN is
 * pinned to 8.
 */
enum amdxdna_error_module {
|
||||
AMDXDNA_ERROR_MODULE_AIE_CORE = 3,
|
||||
AMDXDNA_ERROR_MODULE_AIE_MEMORY,
|
||||
AMDXDNA_ERROR_MODULE_AIE_SHIM,
|
||||
AMDXDNA_ERROR_MODULE_AIE_NOC,
|
||||
AMDXDNA_ERROR_MODULE_AIE_PL,
|
||||
AMDXDNA_ERROR_MODULE_UNKNOWN = 8,
|
||||
};
|
||||
|
||||
/*
 * Build a 64-bit error code from an error number and a module identifier.
 * The driver, severity and class fields are always set to
 * AMDXDNA_ERR_DRV_AIE, AMDXDNA_ERR_SEV_CRITICAL and AMDXDNA_ERR_CLASS_AIE.
 */
#define AMDXDNA_ERROR_ENCODE(err_num, err_mod) \
|
||||
(FIELD_PREP(AMDXDNA_ERR_NUM_MASK, err_num) | \
|
||||
FIELD_PREP_CONST(AMDXDNA_ERR_DRV_MASK, AMDXDNA_ERR_DRV_AIE) | \
|
||||
FIELD_PREP_CONST(AMDXDNA_ERR_SEV_MASK, AMDXDNA_ERR_SEV_CRITICAL) | \
|
||||
FIELD_PREP(AMDXDNA_ERR_MOD_MASK, err_mod) | \
|
||||
FIELD_PREP_CONST(AMDXDNA_ERR_CLASS_MASK, AMDXDNA_ERR_CLASS_AIE))
|
||||
|
||||
/* Layout of the extra error code: column in bits 7:0, row in bits 15:8. */
#define AMDXDNA_EXTRA_ERR_COL_MASK GENMASK_U64(7, 0)
|
||||
#define AMDXDNA_EXTRA_ERR_ROW_MASK GENMASK_U64(15, 8)
|
||||
|
||||
/* Pack a row and a column into an extra error code. Note the argument order: row first, then col. */
#define AMDXDNA_EXTRA_ERR_ENCODE(row, col) \
|
||||
(FIELD_PREP(AMDXDNA_EXTRA_ERR_COL_MASK, col) | \
|
||||
FIELD_PREP(AMDXDNA_EXTRA_ERR_ROW_MASK, row))
|
||||
|
||||
#endif /* _AMDXDNA_ERROR_H_ */
|
||||
|
|
@ -27,9 +27,10 @@ MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
|
|||
/*
|
||||
* 0.0: Initial version
|
||||
* 0.1: Support getting all hardware contexts by DRM_IOCTL_AMDXDNA_GET_ARRAY
|
||||
* 0.2: Support getting last hardware error
|
||||
*/
|
||||
#define AMDXDNA_DRIVER_MAJOR 0
|
||||
#define AMDXDNA_DRIVER_MINOR 1
|
||||
#define AMDXDNA_DRIVER_MINOR 2
|
||||
|
||||
/*
|
||||
* Bind the driver base on (vendor_id, device_id) pair and later use the
|
||||
|
|
|
|||
|
|
@ -523,7 +523,20 @@ struct amdxdna_drm_hwctx_entry {
|
|||
__u32 pad;
|
||||
};
|
||||
|
||||
/**
 * struct amdxdna_async_error - XDNA async error structure
 *
 * Describes the last asynchronous hardware error recorded by the driver.
 * Returned by the get-array IOCTL for DRM_AMDXDNA_HW_LAST_ASYNC_ERR.
 */
|
||||
struct amdxdna_async_error {
|
||||
/**
 * @err_code: Encoded error code. Bits 15:0 hold the error number,
 * 23:16 the driver, 31:24 the severity, 39:32 the module and
 * 47:40 the class.
 */
|
||||
__u64 err_code;
|
||||
/** @ts_us: Wall-clock timestamp of when the error was recorded, in microseconds. */
|
||||
__u64 ts_us;
|
||||
/** @ex_err_code: Extra error code: column in bits 7:0, row in bits 15:8. */
|
||||
__u64 ex_err_code;
|
||||
};
|
||||
|
||||
#define DRM_AMDXDNA_HW_CONTEXT_ALL 0
|
||||
#define DRM_AMDXDNA_HW_LAST_ASYNC_ERR 2
|
||||
|
||||
/**
|
||||
* struct amdxdna_drm_get_array - Get information array.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue