drm/amdgpu: Send applicable RMA CPERs at end of RAS init

Firmware and monitoring tools may not be ready to receive a CPER when we
read the bad pages, so send the CPERs at the end of RAS initialization
to ensure that the FW is ready to receive and process the CPER. This
removes the previous CPER submission that was added during bad page
load, and sends both the in-band and out-of-band CPERs at the same time.

Signed-off-by: Kent Russell <kent.russell@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Kent Russell 2026-02-03 09:48:23 -05:00 committed by Alex Deucher
parent f7afda7fcd
commit 5028a24aa8
3 changed files with 27 additions and 4 deletions

View file

@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
}
amdgpu_ras_check_bad_page_status(adev);
return 0;
}

View file

@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
control->ras_num_bad_pages,
ras->bad_page_cnt_threshold);
if (amdgpu_bad_page_threshold != 0 &&
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
amdgpu_dpm_send_rma_reason(adev);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
if (hdr->version >= RAS_TABLE_VER_V2_1) {
@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
result);
return -EOPNOTSUPP;
}
/**
 * amdgpu_ras_check_bad_page_status - send RMA CPERs once the bad page
 * threshold has been reached
 * @adev: amdgpu device to check
 *
 * Does nothing when the device has no RAS context or when
 * amdgpu_bad_page_threshold is 0. Otherwise, if the recorded number of bad
 * pages has reached ras->bad_page_cnt_threshold, the out-of-band RMA CPER is
 * requested through amdgpu_dpm_send_rma_reason() and, when CPER is enabled
 * and uniras is not, the in-band record is generated through
 * amdgpu_cper_generate_bp_threshold_record().
 *
 * A non-zero return from either helper is treated as a failure; failures are
 * only logged and are not propagated to the caller.
 */
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct amdgpu_ras_eeprom_control *control;

	/* No RAS context, or threshold handling is switched off. */
	if (!ras || amdgpu_bad_page_threshold == 0)
		return;

	control = &ras->eeprom_control;

	/* Still below the threshold: nothing to report. */
	if (control->ras_num_bad_pages < ras->bad_page_cnt_threshold)
		return;

	/* Out-of-band notification. */
	if (!amdgpu_dpm_send_rma_reason(adev))
		dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
	else
		dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");

	/* In-band record, only when CPER is enabled and uniras is not. */
	if (!adev->cper.enabled || amdgpu_uniras_enabled(adev))
		return;

	if (!amdgpu_cper_generate_bp_threshold_record(adev))
		dev_dbg(adev->dev, "Sent in-band RMA CPER");
	else
		dev_warn(adev->dev, "Unable to send in-band RMA CPER");
}

View file

@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
/* Send the RMA CPERs if the bad page count has reached the threshold. */
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;