mirror of
https://github.com/torvalds/linux.git
synced 2026-03-08 03:24:45 +01:00
drm/amdgpu: Send applicable RMA CPERs at end of RAS init
Firmware and monitoring tools may not be ready to receive a CPER when we read the bad pages, so send the CPERs at the end of RAS initialization to ensure that the FW is ready to receive and process the CPER. This removes the previous CPER submission that was added during bad page load, and sends both in-band and out-of-band at the same time. Signed-off-by: Kent Russell <kent.russell@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
f7afda7fcd
commit
5028a24aa8
3 changed files with 27 additions and 4 deletions
|
|
@ -4650,6 +4650,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
|
|||
amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
|
||||
}
|
||||
|
||||
amdgpu_ras_check_bad_page_status(adev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1712,10 +1712,6 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
|
|||
dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
|
||||
control->ras_num_bad_pages,
|
||||
ras->bad_page_cnt_threshold);
|
||||
if (amdgpu_bad_page_threshold != 0 &&
|
||||
control->ras_num_bad_pages >= ras->bad_page_cnt_threshold)
|
||||
amdgpu_dpm_send_rma_reason(adev);
|
||||
|
||||
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
|
||||
amdgpu_bad_page_threshold != 0) {
|
||||
if (hdr->version >= RAS_TABLE_VER_V2_1) {
|
||||
|
|
@ -1932,3 +1928,26 @@ int amdgpu_ras_smu_erase_ras_table(struct amdgpu_device *adev,
|
|||
result);
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
|
||||
|
||||
if (!control || amdgpu_bad_page_threshold == 0)
|
||||
return;
|
||||
|
||||
if (control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
|
||||
if (amdgpu_dpm_send_rma_reason(adev))
|
||||
dev_warn(adev->dev, "Unable to send out-of-band RMA CPER");
|
||||
else
|
||||
dev_dbg(adev->dev, "Sent out-of-band RMA CPER");
|
||||
|
||||
if (adev->cper.enabled && !amdgpu_uniras_enabled(adev)) {
|
||||
if (amdgpu_cper_generate_bp_threshold_record(adev))
|
||||
dev_warn(adev->dev, "Unable to send in-band RMA CPER");
|
||||
else
|
||||
dev_dbg(adev->dev, "Sent in-band RMA CPER");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -193,6 +193,8 @@ int amdgpu_ras_eeprom_read_idx(struct amdgpu_ras_eeprom_control *control,
|
|||
|
||||
int amdgpu_ras_eeprom_update_record_num(struct amdgpu_ras_eeprom_control *control);
|
||||
|
||||
void amdgpu_ras_check_bad_page_status(struct amdgpu_device *adev);
|
||||
|
||||
extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
|
||||
extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue