drm/amdgpu: add a helper for processing recoverable GPUVM faults

Add a common helper to remove the repeated logic from each
gmc module.

Suggested-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Author: Alex Deucher <alexander.deucher@amd.com>
Date:   2025-12-01 14:46:53 -05:00
parent a50d32c41f
commit d3ff65243a

6 files changed, 69 insertions(+), 97 deletions(-)
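
With the helper in place, each per-ASIC interrupt handler shrinks to a single call. A minimal sketch of the resulting call pattern, mirroring the gmc_v10_0 hunk below (surrounding handler code omitted; the gmc v9 caller passes the real cam_index and node_id instead of zeros):

	if (retry_fault) {
		/* Returns 1 once the fault has been filtered, delegated to
		 * another ring or recovered by filling page tables; returning
		 * 1 from the handler also prevents sending the IV to the KFD.
		 */
		int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr,
							0 /* cam_index */,
							0 /* node_id */,
							write_fault);

		if (ret == 1)
			return 1;
	}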

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -524,6 +524,54 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 	} while (fault->timestamp < tmp);
 }
 
+int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
+				  struct amdgpu_iv_entry *entry,
+				  u64 addr,
+				  u32 cam_index,
+				  u32 node_id,
+				  bool write_fault)
+{
+	int ret;
+
+	if (adev->irq.retry_cam_enabled) {
+		/* Delegate it to a different ring if the hardware hasn't
+		 * already done it.
+		 */
+		if (entry->ih == &adev->irq.ih) {
+			amdgpu_irq_delegate(adev, entry, 8);
+			return 1;
+		}
+
+		ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
+					     addr, entry->timestamp, write_fault);
+		WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
+		if (ret)
+			return 1;
+	} else {
+		/* Process it only if it's the first fault for this address */
+		if (entry->ih != &adev->irq.ih_soft &&
+		    amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
+					     entry->timestamp))
+			return 1;
+
+		/* Delegate it to a different ring if the hardware hasn't
+		 * already done it.
+		 */
+		if (entry->ih == &adev->irq.ih) {
+			amdgpu_irq_delegate(adev, entry, 8);
+			return 1;
+		}
+
+		/* Try to handle the recoverable page faults by filling page
+		 * tables
+		 */
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
+					   addr, entry->timestamp, write_fault))
+			return 1;
+	}
+
+	return 0;
+}
+
 int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev)
 {
 	int r;

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -425,6 +425,12 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
 			      uint16_t pasid, uint64_t timestamp);
 void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
 				     uint16_t pasid);
+int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
+				  struct amdgpu_iv_entry *entry,
+				  u64 addr,
+				  u32 cam_index,
+				  u32 node_id,
+				  bool write_fault);
 int amdgpu_gmc_ras_sw_init(struct amdgpu_device *adev);
 int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
 void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);

--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -115,27 +115,10 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
 		addr |= ((u64)entry->src_data[1] & 0xf) << 44;
 
 	if (retry_fault) {
-		/* Process it only if it's the first fault for this address */
-		if (entry->ih != &adev->irq.ih_soft &&
-		    amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-					     entry->timestamp))
-			return 1;
-
-		/* Delegate it to a different ring if the hardware hasn't
-		 * already done it.
-		 */
-		if (entry->ih == &adev->irq.ih) {
-			amdgpu_irq_delegate(adev, entry, 8);
-			return 1;
-		}
-
-		/* Try to handle the recoverable page faults by filling page
-		 * tables
-		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-					   entry->timestamp, write_fault))
+		int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+							write_fault);
+
+		/* Returning 1 here also prevents sending the IV to the KFD */
+		if (ret == 1)
 			return 1;
 	}

--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -114,27 +114,10 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
 		addr |= ((u64)entry->src_data[1] & 0xf) << 44;
 
 	if (retry_fault) {
-		/* Process it only if it's the first fault for this address */
-		if (entry->ih != &adev->irq.ih_soft &&
-		    amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-					     entry->timestamp))
-			return 1;
-
-		/* Delegate it to a different ring if the hardware hasn't
-		 * already done it.
-		 */
-		if (entry->ih == &adev->irq.ih) {
-			amdgpu_irq_delegate(adev, entry, 8);
-			return 1;
-		}
-
-		/* Try to handle the recoverable page faults by filling page
-		 * tables
-		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-					   entry->timestamp, write_fault))
+		int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+							write_fault);
+
+		/* Returning 1 here also prevents sending the IV to the KFD */
+		if (ret == 1)
 			return 1;
 	}

--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -110,27 +110,10 @@ static int gmc_v12_0_process_interrupt(struct amdgpu_device *adev,
 	hub = &adev->vmhub[AMDGPU_GFXHUB(0)];
 
 	if (retry_fault) {
-		/* Process it only if it's the first fault for this address */
-		if (entry->ih != &adev->irq.ih_soft &&
-		    amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-					     entry->timestamp))
-			return 1;
-
-		/* Delegate it to a different ring if the hardware hasn't
-		 * already done it.
-		 */
-		if (entry->ih == &adev->irq.ih) {
-			amdgpu_irq_delegate(adev, entry, 8);
-			return 1;
-		}
-
-		/* Try to handle the recoverable page faults by filling page
-		 * tables
-		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
-					   entry->timestamp, write_fault))
+		int ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, 0, 0,
+							write_fault);
+
+		/* Returning 1 here also prevents sending the IV to the KFD */
+		if (ret == 1)
 			return 1;
 	}

--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -583,44 +583,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 	hub = &adev->vmhub[vmhub];
 
 	if (retry_fault) {
-		if (adev->irq.retry_cam_enabled) {
-			/* Delegate it to a different ring if the hardware hasn't
-			 * already done it.
-			 */
-			if (entry->ih == &adev->irq.ih) {
-				amdgpu_irq_delegate(adev, entry, 8);
-				return 1;
-			}
-
-			cam_index = entry->src_data[2] & 0x3ff;
+		cam_index = entry->src_data[2] & 0x3ff;
 
-			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-						     addr, entry->timestamp, write_fault);
-			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
-			if (ret)
-				return 1;
-		} else {
-			/* Process it only if it's the first fault for this address */
-			if (entry->ih != &adev->irq.ih_soft &&
-			    amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
-						     entry->timestamp))
-				return 1;
-
-			/* Delegate it to a different ring if the hardware hasn't
-			 * already done it.
-			 */
-			if (entry->ih == &adev->irq.ih) {
-				amdgpu_irq_delegate(adev, entry, 8);
-				return 1;
-			}
-
-			/* Try to handle the recoverable page faults by filling page
-			 * tables
-			 */
-			if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-						   addr, entry->timestamp, write_fault))
-				return 1;
-		}
+		ret = amdgpu_gmc_handle_retry_fault(adev, entry, addr, cam_index, node_id,
+						    write_fault);
+		/* Returning 1 here also prevents sending the IV to the KFD */
+		if (ret == 1)
+			return 1;
 	}
 
 	if (kgd2kfd_vmfault_fast_path(adev, entry, retry_fault))