drm/amdgpu: Reduce dequeue retry timeout for gfx9 family

Dequeue retry timeout controls the interval between checks for unmet
conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The
cost of additional bandwidth consumed by CP when polling memory
shouldn't be substantial.

Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Reviewed-by: Jonathan Kim <jonathan.kim@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Harish Kasiviswanathan 2025-02-25 15:50:30 -05:00 committed by Alex Deucher
parent 02fc2f3c46
commit 8a7820c072
10 changed files with 72 additions and 52 deletions

View file

@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = {
.set_address_watch = kgd_gfx_aldebaran_set_address_watch,
.clear_address_watch = kgd_gfx_v9_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info = kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v9_hqd_reset,

View file

@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
.set_address_watch = kgd_gfx_v9_set_address_watch,
.clear_address_watch = kgd_gfx_v9_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info = kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,

View file

@ -541,8 +541,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings =
kgd_gfx_v9_program_trap_handler_settings,
.build_grace_period_packet_info =
kgd_gfx_v9_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info =
kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.enable_debug_trap = kgd_aldebaran_enable_debug_trap,
.disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap,

View file

@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
*wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2));
}
void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device *adev,
uint32_t wait_times,
uint32_t grace_period,
uint32_t sch_wave,
uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data)
{
*reg_data = wait_times;
/*
* The CP cannont handle a 0 grace period input and will result in
* an infinite grace period being set so set to 1 to prevent this.
*/
if (grace_period == 0)
grace_period = 1;
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
SCH_WAVE,
grace_period);
if (sch_wave)
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
SCH_WAVE,
sch_wave);
if (que_sleep)
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
QUE_SLEEP,
que_sleep);
*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
}
@ -1115,7 +1115,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = {
.set_address_watch = kgd_gfx_v10_set_address_watch,
.clear_address_watch = kgd_gfx_v10_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info = kgd_gfx_v10_build_dequeue_wait_counts_packet_info,
.program_trap_handler_settings = program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr,
.hqd_reset = kgd_gfx_v10_hqd_reset,

View file

@ -51,9 +51,10 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,
void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev,
uint32_t *wait_times,
uint32_t inst);
void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev,
void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device *adev,
uint32_t wait_times,
uint32_t grace_period,
uint32_t sch_wave,
uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data);
uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev,

View file

@ -673,7 +673,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = {
.set_vm_context_page_table_base = set_vm_context_page_table_base_v10_3,
.program_trap_handler_settings = program_trap_handler_settings_v10_3,
.get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info = kgd_gfx_v10_build_dequeue_wait_counts_packet_info,
.enable_debug_trap = kgd_gfx_v10_enable_debug_trap,
.disable_debug_trap = kgd_gfx_v10_disable_debug_trap,
.validate_trap_override_request = kgd_gfx_v10_validate_trap_override_request,

View file

@ -1077,25 +1077,25 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
adev->gfx.cu_info.max_waves_per_simd;
}
void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
void kgd_gfx_v9_build_dequeue_wait_counts_packet_info(struct amdgpu_device *adev,
uint32_t wait_times,
uint32_t grace_period,
uint32_t sch_wave,
uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data)
{
*reg_data = wait_times;
/*
* The CP cannot handle a 0 grace period input and will result in
* an infinite grace period being set so set to 1 to prevent this.
*/
if (grace_period == 0)
grace_period = 1;
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
SCH_WAVE,
grace_period);
if (sch_wave)
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
SCH_WAVE,
sch_wave);
if (que_sleep)
*reg_data = REG_SET_FIELD(*reg_data,
CP_IQ_WAIT_TIME2,
QUE_SLEEP,
que_sleep);
*reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2);
}
@ -1255,7 +1255,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = {
.set_address_watch = kgd_gfx_v9_set_address_watch,
.clear_address_watch = kgd_gfx_v9_clear_address_watch,
.get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times,
.build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info,
.build_dequeue_wait_counts_packet_info = kgd_gfx_v9_build_dequeue_wait_counts_packet_info,
.get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy,
.program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings,
.hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr,

View file

@ -97,9 +97,10 @@ uint32_t kgd_gfx_v9_clear_address_watch(struct amdgpu_device *adev,
void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev,
uint32_t *wait_times,
uint32_t inst);
void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev,
void kgd_gfx_v9_build_dequeue_wait_counts_packet_info(struct amdgpu_device *adev,
uint32_t wait_times,
uint32_t grace_period,
uint32_t sch_wave,
uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data);
uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,

View file

@ -298,13 +298,14 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer,
}
static inline void pm_build_dequeue_wait_counts_packet_info(struct packet_manager *pm,
uint32_t sch_value, uint32_t *reg_offset,
uint32_t sch_value, uint32_t que_sleep, uint32_t *reg_offset,
uint32_t *reg_data)
{
pm->dqm->dev->kfd2kgd->build_grace_period_packet_info(
pm->dqm->dev->kfd2kgd->build_dequeue_wait_counts_packet_info(
pm->dqm->dev->adev,
pm->dqm->wait_times,
sch_value,
que_sleep,
reg_offset,
reg_data);
}
@ -319,27 +320,43 @@ static int pm_config_dequeue_wait_counts_v9(struct packet_manager *pm,
uint32_t reg_data = 0;
switch (cmd) {
case KFD_DEQUEUE_WAIT_INIT:
/* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */
if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev->gmc.is_app_apu &&
(KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3)))
pm_build_dequeue_wait_counts_packet_info(pm, 1, &reg_offset, &reg_data);
else
case KFD_DEQUEUE_WAIT_INIT: {
uint32_t sch_wave = 0, que_sleep = 0;
/* Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default 0x40.
* On a 1GHz machine this is roughly 1 microsecond, which is
* about how long it takes to load data out of memory during
* queue connect
* QUE_SLEEP: Wait Count for Dequeue Retry.
*/
if (KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(9, 4, 1) &&
KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(10, 0, 0)) {
que_sleep = 1;
/* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */
if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev->gmc.is_app_apu &&
(KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3)))
sch_wave = 1;
} else {
return 0;
}
pm_build_dequeue_wait_counts_packet_info(pm, sch_wave, que_sleep,
&reg_offset, &reg_data);
break;
}
case KFD_DEQUEUE_WAIT_RESET:
/* function called only to get reg_offset */
pm_build_dequeue_wait_counts_packet_info(pm, 0, &reg_offset, &reg_data);
reg_data = pm->dqm->wait_times;
/* reg_data would be set to dqm->wait_times */
pm_build_dequeue_wait_counts_packet_info(pm, 0, 0, &reg_offset, &reg_data);
break;
case KFD_DEQUEUE_WAIT_SET_SCH_WAVE:
/* The CP cannot handle value 0 and it will result in
* an infinite grace period being set so set to 1 to prevent this.
* an infinite grace period being set so set to 1 to prevent this. Also
* avoid debugger API breakage as it sets 0 and expects a low value.
*/
if (!value)
value = 1;
pm_build_dequeue_wait_counts_packet_info(pm, value, &reg_offset, &reg_data);
pm_build_dequeue_wait_counts_packet_info(pm, value, 0, &reg_offset, &reg_data);
break;
default:
pr_err("Invalid dequeue wait cmd\n");

View file

@ -313,9 +313,10 @@ struct kfd2kgd_calls {
void (*get_iq_wait_times)(struct amdgpu_device *adev,
uint32_t *wait_times,
uint32_t inst);
void (*build_grace_period_packet_info)(struct amdgpu_device *adev,
void (*build_dequeue_wait_counts_packet_info)(struct amdgpu_device *adev,
uint32_t wait_times,
uint32_t grace_period,
uint32_t sch_wave,
uint32_t que_sleep,
uint32_t *reg_offset,
uint32_t *reg_data);
void (*get_cu_occupancy)(struct amdgpu_device *adev,