diff --git a/drivers/gpu/drm/xe/regs/xe_engine_regs.h b/drivers/gpu/drm/xe/regs/xe_engine_regs.h index 68172b0248a6..dc5a4fafa70c 100644 --- a/drivers/gpu/drm/xe/regs/xe_engine_regs.h +++ b/drivers/gpu/drm/xe/regs/xe_engine_regs.h @@ -96,6 +96,12 @@ #define ENABLE_SEMAPHORE_POLL_BIT REG_BIT(13) #define RING_CMD_CCTL(base) XE_REG((base) + 0xc4, XE_REG_OPTION_MASKED) + +#define CS_MMIO_GROUP_INSTANCE_SELECT(base) XE_REG((base) + 0xcc) +#define SELECTIVE_READ_ADDRESSING REG_BIT(30) +#define SELECTIVE_READ_GROUP REG_GENMASK(29, 23) +#define SELECTIVE_READ_INSTANCE REG_GENMASK(22, 16) + /* * CMD_CCTL read/write fields take a MOCS value and _not_ a table index. * The lsb of each can be considered a separate enabling bit for encryption. diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 9d090d0f2438..df6d04704823 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -210,11 +210,15 @@ static int emit_nop_job(struct xe_gt *gt, struct xe_exec_queue *q) return ret; } +/* Dwords required to emit a RMW of a register */ +#define EMIT_RMW_DW 20 + static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) { - struct xe_reg_sr *sr = &q->hwe->reg_lrc; + struct xe_hw_engine *hwe = q->hwe; + struct xe_reg_sr *sr = &hwe->reg_lrc; struct xe_reg_sr_entry *entry; - int count_rmw = 0, count = 0, ret; + int count_rmw = 0, count_rmw_mcr = 0, count = 0, ret; unsigned long idx; struct xe_bb *bb; size_t bb_len = 0; @@ -224,6 +228,8 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) xa_for_each(&sr->xa, idx, entry) { if (entry->reg.masked || entry->clr_bits == ~0) ++count; + else if (entry->reg.mcr) + ++count_rmw_mcr; else ++count_rmw; } @@ -231,17 +237,35 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) if (count) bb_len += count * 2 + 1; - if (count_rmw) - bb_len += count_rmw * 20 + 7; + /* + * RMW of MCR registers is the same as a normal RMW, except an + * additional LRI (3 dwords) is required per register to steer the read + * to a nom-terminated instance. + * + * We could probably shorten the batch slightly by eliding the + * steering for consecutive MCR registers that have the same + * group/instance target, but it's not worth the extra complexity to do + * so. + */ + bb_len += count_rmw * EMIT_RMW_DW; + bb_len += count_rmw_mcr * (EMIT_RMW_DW + 3); - if (q->hwe->class == XE_ENGINE_CLASS_RENDER) + /* + * After doing all RMW, we need 7 trailing dwords to clean up, + * plus an additional 3 dwords to reset steering if any of the + * registers were MCR. + */ + if (count_rmw || count_rmw_mcr) + bb_len += 7 + (count_rmw_mcr ? 3 : 0); + + if (hwe->class == XE_ENGINE_CLASS_RENDER) /* * Big enough to emit all of the context's 3DSTATE via * xe_lrc_emit_hwe_state_instructions() */ - bb_len += xe_gt_lrc_size(gt, q->hwe->class) / sizeof(u32); + bb_len += xe_gt_lrc_size(gt, hwe->class) / sizeof(u32); - xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", q->hwe->name, bb_len); + xe_gt_dbg(gt, "LRC %s WA job: %zu dwords\n", hwe->name, bb_len); bb = xe_bb_new(gt, bb_len, false); if (IS_ERR(bb)) @@ -276,13 +300,23 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) } } - if (count_rmw) { - /* Emit MI_MATH for each RMW reg: 20dw per reg + 7 trailing dw */ - + if (count_rmw || count_rmw_mcr) { xa_for_each(&sr->xa, idx, entry) { if (entry->reg.masked || entry->clr_bits == ~0) continue; + if (entry->reg.mcr) { + struct xe_reg_mcr reg = { .__reg.raw = entry->reg.raw }; + u8 group, instance; + + xe_gt_mcr_get_nonterminated_steering(gt, reg, &group, &instance); + *cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); + *cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(hwe->mmio_base).addr; + *cs++ = SELECTIVE_READ_ADDRESSING | + REG_FIELD_PREP(SELECTIVE_READ_GROUP, group) | + REG_FIELD_PREP(SELECTIVE_READ_INSTANCE, instance); + } + *cs++ = MI_LOAD_REGISTER_REG | MI_LRR_DST_CS_MMIO; *cs++ = entry->reg.addr; *cs++ = CS_GPR_REG(0, 0).addr; @@ -308,8 +342,9 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) *cs++ = CS_GPR_REG(0, 0).addr; *cs++ = entry->reg.addr; - xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x\n", - entry->reg.addr, entry->clr_bits, entry->set_bits); + xe_gt_dbg(gt, "REG[%#x] = ~%#x|%#x%s\n", + entry->reg.addr, entry->clr_bits, entry->set_bits, + entry->reg.mcr ? " (MCR)" : ""); } /* reset used GPR */ @@ -321,6 +356,13 @@ static int emit_wa_job(struct xe_gt *gt, struct xe_exec_queue *q) *cs++ = 0; *cs++ = CS_GPR_REG(0, 2).addr; *cs++ = 0; + + /* reset steering */ + if (count_rmw_mcr) { + *cs++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1); + *cs++ = CS_MMIO_GROUP_INSTANCE_SELECT(q->hwe->mmio_base).addr; + *cs++ = 0; + } } cs = xe_lrc_emit_hwe_state_instructions(q, cs);