Cross-subsystem Changes:

- mm: Fix a hmm_range_fault() livelock / starvation problem (Thomas)
 
 Core Changes:
  - Revert "drm/pagemap: Disable device-to-device migration" (Thomas)
 
 Driver Changes:
  - Do not preempt fence signaling CS instructions (Brost)
  - Some leak and finalization fixes (Shuicheng, Tomasz, Varun, Zhanjun)
  - Workaround fix (Roper)
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCgAdFiEEbSBwaO7dZQkcLOKj+mJfZA7rE8oFAmmphjMACgkQ+mJfZA7r
 E8oUMAf+KUZhgCyJUDsg0QY4xC6LCUlqY179QOsHwiFxJfH0cM24c9QIQAW6HQqu
 AWtcyUjC6a9F1y2CO08oV6E6+/6xOV3yEjv4aEseAO161mz+BGoRaP7FZjZbG2NF
 SPRWMLWFcAOzksvsTgaNSDPD/MlIYm+VxM8eW/ynwLzFucKmImt9VMKUNLkGM9uk
 KcCrjWYgviWrHqxly9G7xddeVMQI/HhR3ggxy7Z7t3buUc8cOmU4BucM8tZ/3pf2
 BceZap2RFIl/FXjyxnOaFjOv3fs8hRJMQBDlq1HX47cmzPJhgNAJyWMQWz3zEPTJ
 qrpU4Vg3I3Itk32o3N3E6rz0+oOjWw==
 =TOo+
 -----END PGP SIGNATURE-----

Merge tag 'drm-xe-fixes-2026-03-05' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes

Cross-subsystem Changes:
 - mm: Fix a hmm_range_fault() livelock / starvation problem (Thomas)

Core Changes:
 - Revert "drm/pagemap: Disable device-to-device migration" (Thomas)

Driver Changes:
 - Do not preempt fence signaling CS instructions (Brost)
 - Some leak and finalization fixes (Shuicheng, Tomasz, Varun, Zhanjun)
 - Workaround fix (Roper)

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patch.msgid.link/aamGvvGRBRtX8-6u@intel.com
This commit is contained in:
Dave Airlie 2026-03-06 19:45:11 +10:00
commit 96bfe9ff7e
15 changed files with 100 additions and 53 deletions

View file

@ -480,18 +480,8 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
.start = start,
.end = end,
.pgmap_owner = pagemap->owner,
/*
* FIXME: MIGRATE_VMA_SELECT_DEVICE_PRIVATE intermittently
* causes 'xe_exec_system_allocator --r *race*no*' to trigger an
* engine reset and a hard hang due to getting stuck on a folio
* lock. This should work and needs to be root-caused. The only
* downside of not selecting MIGRATE_VMA_SELECT_DEVICE_PRIVATE
* is that device-to-device migrations won't work; instead,
* memory will bounce through system memory. This path should be
* rare and only occur when the madvise attributes of memory are
* changed or atomics are being used.
*/
.flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT,
.flags = MIGRATE_VMA_SELECT_SYSTEM | MIGRATE_VMA_SELECT_DEVICE_COHERENT |
MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
};
unsigned long i, npages = npages_in_range(start, end);
unsigned long own_pages = 0, migrated_pages = 0;

View file

@ -830,6 +830,7 @@ static void xe_config_device_release(struct config_item *item)
mutex_destroy(&dev->lock);
kfree(dev->config.ctx_restore_mid_bb[0].cs);
kfree(dev->config.ctx_restore_post_bb[0].cs);
kfree(dev);
}

View file

@ -266,6 +266,16 @@ static struct xe_exec_queue *__xe_exec_queue_alloc(struct xe_device *xe,
return q;
}
static void __xe_exec_queue_fini(struct xe_exec_queue *q)
{
int i;
q->ops->fini(q);
for (i = 0; i < q->width; ++i)
xe_lrc_put(q->lrc[i]);
}
static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags)
{
int i, err;
@ -320,21 +330,10 @@ static int __xe_exec_queue_init(struct xe_exec_queue *q, u32 exec_queue_flags)
return 0;
err_lrc:
for (i = i - 1; i >= 0; --i)
xe_lrc_put(q->lrc[i]);
__xe_exec_queue_fini(q);
return err;
}
static void __xe_exec_queue_fini(struct xe_exec_queue *q)
{
int i;
q->ops->fini(q);
for (i = 0; i < q->width; ++i)
xe_lrc_put(q->lrc[i]);
}
struct xe_exec_queue *xe_exec_queue_create(struct xe_device *xe, struct xe_vm *vm,
u32 logical_mask, u16 width,
struct xe_hw_engine *hwe, u32 flags,

View file

@ -435,15 +435,11 @@ static int proxy_channel_alloc(struct xe_gsc *gsc)
return 0;
}
static void xe_gsc_proxy_remove(void *arg)
static void xe_gsc_proxy_stop(struct xe_gsc *gsc)
{
struct xe_gsc *gsc = arg;
struct xe_gt *gt = gsc_to_gt(gsc);
struct xe_device *xe = gt_to_xe(gt);
if (!gsc->proxy.component_added)
return;
/* disable HECI2 IRQs */
scoped_guard(xe_pm_runtime, xe) {
CLASS(xe_force_wake, fw_ref)(gt_to_fw(gt), XE_FW_GSC);
@ -455,6 +451,30 @@ static void xe_gsc_proxy_remove(void *arg)
}
xe_gsc_wait_for_worker_completion(gsc);
gsc->proxy.started = false;
}
static void xe_gsc_proxy_remove(void *arg)
{
struct xe_gsc *gsc = arg;
struct xe_gt *gt = gsc_to_gt(gsc);
struct xe_device *xe = gt_to_xe(gt);
if (!gsc->proxy.component_added)
return;
/*
* GSC proxy start is an async process that can be ongoing during
* Xe module load/unload. Using devm managed action to register
* xe_gsc_proxy_stop could cause issues if Xe module unload has
* already started when the action is registered, potentially leading
* to the cleanup being called at the wrong time. Therefore, instead
* of registering a separate devm action to undo what is done in
* proxy start, we call it from here, but only if the start has
* completed successfully (tracked with the 'started' flag).
*/
if (gsc->proxy.started)
xe_gsc_proxy_stop(gsc);
component_del(xe->drm.dev, &xe_gsc_proxy_component_ops);
gsc->proxy.component_added = false;
@ -510,6 +530,7 @@ int xe_gsc_proxy_init(struct xe_gsc *gsc)
*/
int xe_gsc_proxy_start(struct xe_gsc *gsc)
{
struct xe_gt *gt = gsc_to_gt(gsc);
int err;
/* enable the proxy interrupt in the GSC shim layer */
@ -521,12 +542,18 @@ int xe_gsc_proxy_start(struct xe_gsc *gsc)
*/
err = xe_gsc_proxy_request_handler(gsc);
if (err)
return err;
goto err_irq_disable;
if (!xe_gsc_proxy_init_done(gsc)) {
xe_gt_err(gsc_to_gt(gsc), "GSC FW reports proxy init not completed\n");
return -EIO;
xe_gt_err(gt, "GSC FW reports proxy init not completed\n");
err = -EIO;
goto err_irq_disable;
}
gsc->proxy.started = true;
return 0;
err_irq_disable:
gsc_proxy_irq_toggle(gsc, false);
return err;
}

View file

@ -58,6 +58,8 @@ struct xe_gsc {
struct mutex mutex;
/** @proxy.component_added: whether the component has been added */
bool component_added;
/** @proxy.started: whether the proxy has been started */
bool started;
/** @proxy.bo: object to store message to and from the GSC */
struct xe_bo *bo;
/** @proxy.to_gsc: map of the memory used to send messages to the GSC */

View file

@ -75,7 +75,8 @@ static inline struct xe_lrc *xe_lrc_get(struct xe_lrc *lrc)
*/
static inline void xe_lrc_put(struct xe_lrc *lrc)
{
kref_put(&lrc->refcount, xe_lrc_destroy);
if (lrc)
kref_put(&lrc->refcount, xe_lrc_destroy);
}
/**

View file

@ -98,10 +98,12 @@ int xe_reg_sr_add(struct xe_reg_sr *sr,
*pentry = *e;
ret = xa_err(xa_store(&sr->xa, idx, pentry, GFP_KERNEL));
if (ret)
goto fail;
goto fail_free;
return 0;
fail_free:
kfree(pentry);
fail:
xe_gt_err(gt,
"discarding save-restore reg %04lx (clear: %08x, set: %08x, masked: %s, mcr: %s): ret=%d\n",

View file

@ -280,6 +280,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
/* Don't preempt fence signaling */
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
if (job->user_fence.used) {
i = emit_flush_dw(dw, i);
i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
@ -345,6 +348,9 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
/* Don't preempt fence signaling */
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
if (job->user_fence.used) {
i = emit_flush_dw(dw, i);
i = emit_store_imm_ppgtt_posted(job->user_fence.addr,
@ -397,6 +403,9 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
i = emit_bb_start(batch_addr, ppgtt_flag, dw, i);
/* Don't preempt fence signaling */
dw[i++] = MI_ARB_ON_OFF | MI_ARB_DISABLE;
i = emit_render_cache_flush(job, dw, i);
if (job->user_fence.used)

View file

@ -453,7 +453,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
madvise_range.num_vmas,
args->atomic.val)) {
err = -EINVAL;
goto madv_fini;
goto free_vmas;
}
}
@ -490,6 +490,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *fil
err_fini:
if (madvise_range.has_bo_vmas)
drm_exec_fini(&exec);
free_vmas:
kfree(madvise_range.vmas);
madvise_range.vmas = NULL;
madv_fini:

View file

@ -241,12 +241,13 @@ static const struct xe_rtp_entry_sr gt_was[] = {
{ XE_RTP_NAME("16025250150"),
XE_RTP_RULES(GRAPHICS_VERSION(2001)),
XE_RTP_ACTIONS(SET(LSN_VC_REG2,
LSN_LNI_WGT(1) |
LSN_LNE_WGT(1) |
LSN_DIM_X_WGT(1) |
LSN_DIM_Y_WGT(1) |
LSN_DIM_Z_WGT(1)))
XE_RTP_ACTIONS(FIELD_SET(LSN_VC_REG2,
LSN_LNI_WGT_MASK | LSN_LNE_WGT_MASK |
LSN_DIM_X_WGT_MASK | LSN_DIM_Y_WGT_MASK |
LSN_DIM_Z_WGT_MASK,
LSN_LNI_WGT(1) | LSN_LNE_WGT(1) |
LSN_DIM_X_WGT(1) | LSN_DIM_Y_WGT(1) |
LSN_DIM_Z_WGT(1)))
},
/* Xe2_HPM */

View file

@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list);
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct folio *dst, struct folio *src);
void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl);
void folio_migrate_flags(struct folio *newfolio, struct folio *folio);
int folio_migrate_mapping(struct address_space *mapping,
@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag
return -ENOSYS;
}
static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl)
{
WARN_ON_ONCE(1);
spin_unlock(ptl);
}
#endif /* CONFIG_MIGRATION */
#ifdef CONFIG_NUMA_BALANCING

View file

@ -1379,14 +1379,16 @@ repeat:
#ifdef CONFIG_MIGRATION
/**
* migration_entry_wait_on_locked - Wait for a migration entry to be removed
* @entry: migration swap entry.
* softleaf_entry_wait_on_locked - Wait for a migration entry or
* device_private entry to be removed.
* @entry: migration or device_private swap entry.
* @ptl: already locked ptl. This function will drop the lock.
*
* Wait for a migration entry referencing the given page to be removed. This is
* Wait for a migration entry referencing the given page, or device_private
* entry referencing a device_private page to be unlocked. This is
* equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
* this can be called without taking a reference on the page. Instead this
* should be called while holding the ptl for the migration entry referencing
* should be called while holding the ptl for @entry referencing
* the page.
*
* Returns after unlocking the ptl.
@ -1394,7 +1396,7 @@ repeat:
* This follows the same logic as folio_wait_bit_common() so see the comments
* there.
*/
void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
__releases(ptl)
{
struct wait_page_queue wait_page;
@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
* If a migration entry exists for the page the migration path must hold
* a valid reference to the page, and it must take the ptl to remove the
* migration entry. So the page is valid until the ptl is dropped.
* Similarly any path attempting to drop the last reference to a
* device-private page needs to grab the ptl to remove the device-private
* entry.
*/
spin_unlock(ptl);

View file

@ -4763,7 +4763,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(vmf->page);
put_page(vmf->page);
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
pte_unmap(vmf->pte);
softleaf_entry_wait_on_locked(entry, vmf->ptl);
}
} else if (softleaf_is_hwpoison(entry)) {
ret = VM_FAULT_HWPOISON;

View file

@ -500,7 +500,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
if (!softleaf_is_migration(entry))
goto out;
migration_entry_wait_on_locked(entry, ptl);
softleaf_entry_wait_on_locked(entry, ptl);
return;
out:
spin_unlock(ptl);
@ -532,10 +532,10 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p
* If migration entry existed, safe to release vma lock
* here because the pgtable page won't be freed without the
* pgtable lock released. See comment right above pgtable
* lock release in migration_entry_wait_on_locked().
* lock release in softleaf_entry_wait_on_locked().
*/
hugetlb_vma_unlock_read(vma);
migration_entry_wait_on_locked(entry, ptl);
softleaf_entry_wait_on_locked(entry, ptl);
return;
}
@ -553,7 +553,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!pmd_is_migration_entry(*pmd))
goto unlock;
migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl);
return;
unlock:
spin_unlock(ptl);

View file

@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
}
if (softleaf_is_migration(entry)) {
migration_entry_wait_on_locked(entry, ptl);
softleaf_entry_wait_on_locked(entry, ptl);
spin_unlock(ptl);
return -EAGAIN;
}