Defined a reset_domain struct such that all the entities that go through reset together will be serialized one against another. Do it for both single device and XGMI hive cases. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> Suggested-by: Daniel Vetter <daniel.vetter@xxxxxxxx> Suggested-by: Christian König <ckoenig.leichtzumerken@xxxxxxxxx> Reviewed-by: Christian König <christian.koenig@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 ++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d8b854fcbffa..b76c1cfad7f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -813,6 +813,10 @@ struct amd_powerplay { #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 #define AMDGPU_PRODUCT_NAME_LEN 64 +struct amdgpu_reset_domain { + struct workqueue_struct *wq; +}; + struct amdgpu_device { struct device *dev; struct pci_dev *pdev; @@ -1100,6 +1104,7 @@ struct amdgpu_device { uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; bool ram_is_direct_mapped; + struct amdgpu_reset_domain reset_domain; }; static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ed077de426d9..9704b0e1fd82 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2398,9 +2398,27 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; - if (adev->gmc.xgmi.num_physical_nodes > 1) + if (adev->gmc.xgmi.num_physical_nodes > 1) { + struct amdgpu_hive_info *hive; + amdgpu_xgmi_add_device(adev); + hive = amdgpu_get_xgmi_hive(adev); + if (!hive || !hive->reset_domain.wq) { + DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); + r = -EINVAL; + goto init_failed; + } + + adev->reset_domain.wq = hive->reset_domain.wq; + } else { + adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0); + if (!adev->reset_domain.wq) { + r = -ENOMEM; + goto init_failed; + } + } + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index e8b8f28c2f72..d406897346d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -398,6 +398,14 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) goto pro_end; } + hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); + if (!hive->reset_domain.wq) { + dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); + kfree(hive); + hive = NULL; + goto pro_end; + } + hive->hive_id = adev->gmc.xgmi.hive_id; INIT_LIST_HEAD(&hive->device_list); INIT_LIST_HEAD(&hive->node); @@ -407,6 +415,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) task_barrier_init(&hive->tb); hive->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; hive->hi_req_gpu = NULL; + /* * hive pstate on boot is high in vega20 so we have to go to low * pstate on after boot. diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index d2189bf7d428..6121aaa292cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,6 +42,8 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_UNKNOWN } pstate; + + struct amdgpu_reset_domain reset_domain; }; struct amdgpu_pcs_ras_field { -- 2.25.1