On Fri, Dec 13, 2024 at 4:03 AM Christian König <ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> This partially reverts commit 194eb174cbe4fe2b3376ac30acca2dc8c8beca00.
>
> This commit introduced a new state variable into adev without even
> remotely worrying about CPU barriers.
>
> Since we already have the amdgpu_in_reset() function exactly for this
> use case partially revert that.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>

Series is:
Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 1 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 4 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 3 ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     | 2 +-
>  5 files changed, 4 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 7051b697530b..5e55a44f9eef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1192,7 +1192,6 @@ struct amdgpu_device {
>
>  	struct work_struct reset_work;
>
> -	bool job_hang;
>  	bool dc_enabled;
>  	/* Mask of active clusters */
>  	uint32_t aid_mask;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 503051352922..dca5a4ef2734 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -836,7 +836,7 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
>  	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>  		return -EINVAL;
>
> -	if (!kiq_ring->sched.ready || adev->job_hang)
> +	if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
>  		return 0;
>
>  	ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a4dde54512b1..2e6829e1554b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -515,7 +515,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
>  	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>  		return -EINVAL;
>
> -	if (!kiq_ring->sched.ready || adev->job_hang || amdgpu_in_reset(adev))
> +	if (!kiq_ring->sched.ready || amdgpu_in_reset(adev))
>  		return 0;
>
>  	spin_lock(&kiq->ring_lock);
> @@ -567,7 +567,7 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
>  	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
>  		return -EINVAL;
>
> -	if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
> +	if (!adev->gfx.kiq[0].ring.sched.ready || amdgpu_in_reset(adev))
>  		return 0;
>
>  	if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 7fdf7b047317..6cc44eb2586f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -102,8 +102,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>  		return DRM_GPU_SCHED_STAT_ENODEV;
>  	}
>
> -	adev->job_hang = true;
> -
>  	/*
>  	 * Do the coredump immediately after a job timeout to get a very
>  	 * close dump/snapshot/representation of GPU's current error status
> @@ -181,7 +179,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>  	}
>
>  exit:
> -	adev->job_hang = false;
>  	drm_dev_exit(idx);
>  	return DRM_GPU_SCHED_STAT_NOMINAL;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index c4e15418e187..714a6caeb679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5957,7 +5957,7 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
>  	else
>  		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
>
> -	if (adev->job_hang && !enable)
> +	if (amdgpu_in_reset(adev) && !enable)
>  		return 0;
>
>  	for (i = 0; i < adev->usec_timeout; i++) {
> --
> 2.34.1
>