[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Christian > König > Sent: Tuesday, February 4, 2025 10:31 PM > To: timur.kristof@xxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>; > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: [PATCH 5/5] drm/amdgpu: rework gfx10 queue reset > > Apply the same changes to gfx10 as done to gfx9. > > The general idea to reset the whole kernel queue and then asking the kiq to map it > again didn't worked at all. Background is that we don't use per application kernel > queues for gfx10 on Linux for performance reasons. > > So instead use the gfx9 approach here as well and only reset all submissions from > a specific VMID instead of the whole queue. > > This also avoids reserving and kmap the MQD which are operations generally not > allowed in the reset handler. > > This approach seems to work for at least some time, but not as reliable as it is on > gfx9. It will probably need some more work until it survives a whole night of reset > stress testing. > > Signed-off-by: Christian König <christian.koenig@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 65 +++++++------------------- > 1 file changed, 16 insertions(+), 49 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 5ba263fe5512..7ffdba974f87 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -8684,7 +8684,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct > amdgpu_ring *ring) > int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX); > uint32_t seq = ring->fence_drv.sync_seq; > uint64_t addr = ring->fence_drv.gpu_addr; > + struct amdgpu_device *adev = ring->adev; > > + amdgpu_ring_emit_reg_wait(ring, > + SOC15_REG_OFFSET(GC, 0, > mmCP_VMID_RESET), > + 0, 0xffff); > + amdgpu_ring_emit_wreg(ring, > + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > + 0); > + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, > + ring->fence_drv.sync_seq, > + AMDGPU_FENCE_FLAG_EXEC); > gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr), > upper_32_bits(addr), seq, 0xffffffff, 4); } @@ -8984,21 > +8994,6 @@ static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct > amdgpu_ring *ring, > ref, mask); > } > > -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring, > - unsigned int vmid) > -{ > - struct amdgpu_device *adev = ring->adev; > - uint32_t value = 0; > - > - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); > - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01); > - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1); > - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); > - amdgpu_gfx_rlc_enter_safe_mode(adev, 0); > - WREG32_SOC15(GC, 0, mmSQ_CMD, value); > - amdgpu_gfx_rlc_exit_safe_mode(adev, 0); > -} > - > static void > gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev, > uint32_t me, uint32_t pipe, > @@ -9467,7 +9462,6 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring > *ring, unsigned int vmid) > struct amdgpu_ring *kiq_ring = &kiq->ring; > unsigned long flags; > u32 tmp; > - u64 addr; > int r; > > if (amdgpu_sriov_vf(adev)) > @@ -9478,27 +9472,14 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring > *ring, unsigned int vmid) > > spin_lock_irqsave(&kiq->ring_lock, flags); > > - if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) { > + if (amdgpu_ring_alloc(kiq_ring, 5)) { > spin_unlock_irqrestore(&kiq->ring_lock, flags); > return -ENOMEM; > } > > - addr = amdgpu_bo_gpu_offset(ring->mqd_obj) + > - offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active); > tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << > vmid); > - if (ring->pipe == 0) > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, > 1 << ring->queue); > - else > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, > 1 << ring->queue); > - > gfx_v10_0_ring_emit_wreg(kiq_ring, > SOC15_REG_OFFSET(GC, 0, > mmCP_VMID_RESET), tmp); > - gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0, > - lower_32_bits(addr), upper_32_bits(addr), > - 0, 1, 0x20); > - gfx_v10_0_ring_emit_reg_wait(kiq_ring, > - SOC15_REG_OFFSET(GC, 0, > mmCP_VMID_RESET), 0, 0xffffffff); > - kiq->pmf->kiq_map_queues(kiq_ring, ring); > amdgpu_ring_commit(kiq_ring); > > spin_unlock_irqrestore(&kiq->ring_lock, flags); @@ -9507,24 +9488,12 > @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid) > if (r) > return r; > > - r = amdgpu_bo_reserve(ring->mqd_obj, false); > - if (unlikely(r != 0)) { > - DRM_ERROR("fail to resv mqd_obj\n"); > - return r; > - } > - r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr); > - if (!r) { > - r = gfx_v10_0_kgq_init_queue(ring, true); > - amdgpu_bo_kunmap(ring->mqd_obj); > - ring->mqd_ptr = NULL; > - } > - amdgpu_bo_unreserve(ring->mqd_obj); > - if (r) { > - DRM_ERROR("fail to unresv mqd_obj\n"); > - return r; > - } > + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7)) > + return -ENOMEM; > + gfx_v10_0_ring_emit_pipeline_sync(ring); Why we need to set CP_VMID_RESET 0xffff here? Does it mean that we have to reset(terminate) all the waves from the other vmids? Thanks, Jiadong > + amdgpu_ring_commit(ring); > > - return amdgpu_ring_test_ring(ring); > + return gfx_v10_0_ring_test_ib(ring, > AMDGPU_QUEUE_RESET_TIMEOUT); > } > > static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring, @@ -9819,7 +9788,6 > @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kgq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > @@ -9860,7 +9828,6 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_compute = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kcq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > -- > 2.34.1