On Tue, Feb 4, 2025 at 9:48 AM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> Testing this feature showed that it was a bit unstable. The
> CP_VMID_RESET register takes the VMID whose submissions should all be
> canceled.
>
> Unlike Windows, Linux uses per-process VMIDs instead of per-engine
> VMIDs, for the simple reason that we don't have enough. So resetting
> one VMID only killed the submissions of one specific process.
>
> Fortunately that turned out to be exactly what we want to have.
>
> So clear the CP_VMID_RESET register on every context switch between
> applications, when we do the pipeline sync, to avoid trouble if
> multiple VMIDs are used on the ring right behind each other.
>
> Use the same pipeline sync function in the reset handler and issue an
> IB test instead of a ring test after the queue reset, to provide a
> longer timeout and an additional fence value should there be further
> work on the ring after the aborted submission.
>
> Also drop the soft recovery, since that does pretty much the same
> thing as CP_VMID_RESET, just on a lower level and with less chance of
> succeeding.
>
> This now survives a stress test running overnight, sending a broken
> submission every 45 seconds and recovering fine from each of them.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 47 ++++++++++-----------------
>  2 files changed, 19 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 8902fafbcf8d..1eee2a1bca5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -275,6 +275,7 @@ extern int amdgpu_wbrf;
>  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS          3000
>  #define AMDGPU_MAX_USEC_TIMEOUT                 100000  /* 100 ms */
>  #define AMDGPU_FENCE_JIFFIES_TIMEOUT            (HZ / 2)
> +#define AMDGPU_QUEUE_RESET_TIMEOUT              (HZ / 10)
>  #define AMDGPU_DEBUGFS_MAX_COMPONENTS           32
>  #define AMDGPUFB_CONN_LIMIT                     4
>  #define AMDGPU_BIOS_NUM_SCRATCH                 16
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index fa572b40989e..705f5a9c11a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5607,7 +5607,17 @@ static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>         int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>         uint32_t seq = ring->fence_drv.sync_seq;
>         uint64_t addr = ring->fence_drv.gpu_addr;
> +       struct amdgpu_device *adev = ring->adev;
>
> +       amdgpu_ring_emit_reg_wait(ring,
> +                                 SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +                                 0, 0xffff);
> +       amdgpu_ring_emit_wreg(ring,
> +                             SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +                             0);
> +       amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> +                              ring->fence_drv.sync_seq,
> +                              AMDGPU_FENCE_FLAG_EXEC);
>         gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
>                               lower_32_bits(addr), upper_32_bits(addr),
>                               seq, 0xffffffff, 4);
> @@ -5963,20 +5973,6 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
>                                       ref, mask);
>  }
>
> -static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
> -{
> -       struct amdgpu_device *adev = ring->adev;
> -       uint32_t value = 0;
> -
> -       value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> -       value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> -       value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> -       value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> -       amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> -       WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> -       amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>                                                   enum amdgpu_interrupt_state state)
>  {
> @@ -7252,16 +7248,12 @@ static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>         if (r)
>                 return r;
>
> -       if (amdgpu_ring_alloc(ring, 7 + 7 + 5))
> +       if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
>                 return -ENOMEM;
> -       gfx_v9_0_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> -                                ring->fence_drv.sync_seq, AMDGPU_FENCE_FLAG_EXEC);
> -       gfx_v9_0_ring_emit_reg_wait(ring,
> -                                   SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffff);
> -       gfx_v9_0_ring_emit_wreg(ring,
> -                               SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
> +       gfx_v9_0_ring_emit_pipeline_sync(ring);
> +       amdgpu_ring_commit(ring);
>
> -       return amdgpu_ring_test_ring(ring);
> +       return gfx_v9_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
>  }
>
>  static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -7468,7 +7460,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
>         .emit_frame_size = /* totally 242 maximum if 16 IBs */
>                 5 +  /* COND_EXEC */
> -               7 +  /* PIPELINE_SYNC */
> +               7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* VM_FLUSH */
> @@ -7506,7 +7498,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .reset = gfx_v9_0_reset_kgq,
>         .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
> @@ -7525,7 +7516,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>         .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
>         .emit_frame_size = /* totally 242 maximum if 16 IBs */
>                 5 +  /* COND_EXEC */
> -               7 +  /* PIPELINE_SYNC */
> +               7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* VM_FLUSH */
> @@ -7564,7 +7555,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .patch_cntl = gfx_v9_0_ring_patch_cntl,
>         .patch_de = gfx_v9_0_ring_patch_de_meta,
> @@ -7586,7 +7576,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
> -               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +               7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -7608,7 +7598,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,

Probably want to keep soft_recovery for compute as compute queues don't
support vmid reset.
Alex

>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .emit_wave_limit = gfx_v9_0_emit_wave_limit,
>         .reset = gfx_v9_0_reset_kcq,
> @@ -7629,7 +7618,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
> -               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +               7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> --
> 2.34.1
>
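For illustration, a minimal sketch (untested, hypothetical follow-up,
not part of the patch above) of what keeping soft recovery for compute
only could look like: reinstate the removed function unchanged, and
keep the .soft_recovery entry in gfx_v9_0_ring_funcs_compute while the
gfx and sw-gfx tables still drop theirs:

/* Untested sketch: keep the SQ_CMD-based soft recovery, but wire it up
 * only for compute rings, since CP_VMID_RESET only cancels gfx
 * submissions. The body is verbatim the function removed above; the
 * SQ_CMD field values queue a wave kill limited to the given VMID.
 */
static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
{
        struct amdgpu_device *adev = ring->adev;
        uint32_t value = 0;

        value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
        value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
        value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
        value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
        amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
        WREG32_SOC15(GC, 0, mmSQ_CMD, value);
        amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
}

static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
        /* ... all other callbacks unchanged from the patch ... */
        .soft_recovery = gfx_v9_0_ring_soft_recovery, /* kept for compute */
        .emit_mem_sync = gfx_v9_0_emit_mem_sync,
        .emit_wave_limit = gfx_v9_0_emit_wave_limit,
        .reset = gfx_v9_0_reset_kcq,
};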