From: Jay Cornwall <Jay.Cornwall@xxxxxxx> Synchronization between context-saving wavefronts is achieved by sending a SAVEWAVE message to the SPI and then spin-waiting for a response. These spin-waiting wavefronts may inhibit the progress of other wavefronts in the context save handler, leading to the synchronization condition never being achieved. Before spin-waiting reduce the priority of each wavefront to guarantee foward progress in the others. Signed-off-by: Jay Cornwall <Jay.Cornwall at amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> --- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm | 10 ++++++++-- drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm index 997a383d..34eabcd 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm @@ -98,6 +98,7 @@ var SWIZZLE_EN = 0 //whether we use swi /**************************************************************************/ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 @@ -319,6 +320,10 @@ end s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC end + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 @@ -1132,7 +1137,7 @@ end #endif static const uint32_t cwsr_trap_gfx8_hex[] = { - 0xbf820001, 0xbf820123, + 0xbf820001, 0xbf820125, 0xb8f4f802, 0x89748674, 0xb8f5f803, 0x8675ff75, 0x00000400, 0xbf850011, @@ -1158,7 +1163,8 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { 0x867aff7a, 0x00007fff, 0xb97af807, 0xbef2007e, 0xbef3007f, 0xbefe0180, - 0xbf900004, 0xbf8e0002, + 0xbf900004, 0x877a8474, + 0xb97af802, 0xbf8e0002, 0xbf88fffe, 0xbef8007e, 0x8679ff7f, 0x0000ffff, 0x8779ff79, 0x00040000, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index da09794..8fc3698 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -97,6 +97,7 @@ var ACK_SQC_STORE = 1 //workaround for suspected SQC store bug causing /**************************************************************************/ var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 +var SQ_WAVE_STATUS_SPI_PRIO_SHIFT = 1 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 @@ -362,6 +363,10 @@ end s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC end + // Set SPI_PRIO=2 to avoid starving instruction fetch in the waves we're waiting for. + s_or_b32 s_save_tmp, s_save_status, (2 << SQ_WAVE_STATUS_SPI_PRIO_SHIFT) + s_setreg_b32 hwreg(HW_REG_STATUS), s_save_tmp + L_SLEEP: s_sleep 0x2 // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause SQ hang, since the 7,8th wave could not get arbit to exec inst, while other waves are stuck into the sleep-loop and waiting for wrexec!=0 @@ -1210,7 +1215,7 @@ end #endif static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820158, + 0xbf820001, 0xbf82015a, 0xb8f8f802, 0x89788678, 0xb8f1f803, 0x866eff71, 0x00000400, 0xbf850034, @@ -1249,6 +1254,7 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0x00007fff, 0xb970f807, 0xbeee007e, 0xbeef007f, 0xbefe0180, 0xbf900004, + 0x87708478, 0xb970f802, 0xbf8e0002, 0xbf88fffe, 0xb8f02a05, 0x80708170, 0x8e708a70, 0xb8f11605, -- 2.7.4