[PATCH 10/11] drm/amdkfd: Handle save/restore of lds allocated in 1280B blocks

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Lancelot SIX <lancelot.six@xxxxxxx>

The gfx-9 trap handler is reading LDS allocation size in 256 bytes
granularity (from SQ_WAVE_LDS_ALLOC), but it using the assumption that
this value is always even (i.e. the LDS allocation is really done in
multiple of 512 bytes).  This was true so far, but gfx-950 allocates LDS
in chunks of 1280 bytes, making this assumption invalid.  This can cause
the trap handler to try to save / restore past the end of LDS, and past
the LDS allocated slot in the save are, overriding data from the
following wave.

This patch updates the trap handler to support LDS allocated in 1280
bytes blocks:
- During restore, copy from main memory directly to LDS in batch of 1280
  bytes.
- During save, continue to use 512 bytes blocks (we only have 2 VGPRs we
  can use to hold data), making sure to mask the upper half of the wave
  when handling when the LDS size is not a multiple of 512 bytes.

Signed-off-by: Lancelot SIX <lancelot.six@xxxxxxx>
Co-authored-By: Alex Sierra <alex.sierra@xxxxxxx>
Reviewed-by: Jay Cornwall <jay.cornwall@xxxxxxx>
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 60 +++++++++----------
 .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 25 +++++++-
 2 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 5931a17b4da3e..388b44ed5928c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -4124,7 +4124,7 @@ static const uint32_t cwsr_trap_gfx12_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
-	0xbf820001, 0xbf8202ea,
+	0xbf820001, 0xbf8202d8,
 	0xb8f8f802, 0x8978ff78,
 	0x00020006, 0xb8fbf803,
 	0x866eff78, 0x00002000,
@@ -4321,9 +4321,9 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 	0xe0724300, 0x701d0300,
 	0xbefe00c1, 0xbeff00c1,
 	0xb8fb5306, 0x867bc17b,
-	0xbf840064, 0xbf8a0000,
+	0xbf840052, 0xbf8a0000,
 	0x867aff6f, 0x04000000,
-	0xbf840060, 0x8e7b867b,
+	0xbf84004e, 0x8e7b867b,
 	0x8e7b827b, 0xbef6007b,
 	0xb8f02985, 0x80708170,
 	0x8e708a70, 0x8e708170,
@@ -4336,8 +4336,8 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 	0x000204c1, 0x867aff78,
 	0x00400000, 0xbf850003,
 	0xb8faf803, 0x897a7aff,
-	0x10000000, 0xbf850030,
-	0x24040682, 0xd86e4000,
+	0x10000000, 0xbf85001d,
+	0x24040682, 0xd86c0000,
 	0x00000002, 0xbf8cc07f,
 	0xbe840080, 0xd2890000,
 	0x00000900, 0x80048104,
@@ -4348,29 +4348,20 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 	0x80048104, 0xc069003a,
 	0x00000070, 0xbf8cc07f,
 	0x80709070, 0xbf06c004,
-	0xbf84ffee, 0xbe840080,
-	0xd2890000, 0x00000901,
-	0x80048104, 0xd2890001,
-	0x00000901, 0x80048104,
-	0xd2890002, 0x00000901,
-	0x80048104, 0xd2890003,
-	0x00000901, 0x80048104,
-	0xc069003a, 0x00000070,
-	0xbf8cc07f, 0x80709070,
-	0xbf06c004, 0xbf84ffee,
-	0x680404ff, 0x00000200,
+	0xbf84ffee, 0x680404ff,
+	0x00000100, 0xd0c9006a,
+	0x0000f702, 0xbf87ffe5,
+	0xbf820016, 0xd1060002,
+	0x00011103, 0x7e0602ff,
+	0x00000200, 0xbefc00ff,
+	0x00010000, 0xbe800077,
+	0x8677ff77, 0xff7fffff,
+	0x8777ff77, 0x00058000,
+	0xd8ec0000, 0x00000002,
+	0xbf8cc07f, 0xe0765000,
+	0x701d0002, 0x68040702,
 	0xd0c9006a, 0x0000f702,
-	0xbf87ffd2, 0xbf820015,
-	0xd1060002, 0x00011103,
-	0x7e0602ff, 0x00000200,
-	0xbefc00ff, 0x00010000,
-	0xbe800077, 0x8677ff77,
-	0xff7fffff, 0x8777ff77,
-	0x00058000, 0xd8ec0000,
-	0x00000002, 0xbf8cc07f,
-	0xe0765000, 0x701d0002,
-	0x68040702, 0xd0c9006a,
-	0x0000f702, 0xbf87fff7,
+	0xbefe016a, 0xbf87fff6,
 	0xbef70000, 0xbef000ff,
 	0x00000400, 0xbefe00c1,
 	0xbeff00c1, 0xb8fb2b05,
@@ -4497,15 +4488,15 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 	0x701d0300, 0x807c847c,
 	0x8070ff70, 0x00000400,
 	0xbf0a7b7c, 0xbf85ffeb,
-	0xbf9c0000, 0xbf8200ee,
+	0xbf9c0000, 0xbf8200f4,
 	0xbef4007e, 0x8675ff7f,
 	0x0000ffff, 0x8775ff75,
 	0x00040000, 0xbef60080,
 	0xbef700ff, 0x00807fac,
 	0x866eff7f, 0x04000000,
-	0xbf84001f, 0xbefe00c1,
+	0xbf840025, 0xbefe00c1,
 	0xbeff00c1, 0xb8ef5306,
-	0x866fc16f, 0xbf84001a,
+	0x866fc16f, 0xbf840020,
 	0x8e6f866f, 0x8e6f826f,
 	0xbef6006f, 0xb8f82985,
 	0x80788178, 0x8e788a78,
@@ -4516,9 +4507,12 @@ static const uint32_t cwsr_trap_gfx9_5_0_hex[] = {
 	0x01000000, 0xbefc0080,
 	0xe0510000, 0x781d0000,
 	0xe0510100, 0x781d0000,
-	0x807cff7c, 0x00000200,
-	0x8078ff78, 0x00000200,
-	0xbf0a6f7c, 0xbf85fff6,
+	0xe0510200, 0x781d0000,
+	0xe0510300, 0x781d0000,
+	0xe0510400, 0x781d0000,
+	0x807cff7c, 0x00000500,
+	0x8078ff78, 0x00000500,
+	0xbf0a6f7c, 0xbf85fff0,
 	0xbefe00c1, 0xbeff00c1,
 	0xbef600ff, 0x01000000,
 	0xb8ef2b05, 0x806f816f,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
index 0f4877a601409..0eabb7a8cab94 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm
@@ -75,8 +75,10 @@ var SQ_WAVE_STATUS_ECC_ERR_MASK         = 0x20000
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT	= 12
 #if ASIC_FAMILY >= CHIP_GC_9_5_0
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE	= 11
+var LDS_RESTORE_GRANULARITY_BYTES	= 1280
 #else
 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE	= 9
+var LDS_RESTORE_GRANULARITY_BYTES	= 512
 #endif
 var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE	= 6
 var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE	= 3			//FIXME	 sq.blk still has 4 bits at this time while SQ programming guide has 3 bits
@@ -572,12 +574,21 @@ if SAVE_AFTER_XNACK_ERROR
 
 	v_lshlrev_b32 v2, 2, v3
 L_SAVE_LDS_LOOP_SQC:
+#if ASIC_FAMILY < CHIP_GC_9_5_0
 	ds_read2_b32 v[0:1], v2 offset0:0 offset1:0x40
 	s_waitcnt lgkmcnt(0)
-
 	write_vgprs_to_mem_with_sqc(v0, 2, s_save_buf_rsrc0, s_save_mem_offset)
 
 	v_add_u32 v2, 0x200, v2
+#else
+	// gfx950 needs to save in multiple of 256 bytes.
+	ds_read_b32 v0, v2
+	s_waitcnt lgkmcnt(0)
+	write_vgprs_to_mem_with_sqc(v0, 1, s_save_buf_rsrc0, s_save_mem_offset)
+
+	v_add_u32 v2, 0x100, v2
+#endif
+
 	v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
 	s_cbranch_vccnz L_SAVE_LDS_LOOP_SQC
 
@@ -601,6 +612,9 @@ L_SAVE_LDS_LOOP_VECTOR:
 //	v_add_u32 v2, vcc[0:1], v2, v3
       v_add_u32 v2, v2, v3
       v_cmp_lt_u32 vcc[0:1], v2, s_save_alloc_size
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+      s_mov_b64 exec, vcc
+#endif
       s_cbranch_vccnz L_SAVE_LDS_LOOP_VECTOR
 
       // restore rsrc3
@@ -763,8 +777,13 @@ L_RESTORE:
   L_RESTORE_LDS_LOOP:
 	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1		       // first 64DW
 	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:256	       // second 64DW
-    s_add_u32	    m0, m0, 256*2						// 128 DW
-    s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, 256*2		//mem offset increased by 128DW
+#if ASIC_FAMILY >= CHIP_GC_9_5_0
+	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:512	// third 64DW
+	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:768	// forth 64DW
+	buffer_load_dword   v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 offset:1024	// fifth 64DW
+#endif
+    s_add_u32	    m0, m0, LDS_RESTORE_GRANULARITY_BYTES					// 128/320 DW
+    s_add_u32	    s_restore_mem_offset, s_restore_mem_offset, LDS_RESTORE_GRANULARITY_BYTES	//mem offset increased by 128/320 DW
     s_cmp_lt_u32    m0, s_restore_alloc_size					//scc=(m0 < s_restore_alloc_size) ? 1 : 0
     s_cbranch_scc1  L_RESTORE_LDS_LOOP							    //LDS restore is complete?
 
-- 
2.47.0




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux