SMEM instructions can reach addresses above 47 bits but require bit 47 to be sign-extended through bits [63:48]. This allows the TMA to be relocated in a following patch. Signed-off-by: Jay Cornwall <jay.cornwall@xxxxxxx> --- .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 58 ++++++++++++------- .../amd/amdkfd/cwsr_trap_handler_gfx10.asm | 5 ++ .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 5 ++ 3 files changed, 46 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 717ad0633dbe..d7cd5fa313ff 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -274,14 +274,14 @@ static const uint32_t cwsr_trap_gfx8_hex[] = { static const uint32_t cwsr_trap_gfx9_hex[] = { - 0xbf820001, 0xbf820254, + 0xbf820001, 0xbf820258, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, 0xbf840009, 0x866eff6d, 0x00ff0000, 0xbf85001e, 0x866eff7b, 0x00000400, - 0xbf850051, 0xbf8e0010, + 0xbf850055, 0xbf8e0010, 0xb8fbf803, 0xbf82fffa, 0x866eff7b, 0x03c00900, 0xbf850015, 0x866eff7b, @@ -294,13 +294,15 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbf850007, 0xb8eef801, 0x866eff6e, 0x00000800, 0xbf850003, 0x866eff7b, - 0x00000400, 0xbf850036, + 0x00000400, 0xbf85003a, 0xb8faf807, 0x867aff7a, 0x001f8000, 0x8e7a8b7a, 0x8977ff77, 0xfc000000, 0x87777a77, 0xba7ff807, 0x00000000, 0xb8faf812, 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, 0xc0031bbd, 0x00000010, 0xbf8cc07f, 0x8e6e976e, 0x8977ff77, 0x00800000, @@ -676,14 +678,14 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { }; static const uint32_t cwsr_trap_nv1x_hex[] = { - 0xbf820001, 0xbf8201f1, + 0xbf820001, 0xbf8201f5, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, 0x00002000, 0xbf840009, 0x876eff6d, 0x00ff0000, 0xbf85001e, 0x876eff7b, - 0x00000400, 0xbf850057, + 0x00000400, 0xbf85005b, 0xbf8e0010, 0xb97bf803, 0xbf82fffa, 0x876eff7b, 0x00000900, 0xbf850015, @@ -697,7 +699,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xb96ef801, 0x876eff6e, 0x00000800, 0xbf850003, 0x876eff7b, 0x00000400, - 0xbf85003c, 0x8a77ff77, + 0xbf850040, 0x8a77ff77, 0xff000000, 0xb97af807, 0x877bff7a, 0x02000000, 0x8f7b867b, 0x88777b77, @@ -706,6 +708,8 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0x8a7aff7a, 0x023f8000, 0xb9faf807, 0xb97af812, 0xb97bf813, 0x8ffa887a, + 0xbf0d8f7b, 0xbf840002, + 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, 0xbf8cc07f, 0x8f6e976e, 0x8a77ff77, 0x00800000, @@ -1094,14 +1098,14 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { }; static const uint32_t cwsr_trap_arcturus_hex[] = { - 0xbf820001, 0xbf8202d0, + 0xbf820001, 0xbf8202d4, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, 0xbf840009, 0x866eff6d, 0x00ff0000, 0xbf85001e, 0x866eff7b, 0x00000400, - 0xbf850051, 0xbf8e0010, + 0xbf850055, 0xbf8e0010, 0xb8fbf803, 0xbf82fffa, 0x866eff7b, 0x03c00900, 0xbf850015, 0x866eff7b, @@ -1114,13 +1118,15 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbf850007, 0xb8eef801, 0x866eff6e, 0x00000800, 0xbf850003, 0x866eff7b, - 0x00000400, 0xbf850036, + 0x00000400, 0xbf85003a, 0xb8faf807, 0x867aff7a, 0x001f8000, 0x8e7a8b7a, 0x8977ff77, 0xfc000000, 0x87777a77, 0xba7ff807, 0x00000000, 0xb8faf812, 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, 0xc0031bbd, 0x00000010, 0xbf8cc07f, 0x8e6e976e, 0x8977ff77, 0x00800000, @@ -1572,14 +1578,14 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { }; static const uint32_t cwsr_trap_aldebaran_hex[] = { - 0xbf820001, 0xbf8202db, + 0xbf820001, 0xbf8202df, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, 0xbf840009, 0x866eff6d, 0x00ff0000, 0xbf85001e, 0x866eff7b, 0x00000400, - 0xbf850051, 0xbf8e0010, + 0xbf850055, 0xbf8e0010, 0xb8fbf803, 0xbf82fffa, 0x866eff7b, 0x03c00900, 0xbf850015, 0x866eff7b, @@ -1592,13 +1598,15 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { 0xbf850007, 0xb8eef801, 0x866eff6e, 0x00000800, 0xbf850003, 0x866eff7b, - 0x00000400, 0xbf850036, + 0x00000400, 0xbf85003a, 0xb8faf807, 0x867aff7a, 0x001f8000, 0x8e7a8b7a, 0x8977ff77, 0xfc000000, 0x87777a77, 0xba7ff807, 0x00000000, 0xb8faf812, 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, 0xc0031bbd, 0x00000010, 0xbf8cc07f, 0x8e6e976e, 0x8977ff77, 0x00800000, @@ -2061,14 +2069,14 @@ static const uint32_t cwsr_trap_aldebaran_hex[] = { }; static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf82021c, + 0xbf820001, 0xbf820220, 0xb0804004, 0xb978f802, 0x8a78ff78, 0x00020006, 0xb97bf803, 0x876eff78, 0x00002000, 0xbf840009, 0x876eff6d, 0x00ff0000, 0xbf85001e, 0x876eff7b, - 0x00000400, 0xbf850041, + 0x00000400, 0xbf850045, 0xbf8e0010, 0xb97bf803, 0xbf82fffa, 0x876eff7b, 0x00000900, 0xbf850015, @@ -2082,8 +2090,10 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xb96ef801, 0x876eff6e, 0x00000800, 0xbf850003, 0x876eff7b, 0x00000400, - 0xbf850026, 0xb97af812, + 0xbf85002a, 0xb97af812, 0xb97bf813, 0x8ffa887a, + 0xbf0d8f7b, 0xbf840002, + 0x887bff7b, 0xffff0000, 0xf4011bbd, 0xfa000010, 0xbf8cc07f, 0x8f6e976e, 0x8a77ff77, 0x00800000, @@ -2496,7 +2506,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { }; static const uint32_t cwsr_trap_gfx11_hex[] = { - 0xbfa00001, 0xbfa00221, + 0xbfa00001, 0xbfa00225, 0xb0804006, 0xb8f8f802, 0x9178ff78, 0x00020006, 0xb8fbf803, 0xbf0d9e6d, @@ -2506,7 +2516,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0xbfa10009, 0x8b6eff6d, 0x00ff0000, 0xbfa2001e, 0x8b6eff7b, 0x00000400, - 0xbfa20041, 0xbf830010, + 0xbfa20045, 0xbf830010, 0xb8fbf803, 0xbfa0fffa, 0x8b6eff7b, 0x00000900, 0xbfa20015, 0x8b6eff7b, @@ -2519,9 +2529,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0xbfa20007, 0xb8eef801, 0x8b6eff6e, 0x00000800, 0xbfa20003, 0x8b6eff7b, - 0x00000400, 0xbfa20026, + 0x00000400, 0xbfa2002a, 0xbefa4d82, 0xbf89fc07, - 0x84fa887a, 0xf4005bbd, + 0x84fa887a, 0xbf0d8f7b, + 0xbfa10002, 0x8c7bff7b, + 0xffff0000, 0xf4005bbd, 0xf8000010, 0xbf89fc07, 0x846e976e, 0x9177ff77, 0x00800000, 0x8c776e77, @@ -2939,14 +2951,14 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { }; static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { - 0xbf820001, 0xbf8202d7, + 0xbf820001, 0xbf8202db, 0xb8f8f802, 0x8978ff78, 0x00020006, 0xb8fbf803, 0x866eff78, 0x00002000, 0xbf840009, 0x866eff6d, 0x00ff0000, 0xbf85001a, 0x866eff7b, 0x00000400, - 0xbf85004d, 0xbf8e0010, + 0xbf850051, 0xbf8e0010, 0xb8fbf803, 0xbf82fffa, 0x866eff7b, 0x03c00900, 0xbf850011, 0x866eff7b, @@ -2957,13 +2969,15 @@ static const uint32_t cwsr_trap_gfx9_4_3_hex[] = { 0x866e6f6e, 0xbf850006, 0x866eff6d, 0x00ff0000, 0xbf850003, 0x866eff7b, - 0x00000400, 0xbf850036, + 0x00000400, 0xbf85003a, 0xb8faf807, 0x867aff7a, 0x001f8000, 0x8e7a8b7a, 0x8979ff79, 0xfc000000, 0x87797a79, 0xba7ff807, 0x00000000, 0xb8faf812, 0xb8fbf813, 0x8efa887a, + 0xbf0d8f7b, 0xbf840002, + 0x877bff7b, 0xffff0000, 0xc0031bbd, 0x00000010, 0xbf8cc07f, 0x8e6e976e, 0x8979ff79, 0x00800000, diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 8b92c33c2a7c..fdab64624422 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -276,6 +276,11 @@ L_FETCH_2ND_TRAP: #endif s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_bitcmp1_b32 ttmp15, 0xF + s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA + s_or_b32 ttmp15, ttmp15, 0xFFFF0000 +L_NO_SIGN_EXTEND_TMA: + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag s_waitcnt lgkmcnt(0) s_lshl_b32 ttmp2, ttmp2, TTMP11_DEBUG_TRAP_ENABLED_SHIFT diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index f2087cc2e89d..e506411ad28a 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm @@ -283,6 +283,11 @@ L_FETCH_2ND_TRAP: s_getreg_b32 ttmp15, hwreg(HW_REG_SQ_SHADER_TMA_HI) s_lshl_b64 [ttmp14, ttmp15], [ttmp14, ttmp15], 0x8 + s_bitcmp1_b32 ttmp15, 0xF + s_cbranch_scc0 L_NO_SIGN_EXTEND_TMA + s_or_b32 ttmp15, ttmp15, 0xFFFF0000 +L_NO_SIGN_EXTEND_TMA: + s_load_dword ttmp2, [ttmp14, ttmp15], 0x10 glc:1 // debug trap enabled flag s_waitcnt lgkmcnt(0) s_lshl_b32 ttmp2, ttmp2, TTMP_DEBUG_TRAP_ENABLED_SHIFT -- 2.25.1