Here is one possible deadlock case:
gpu_recovery
-> stop drm scheduler
-> asic reset
-> ib test
-> tt populate (uvd ib test)
-> ttm_bo_swapout (BO A) // this always fails because the fence of
BO A would not be signaled by the scheduler or the HW, so we hit a deadlock.
I have pasted the drm test patch below.
#modprobe ttm pages_limit=65536
#amdgpu_test -s 1 -t 4
---
tests/amdgpu/basic_tests.c | 32 ++++++++++++++------------------
1 file changed, 14 insertions(+), 18 deletions(-)
diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
index dbf02fee..f85ed340 100644
--- a/tests/amdgpu/basic_tests.c
+++ b/tests/amdgpu/basic_tests.c
@@ -65,13 +65,16 @@ static void amdgpu_direct_gma_test(void);
static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type);
-static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
+static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
unsigned ip_type,
int instance, int pm4_dw, uint32_t *pm4_src,
int res_cnt, amdgpu_bo_handle *resources,
struct amdgpu_cs_ib_info *ib_info,
- struct amdgpu_cs_request *ibs_request);
+ struct amdgpu_cs_request *ibs_request, int sync, int repeat);
+#define amdgpu_test_exec_cs_helper(...) \
+ _amdgpu_test_exec_cs_helper(__VA_ARGS__, 1, 1)
+
CU_TestInfo basic_tests[] = {
{ "Query Info Test", amdgpu_query_info_test },
{ "Userptr Test", amdgpu_userptr_test },
@@ -1341,12 +1344,12 @@ static void amdgpu_command_submission_compute(void)
* pm4_src, resources, ib_info, and ibs_request
* submit command stream described in ibs_request and wait for this IB accomplished
*/
-static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
+static void _amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
unsigned ip_type,
int instance, int pm4_dw, uint32_t *pm4_src,
int res_cnt, amdgpu_bo_handle *resources,
struct amdgpu_cs_ib_info *ib_info,
- struct amdgpu_cs_request *ibs_request)
+ struct amdgpu_cs_request *ibs_request, int sync, int repeat)
{
int r;
uint32_t expired;
@@ -1395,12 +1398,15 @@ static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
/* submit CS */
- r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
+ while (repeat--)
+ r = amdgpu_cs_submit(context_handle, 0, ibs_request, 1);
CU_ASSERT_EQUAL(r, 0);
r = amdgpu_bo_list_destroy(ibs_request->resources);
CU_ASSERT_EQUAL(r, 0);
+ if (!sync)
+ return;
fence_status.ip_type = ip_type;
fence_status.ip_instance = 0;
fence_status.ring = ibs_request->ring;
@@ -1667,7 +1673,7 @@ static void amdgpu_command_submission_sdma_const_fill(void)
static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
{
- const int sdma_write_length = 1024;
+ const int sdma_write_length = (255) << 20;
const int pm4_dw = 256;
amdgpu_context_handle context_handle;
amdgpu_bo_handle bo1, bo2;
@@ -1715,8 +1721,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
&bo1_va_handle);
CU_ASSERT_EQUAL(r, 0);
- /* set bo1 */
- memset((void*)bo1_cpu, 0xaa, sdma_write_length);
/* allocate UC bo2 for sDMA use */
r = amdgpu_bo_alloc_and_map(device_handle,
@@ -1727,8 +1731,6 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
&bo2_va_handle);
CU_ASSERT_EQUAL(r, 0);
- /* clear bo2 */
- memset((void*)bo2_cpu, 0, sdma_write_length);
resources[0] = bo1;
resources[1] = bo2;
@@ -1785,17 +1787,11 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
}
}
- amdgpu_test_exec_cs_helper(context_handle,
+ _amdgpu_test_exec_cs_helper(context_handle,
ip_type, ring_id,
i, pm4,
2, resources,
- ib_info, ibs_request);
-
- /* verify if SDMA test result meets with expected */
- i = 0;
- while(i < sdma_write_length) {
- CU_ASSERT_EQUAL(bo2_cpu[i++], 0xaa);
- }
+ ib_info, ibs_request, 0, 100);
r = amdgpu_bo_unmap_and_free(bo1, bo1_va_handle, bo1_mc,
sdma_write_length);
CU_ASSERT_EQUAL(r, 0);