On Mon, Feb 8, 2016 at 4:38 AM, Christian König <deathsimple@xxxxxxxxxxx> wrote: > Am 07.02.2016 um 22:51 schrieb Matthew Dawson: >> >> When the radeon driver resets a gpu, it attempts to test whether all the >> rings can successfully handle an IB. If these rings fail to respond, the >> process will wait forever. Another gpu reset can't happen at this point, >> as the current reset holds a lock required to do so. Instead, make all >> the IB tests run with a timeout, so the system can attempt to recover >> in this case. >> >> While this doesn't fix the underlying issue with card resets failing, it >> gives the system a higher chance of recovering. These timeouts have been >> confirmed to help both a Tathi and Hawaii card recover after a gpu reset. >> >> This also adds a new function, radeon_fence_wait_timeout, that behaves >> like >> fence_wait_timeout. It is used instead of fence_wait_timeout as it >> continues >> to work during a reset. radeon_fence_wait is changed to be implemented >> using this function. >> >> V2: >> - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout >> was >> too long. A timeout of 100ms was tested and found to be too short. >> - Changed radeon_fence_wait_timeout to behave more like >> fence_wait_timeout. >> >> Signed-off-by: Matthew Dawson <matthew@xxxxxxxxxxxxx> > > > Reviewed-by: Christian König <christian.koenig@xxxxxxx> Applied. Thanks! Alex > > Regards, > Christian. > > >> --- >> drivers/gpu/drm/radeon/cik.c | 11 ++++++++-- >> drivers/gpu/drm/radeon/cik_sdma.c | 9 ++++++-- >> drivers/gpu/drm/radeon/r100.c | 10 +++++++-- >> drivers/gpu/drm/radeon/r600.c | 10 +++++++-- >> drivers/gpu/drm/radeon/r600_dma.c | 9 ++++++-- >> drivers/gpu/drm/radeon/radeon.h | 2 ++ >> drivers/gpu/drm/radeon/radeon_fence.c | 40 >> ++++++++++++++++++++++++++++------- >> drivers/gpu/drm/radeon/radeon_vce.c | 11 +++++++--- >> drivers/gpu/drm/radeon/uvd_v1_0.c | 10 +++++++-- >> 9 files changed, 89 insertions(+), 23 deletions(-) >> >> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c >> index 4c30d8c..0600140 100644 >> --- a/drivers/gpu/drm/radeon/cik.c >> +++ b/drivers/gpu/drm/radeon/cik.c >> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct >> radeon_ring *ring) >> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); >> return r; >> } >> - r = radeon_fence_wait(ib.fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> radeon_scratch_free(rdev, scratch); >> radeon_ib_free(rdev, &ib); >> return r; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + radeon_scratch_free(rdev, scratch); >> + radeon_ib_free(rdev, &ib); >> + return -ETIMEDOUT; >> } >> + r = 0; >> for (i = 0; i < rdev->usec_timeout; i++) { >> tmp = RREG32(scratch); >> if (tmp == 0xDEADBEEF) >> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c >> b/drivers/gpu/drm/radeon/cik_sdma.c >> index d16f2ee..9c351dc 100644 >> --- a/drivers/gpu/drm/radeon/cik_sdma.c >> +++ b/drivers/gpu/drm/radeon/cik_sdma.c >> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); >> return r; >> } >> - r = radeon_fence_wait(ib.fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> return r; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + return -ETIMEDOUT; >> } >> + r = 0; >> for (i = 0; i < rdev->usec_timeout; i++) { >> tmp = le32_to_cpu(rdev->wb.wb[index/4]); >> if (tmp == 0xDEADBEEF) >> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c >> index 5eae0a8..6e478a2 100644 >> --- a/drivers/gpu/drm/radeon/r100.c >> +++ b/drivers/gpu/drm/radeon/r100.c >> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); >> goto free_ib; >> } >> - r = radeon_fence_wait(ib.fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> goto free_ib; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + r = -ETIMEDOUT; >> + goto free_ib; >> } >> + r = 0; >> for (i = 0; i < rdev->usec_timeout; i++) { >> tmp = RREG32(scratch); >> if (tmp == 0xDEADBEEF) { >> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c >> index cc2fdf0..ed12104 100644 >> --- a/drivers/gpu/drm/radeon/r600.c >> +++ b/drivers/gpu/drm/radeon/r600.c >> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); >> goto free_ib; >> } >> - r = radeon_fence_wait(ib.fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> goto free_ib; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + r = -ETIMEDOUT; >> + goto free_ib; >> } >> + r = 0; >> for (i = 0; i < rdev->usec_timeout; i++) { >> tmp = RREG32(scratch); >> if (tmp == 0xDEADBEEF) >> diff --git a/drivers/gpu/drm/radeon/r600_dma.c >> b/drivers/gpu/drm/radeon/r600_dma.c >> index d2dd29a..fb65e6f 100644 >> --- a/drivers/gpu/drm/radeon/r600_dma.c >> +++ b/drivers/gpu/drm/radeon/r600_dma.c >> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> DRM_ERROR("radeon: failed to schedule ib (%d).\n", r); >> return r; >> } >> - r = radeon_fence_wait(ib.fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> return r; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + return -ETIMEDOUT; >> } >> + r = 0; >> for (i = 0; i < rdev->usec_timeout; i++) { >> tmp = le32_to_cpu(rdev->wb.wb[index/4]); >> if (tmp == 0xDEADBEEF) >> diff --git a/drivers/gpu/drm/radeon/radeon.h >> b/drivers/gpu/drm/radeon/radeon.h >> index 78a51b3..007be29 100644 >> --- a/drivers/gpu/drm/radeon/radeon.h >> +++ b/drivers/gpu/drm/radeon/radeon.h >> @@ -120,6 +120,7 @@ extern int radeon_mst; >> */ >> #define RADEON_MAX_USEC_TIMEOUT 100000 /* 100 ms >> */ >> #define RADEON_FENCE_JIFFIES_TIMEOUT (HZ / 2) >> +#define RADEON_USEC_IB_TEST_TIMEOUT 1000000 /* 1s */ >> /* RADEON_IB_POOL_SIZE must be a power of 2 */ >> #define RADEON_IB_POOL_SIZE 16 >> #define RADEON_DEBUGFS_MAX_COMPONENTS 32 >> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct >> radeon_device *rdev, int ring); >> int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence >> **fence, int ring); >> void radeon_fence_process(struct radeon_device *rdev, int ring); >> bool radeon_fence_signaled(struct radeon_fence *fence); >> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool >> interruptible, long timeout); >> int radeon_fence_wait(struct radeon_fence *fence, bool interruptible); >> int radeon_fence_wait_next(struct radeon_device *rdev, int ring); >> int radeon_fence_wait_empty(struct radeon_device *rdev, int ring); >> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c >> b/drivers/gpu/drm/radeon/radeon_fence.c >> index 05815c4..7ef075a 100644 >> --- a/drivers/gpu/drm/radeon/radeon_fence.c >> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct >> radeon_device *rdev, >> } >> /** >> - * radeon_fence_wait - wait for a fence to signal >> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout >> * >> * @fence: radeon fence object >> * @intr: use interruptible sleep >> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct >> radeon_device *rdev, >> * Wait for the requested fence to signal (all asics). >> * @intr selects whether to use interruptable (true) or >> non-interruptable >> * (false) sleep when waiting for the fence. >> - * Returns 0 if the fence has passed, error for all other cases. >> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite >> wait >> + * Returns remaining time if the sequence number has passed, 0 when >> + * the wait timeout, or an error for all other cases. >> */ >> -int radeon_fence_wait(struct radeon_fence *fence, bool intr) >> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, >> long timeout) >> { >> uint64_t seq[RADEON_NUM_RINGS] = {}; >> long r; >> + int r_sig; >> /* >> * This function should not be called on !radeon fences. >> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, >> bool intr) >> return fence_wait(&fence->base, intr); >> seq[fence->ring] = fence->seq; >> - r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, >> MAX_SCHEDULE_TIMEOUT); >> - if (r < 0) { >> + r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, >> timeout); >> + if (r <= 0) { >> return r; >> } >> - r = fence_signal(&fence->base); >> - if (!r) >> + r_sig = fence_signal(&fence->base); >> + if (!r_sig) >> FENCE_TRACE(&fence->base, "signaled from fence_wait\n"); >> - return 0; >> + return r; >> +} >> + >> +/** >> + * radeon_fence_wait - wait for a fence to signal >> + * >> + * @fence: radeon fence object >> + * @intr: use interruptible sleep >> + * >> + * Wait for the requested fence to signal (all asics). >> + * @intr selects whether to use interruptable (true) or non-interruptable >> + * (false) sleep when waiting for the fence. >> + * Returns 0 if the fence has passed, error for all other cases. >> + */ >> +int radeon_fence_wait(struct radeon_fence *fence, bool intr) >> +{ >> + long r = radeon_fence_wait_timeout(fence, intr, >> MAX_SCHEDULE_TIMEOUT); >> + if (r > 0) { >> + return 0; >> + } else { >> + return r; >> + } >> } >> /** >> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c >> b/drivers/gpu/drm/radeon/radeon_vce.c >> index 7eb1ae7..566a1a0 100644 >> --- a/drivers/gpu/drm/radeon/radeon_vce.c >> +++ b/drivers/gpu/drm/radeon/radeon_vce.c >> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> goto error; >> } >> - r = radeon_fence_wait(fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + r = -ETIMEDOUT; >> } else { >> - DRM_INFO("ib test on ring %d succeeded\n", ring->idx); >> + DRM_INFO("ib test on ring %d succeeded\n", ring->idx); >> + r = 0; >> } >> error: >> radeon_fence_unref(&fence); >> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c >> b/drivers/gpu/drm/radeon/uvd_v1_0.c >> index c6b1cbc..12ddcfa 100644 >> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c >> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c >> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, >> struct radeon_ring *ring) >> goto error; >> } >> - r = radeon_fence_wait(fence, false); >> - if (r) { >> + r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies( >> + RADEON_USEC_IB_TEST_TIMEOUT)); >> + if (r < 0) { >> DRM_ERROR("radeon: fence wait failed (%d).\n", r); >> goto error; >> + } else if (r == 0) { >> + DRM_ERROR("radeon: fence wait timed out.\n"); >> + r = -ETIMEDOUT; >> + goto error; >> } >> + r = 0; >> DRM_INFO("ib test on ring %d succeeded\n", ring->idx); >> error: >> radeon_fence_unref(&fence); > > _______________________________________________ dri-devel mailing list dri-devel@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/dri-devel