Am 01.03.2018 um 10:24 schrieb Monk Liu: > sometimes the GPU is switched to other VFs and won't switch > back soon, so the kiq reg access will not signal within > a short period. Instead of busy waiting a long time (MAX_KIQ_REG_WAIT) > and returning TMO we can instead sleep 5ms and try again > later (non irq context) > > And since the waiting in kiq_r/wreg is a busy wait, MAX_KIQ_REG_WAIT > shouldn't be set to a long time; setting it to 5ms is more appropriate. > > if the gpu is already in reset state, don't retry the KIQ reg access, > otherwise it would always hang because the KIQ is usually already dead. > > v2: > replace schedule() with msleep() for the wait > > v3: > use while loop for the wait repeating > use macros for the sleep period > more description for it > > Change-Id: I088243be6ccf722f6d80c3e70c62271d37edb40d > Signed-off-by: Monk Liu <Monk.Liu at amd.com> Reviewed-by: Christian König <christian.koenig at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 62 ++++++++++++++++++++++++++------ > 1 file changed, 52 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index b832651..43ce2e5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -22,7 +22,9 @@ > */ > > #include "amdgpu.h" > -#define MAX_KIQ_REG_WAIT 100000000 /* in usecs */ > +#define MAX_KIQ_REG_WAIT 5000 /* in usecs, 5ms */ > +#define MAX_KIQ_REG_BAILOUT_INTERVAL 5 /* in msecs, 5ms */ > +#define MAX_KIQ_REG_TRY 20 > > uint64_t amdgpu_csa_vaddr(struct amdgpu_device *adev) > { > @@ -137,7 +139,7 @@ void amdgpu_virt_init_setting(struct amdgpu_device *adev) > > uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) > { > - signed long r; > + signed long r, cnt = 0; > unsigned long flags; > uint32_t val, seq; > struct amdgpu_kiq *kiq = &adev->gfx.kiq; > @@ -153,18 +155,36 @@ uint32_t amdgpu_virt_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) > 
spin_unlock_irqrestore(&kiq->ring_lock, flags); > > r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); > - if (r < 1) { > - DRM_ERROR("wait for kiq fence error: %ld\n", r); > - return ~0; > + > + /* don't wait anymore for gpu reset case because this way may > + * block gpu_recover() routine forever, e.g. this virt_kiq_rreg > + * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will > + * never return if we keep waiting in virt_kiq_rreg, which cause > + * gpu_recover() hang there. > + * > + * also don't wait anymore for IRQ context > + * */ > + if (r < 1 && (adev->in_gpu_reset || in_interrupt())) > + goto failed_kiq_read; > + > + while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) { > + msleep(MAX_KIQ_REG_BAILOUT_INTERVAL); > + r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); > } > - val = adev->wb.wb[adev->virt.reg_val_offs]; > > - return val; > + if (cnt > MAX_KIQ_REG_TRY) > + goto failed_kiq_read; > + > + return adev->wb.wb[adev->virt.reg_val_offs]; > + > +failed_kiq_read: > + pr_err("failed to read reg:%x\n", reg); > + return ~0; > } > > void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) > { > - signed long r; > + signed long r, cnt = 0; > unsigned long flags; > uint32_t seq; > struct amdgpu_kiq *kiq = &adev->gfx.kiq; > @@ -180,8 +200,30 @@ void amdgpu_virt_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) > spin_unlock_irqrestore(&kiq->ring_lock, flags); > > r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); > - if (r < 1) > - DRM_ERROR("wait for kiq fence error: %ld\n", r); > + > + /* don't wait anymore for gpu reset case because this way may > + * block gpu_recover() routine forever, e.g. this virt_kiq_rreg > + * is triggered in TTM and ttm_bo_lock_delayed_workqueue() will > + * never return if we keep waiting in virt_kiq_rreg, which cause > + * gpu_recover() hang there. 
> + * > + * also don't wait anymore for IRQ context > + * */ > + if (r < 1 && (adev->in_gpu_reset || in_interrupt())) > + goto failed_kiq_write; > + > + while (r < 1 && cnt++ < MAX_KIQ_REG_TRY) { > + msleep(MAX_KIQ_REG_BAILOUT_INTERVAL); > + r = amdgpu_fence_wait_polling(ring, seq, MAX_KIQ_REG_WAIT); > + } > + > + if (cnt > MAX_KIQ_REG_TRY) > + goto failed_kiq_write; > + > + return; > + > +failed_kiq_write: > + pr_err("failed to write reg:%x\n", reg); > } > > /**