Make sure KFD gets a turn when serializing access to the GC IP. Currently non-KFD jobs can starve KFD if they submit often enough. This patch prevents that by stalling non-KFD if its time period has elapsed. v2: fix units v3: check enablement properly Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 53 ++++++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 + 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ab5524b7a259..2f381848c849 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -119,7 +119,7 @@ #define MAX_GPU_INSTANCE 64 -#define GFX_SLICE_PERIOD msecs_to_jiffies(250) +#define GFX_SLICE_PERIOD_MS 250 struct amdgpu_gpu_instance { struct amdgpu_device *adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e96984c53e72..b8cc4b146bdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1752,7 +1752,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, if (adev->gfx.kfd_sch_req_count[idx] == 0 && adev->gfx.kfd_sch_inactive[idx]) { schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, - GFX_SLICE_PERIOD); + msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx])); } } else { if (adev->gfx.kfd_sch_req_count[idx] == 0) { @@ -1807,8 +1807,9 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]); } if (fences) { + /* we've already had our timeslice, so let's wrap this up */ schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, - GFX_SLICE_PERIOD); + msecs_to_jiffies(1)); } else { /* Tell KFD to resume the runqueue */ if (adev->kfd.init_complete) { @@ -1821,6 +1822,51 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) mutex_unlock(&adev->enforce_isolation_mutex); } +static void +amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev, + u32 idx) +{ + unsigned long cjiffies; + bool wait = false; + + mutex_lock(&adev->enforce_isolation_mutex); + if (adev->enforce_isolation[idx]) { + /* set the initial values if nothing is set */ + if (!adev->gfx.enforce_isolation_jiffies[idx]) { + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } + /* Make sure KFD gets a chance to run */ + if (amdgpu_amdkfd_compute_active(adev, idx)) { + cjiffies = jiffies; + if (time_after(cjiffies, adev->gfx.enforce_isolation_jiffies[idx])) { + cjiffies -= adev->gfx.enforce_isolation_jiffies[idx]; + if ((jiffies_to_msecs(cjiffies) >= GFX_SLICE_PERIOD_MS)) { + /* if our time is up, let KGD work drain before scheduling more */ + wait = true; + /* reset the timer period */ + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } else { + /* set the timer period to what's left in our time slice */ + adev->gfx.enforce_isolation_time[idx] = + GFX_SLICE_PERIOD_MS - jiffies_to_msecs(cjiffies); + } + } else { + /* if jiffies wrap around we will just wait a little longer */ + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + } + } else { + /* if there is no KFD work, then set the full slice period */ + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } + } + mutex_unlock(&adev->enforce_isolation_mutex); + + if (wait) + msleep(GFX_SLICE_PERIOD_MS); +} + void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; @@ -1837,6 +1883,9 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) if (idx >= MAX_XCP) return; + /* Don't submit more work until KFD has had some time */ + amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx); + mutex_lock(&adev->enforce_isolation_mutex); if (adev->enforce_isolation[idx]) { if (adev->kfd.init_complete) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f710178a21bc..af9dbd760fee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -472,6 +472,8 @@ struct amdgpu_gfx { struct mutex kfd_sch_mutex; u64 kfd_sch_req_count[MAX_XCP]; bool kfd_sch_inactive[MAX_XCP]; + unsigned long enforce_isolation_jiffies[MAX_XCP]; + unsigned long enforce_isolation_time[MAX_XCP]; }; struct amdgpu_gfx_ras_reg_entry { -- 2.47.0