Use an LRU policy to map usermode rings to HW compute queues. Most compute clients use one queue, and usually the first queue available. This results in poor pipe/queue work distribution when multiple compute apps are running. In most cases pipe 0 queue 0 is the only queue that gets used. In order to better distribute work across multiple HW queues, we adopt a policy to map the usermode ring ids to the LRU HW queue. This fixes a large majority of multi-app compute workloads sharing the same HW queue, even though 7 other queues are available. Signed-off-by: Andres Rodriguez <andresx7 at gmail.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c | 32 ++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 57 +++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 4 ++ 5 files changed, 97 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 67b33aa..e30c47e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1510,20 +1510,23 @@ struct amdgpu_device { struct kfd_dev *kfd; struct amdgpu_virt virt; /* link all shadow bo */ struct list_head shadow_list; struct mutex shadow_list_lock; /* link all gtt */ spinlock_t gtt_list_lock; struct list_head gtt_list; + /* keep an lru list of rings by HW IP */ + struct list_head ring_lru_list; + struct mutex ring_lru_list_lock; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) { return container_of(bdev, struct amdgpu_device, mman.bdev); } bool amdgpu_device_is_px(struct drm_device *dev); int amdgpu_device_init(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 944ba0d..1fb1303 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1708,20 
+1708,23 @@ int amdgpu_device_init(struct amdgpu_device *adev, spin_lock_init(&adev->gc_cac_idx_lock); spin_lock_init(&adev->audio_endpt_idx_lock); spin_lock_init(&adev->mm_stats.lock); INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock); INIT_LIST_HEAD(&adev->gtt_list); spin_lock_init(&adev->gtt_list_lock); + INIT_LIST_HEAD(&adev->ring_lru_list); + mutex_init(&adev->ring_lru_list_lock); + if (adev->asic_type >= CHIP_BONAIRE) { adev->rmmio_base = pci_resource_start(adev->pdev, 5); adev->rmmio_size = pci_resource_len(adev->pdev, 5); } else { adev->rmmio_base = pci_resource_start(adev->pdev, 2); adev->rmmio_size = pci_resource_len(adev->pdev, 2); } adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); if (adev->rmmio == NULL) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c index 3918bdb..e4c6ac3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_queue_mgr.c @@ -90,32 +90,60 @@ static int amdgpu_identity_map(struct amdgpu_device *adev, return -EINVAL; } return update_cached_map(mapper, ring, *out_ring); } static struct amdgpu_queue_mapper_funcs identity_mapper = { .map = amdgpu_identity_map }; +static int amdgpu_lru_map(struct amdgpu_device *adev, + struct amdgpu_queue_mapper *mapper, + int user_ring, + struct amdgpu_ring **out_ring) +{ + int r; + + r = amdgpu_ring_lru_get(adev, mapper->hw_ip, out_ring); + if (r) + return r; + + return update_cached_map(mapper, user_ring, *out_ring); +} + +static struct amdgpu_queue_mapper_funcs lru_mapper = { + .map = amdgpu_lru_map +}; + int amdgpu_queue_mgr_init(struct amdgpu_device *adev, struct amdgpu_queue_mgr *mgr) { int i; if (!adev || !mgr) return -EINVAL; memset(mgr, 0, sizeof(*mgr)); - for (i = 0; i < AMDGPU_MAX_IP_NUM; ++i) - amdgpu_queue_mapper_init(&mgr->mapper[i], i, &identity_mapper); + for (i = 0; i < AMDGPU_MAX_IP_NUM; ++i) { + switch (i) { + case AMDGPU_HW_IP_COMPUTE: + 
amdgpu_queue_mapper_init(&mgr->mapper[i], i, + &lru_mapper); + break; + default: + amdgpu_queue_mapper_init(&mgr->mapper[i], i, + &identity_mapper); + break; + } + } return 0; } int amdgpu_queue_mgr_fini(struct amdgpu_device *adev, struct amdgpu_queue_mgr *mgr) { return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index a04f07d..80cb051 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -173,20 +173,22 @@ void amdgpu_ring_commit(struct amdgpu_ring *ring) count = ring->funcs->align_mask + 1 - (ring->wptr & ring->funcs->align_mask); count %= ring->funcs->align_mask + 1; ring->funcs->insert_nop(ring, count); mb(); amdgpu_ring_set_wptr(ring); if (ring->funcs->end_use) ring->funcs->end_use(ring); + + amdgpu_ring_lru_touch(ring->adev, ring); } /** * amdgpu_ring_undo - reset the wptr * * @ring: amdgpu_ring structure holding ring information * * Reset the driver's copy of the wptr (all asics). */ void amdgpu_ring_undo(struct amdgpu_ring *ring) @@ -273,20 +275,22 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, (void **)&ring->ring); if (r) { dev_err(adev->dev, "(%d) ring create failed\n", r); return r; } memset((void *)ring->ring, 0, ring->ring_size); } ring->ptr_mask = (ring->ring_size / 4) - 1; ring->max_dw = max_dw; ring->hw_ip = hw_ip; + INIT_LIST_HEAD(&ring->lru_list); + amdgpu_ring_lru_touch(adev, ring); if (amdgpu_debugfs_ring_init(adev, ring)) { DRM_ERROR("Failed to register debugfs file for rings !\n"); } return 0; } /** * amdgpu_ring_fini - tear down the driver ring struct. 
* @@ -306,20 +310,73 @@ void amdgpu_ring_fini(struct amdgpu_ring *ring) amdgpu_bo_free_kernel(&ring->ring_obj, &ring->gpu_addr, (void **)&ring->ring); amdgpu_debugfs_ring_fini(ring); ring->adev->rings[ring->idx] = NULL; } +/** + * amdgpu_ring_lru_get - get the least recently used ring for a HW IP block + * + * @adev: amdgpu_device pointer + * @hw_ip: HW IP enum + * @ring: output ring + * + * Retrieve the amdgpu_ring structure for the least recently used ring of + * a specific IP block (all asics). + * Returns 0 on success, error on failure. + */ +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int hw_ip, + struct amdgpu_ring **ring) +{ + struct amdgpu_ring *entry; + + /* List is sorted in LRU order, find first entry corresponding + * to the desired HW IP */ + *ring = NULL; + mutex_lock(&adev->ring_lru_list_lock); + list_for_each_entry(entry, &adev->ring_lru_list, lru_list) { + if (entry->hw_ip == hw_ip) { + *ring = entry; + break; + } + } + mutex_unlock(&adev->ring_lru_list_lock); + + if (!*ring) { + DRM_ERROR("Ring LRU contains no entries for hw ip:%d\n", hw_ip); + return -EINVAL; + } + + amdgpu_ring_lru_touch(adev, entry); + return 0; +} + +/** + * amdgpu_ring_lru_touch - mark a ring as recently being used + * + * @adev: amdgpu_device pointer + * @ring: ring to touch + * + * Move @ring to the tail of the lru list + */ +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring) +{ + /* list_move_tail handles the case where ring isn't part of the list */ + mutex_lock(&adev->ring_lru_list_lock); + list_move_tail(&ring->lru_list, &adev->ring_lru_list); + mutex_unlock(&adev->ring_lru_list_lock); +} + /* * Debugfs info */ #if defined(CONFIG_DEBUG_FS) /* Layout of file is 12 bytes consisting of * - rptr * - wptr * - driver's copy of wptr * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 731c422..ecdd87c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -137,20 +137,21 @@ struct amdgpu_ring_funcs { void (*emit_cntxcntl) (struct amdgpu_ring *ring, uint32_t flags); void (*emit_rreg)(struct amdgpu_ring *ring, uint32_t reg); void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val); }; struct amdgpu_ring { struct amdgpu_device *adev; const struct amdgpu_ring_funcs *funcs; struct amdgpu_fence_driver fence_drv; struct amd_gpu_scheduler sched; + struct list_head lru_list; struct amdgpu_bo *ring_obj; volatile uint32_t *ring; unsigned rptr_offs; unsigned wptr; unsigned wptr_old; unsigned ring_size; unsigned max_dw; int count_dw; uint64_t gpu_addr; @@ -180,12 +181,15 @@ int amdgpu_ring_is_valid_index(struct amdgpu_device *adev, int hw_ip, int ring); int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw); void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count); void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib); void amdgpu_ring_commit(struct amdgpu_ring *ring); void amdgpu_ring_undo(struct amdgpu_ring *ring); int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, int hw_ip, unsigned ring_size, struct amdgpu_irq_src *irq_src, unsigned irq_type); void amdgpu_ring_fini(struct amdgpu_ring *ring); +int amdgpu_ring_lru_get(struct amdgpu_device *adev, int hw_ip, + struct amdgpu_ring **ring); +void amdgpu_ring_lru_touch(struct amdgpu_device *adev, struct amdgpu_ring *ring); #endif -- 2.7.4