On Thu, Apr 6, 2017 at 2:21 AM, Andres Rodriguez <andresx7 at gmail.com> wrote: > Programming CP_HQD_QUEUE_PRIORITY enables a queue to take priority over > other queues on the same pipe. Multiple queues on a pipe are timesliced > so this gives us full precedence over other queues. > > Programming CP_HQD_PIPE_PRIORITY changes the SPI_ARB_PRIORITY of the > wave as follows: > 0x2: CS_H > 0x1: CS_M > 0x0: CS_L > > The SPI block will then dispatch work according to the policy set by > SPI_ARB_PRIORITY. In the current policy CS_H is higher priority than > gfx. > > In order to prevent getting stuck in loops of CUs bouncing between GFX > and high priority compute and introducing further latency, we reserve > CUs 2+ for high priority compute on-demand. > > v2: fix srbm_select to ring->queue and use ring->funcs->type > v3: use AMD_SCHED_PRIORITY_* instead of AMDGPU_CTX_PRIORITY_* > v4: switch int to enum amd_sched_priority > v5: corresponding changes for srbm_lock > > Acked-by: Christian König <christian.koenig at amd.com> > Signed-off-by: Andres Rodriguez <andresx7 at gmail.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 96 +++++++++++++++++++++++++++++- > 3 files changed, 99 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index b9a4161..c56a884 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1044,20 +1044,23 @@ struct amdgpu_gfx { > uint32_t me_feature_version; > uint32_t ce_feature_version; > uint32_t pfp_feature_version; > uint32_t rlc_feature_version; > uint32_t mec_feature_version; > uint32_t mec2_feature_version; > struct amdgpu_ring gfx_ring[AMDGPU_MAX_GFX_RINGS]; > unsigned num_gfx_rings; > struct amdgpu_ring compute_ring[AMDGPU_MAX_COMPUTE_RINGS]; > unsigned num_compute_rings; > + spinlock_t cu_reserve_lock; > + uint32_t cu_reserve_pipe_mask; > + uint32_t cu_reserve_queue_mask[AMDGPU_MAX_COMPUTE_RINGS]; > struct amdgpu_irq_src eop_irq; > struct amdgpu_irq_src priv_reg_irq; > struct amdgpu_irq_src priv_inst_irq; > /* gfx status */ > uint32_t gfx_current_status; > /* ce ram size*/ > unsigned ce_ram_size; > struct amdgpu_cu_info cu_info; > const struct amdgpu_gfx_funcs *funcs; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 07f16b4..29b45bb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -1874,20 +1874,21 @@ int amdgpu_device_init(struct amdgpu_device *adev, > /* Registers mapping */ > /* TODO: block userspace mapping of io register */ > spin_lock_init(&adev->mmio_idx_lock); > spin_lock_init(&adev->smc_idx_lock); > spin_lock_init(&adev->pcie_idx_lock); > spin_lock_init(&adev->uvd_ctx_idx_lock); > spin_lock_init(&adev->didt_idx_lock); > spin_lock_init(&adev->gc_cac_idx_lock); > spin_lock_init(&adev->audio_endpt_idx_lock); > spin_lock_init(&adev->mm_stats.lock); > + spin_lock_init(&adev->gfx.cu_reserve_lock); > > INIT_LIST_HEAD(&adev->shadow_list); > mutex_init(&adev->shadow_list_lock); > > INIT_LIST_HEAD(&adev->gtt_list); > spin_lock_init(&adev->gtt_list_lock); > > INIT_LIST_HEAD(&adev->ring_lru_list); > spin_lock_init(&adev->ring_lru_list_lock); > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 3cfe3c0..f94d532 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -46,21 +46,24 @@ > #include "gca/gfx_8_0_sh_mask.h" > #include "gca/gfx_8_0_enum.h" > > #include "dce/dce_10_0_d.h" > #include "dce/dce_10_0_sh_mask.h" > > #include "smu/smu_7_1_3_d.h" > > #define GFX8_NUM_GFX_RINGS 1 > #define GFX8_MEC_HPD_SIZE 2048 > - > +#define GFX8_CU_RESERVE_RESOURCES 0x45888 > +#define GFX8_CU_NUM 8 > +#define GFX8_UNRESERVED_CU_NUM 2 > +#define GFX8_CU_RESERVE_PIPE_SHIFT 7 > > #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001 > #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001 > #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002 > #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003 > > #define ARRAY_MODE(x) ((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT) > #define PIPE_CONFIG(x) ((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT) > #define TILE_SPLIT(x) ((x) << GB_TILE_MODE0__TILE_SPLIT__SHIFT) > #define MICRO_TILE_MODE_NEW(x) ((x) << GB_TILE_MODE0__MICRO_TILE_MODE_NEW__SHIFT) > @@ -6710,20 +6713,110 @@ static u64 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring) > > static void gfx_v8_0_ring_set_wptr_compute(struct amdgpu_ring *ring) > { > struct amdgpu_device *adev = ring->adev; > > /* XXX check if swapping is necessary on BE */ > adev->wb.wb[ring->wptr_offs] = lower_32_bits(ring->wptr); > WDOORBELL32(ring->doorbell_index, lower_32_bits(ring->wptr)); > } > > +static void gfx_v8_0_cu_reserve(struct amdgpu_device *adev, > + struct amdgpu_ring *ring, bool acquire) > +{ > + int i, resources; > + int tmp = 0, queue_mask = 0, type_mask = 0; > + int reserve_res_reg, reserve_en_reg; > + > + /* gfx_v8_0_cu_reserve only supports compute path */ > + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) > + return; > + > + spin_lock(&adev->gfx.cu_reserve_lock); > + if (acquire) { > + adev->gfx.cu_reserve_pipe_mask |= (1 << ring->pipe); > + adev->gfx.cu_reserve_queue_mask[ring->pipe] |= (1 << ring->queue); > + } else { > + adev->gfx.cu_reserve_pipe_mask &= ~(1 << ring->pipe); > + adev->gfx.cu_reserve_queue_mask[ring->pipe] &= ~(1 << ring->queue); > + } > + > + /* compute pipe 0 starts at GFX8_CU_RESERVE_PIPE_SHIFT */ > + type_mask = (adev->gfx.cu_reserve_pipe_mask << GFX8_CU_RESERVE_PIPE_SHIFT); > + > + /* HW only has one register for queue mask, so we collaspse them */ > + for (i = 0; i < AMDGPU_MAX_COMPUTE_RINGS; i++) > + queue_mask |= adev->gfx.cu_reserve_queue_mask[i]; > + > + /* leave the first CUs for general processing */ > + for (i = GFX8_UNRESERVED_CU_NUM; i < GFX8_CU_NUM; i++) { > + reserve_res_reg = mmSPI_RESOURCE_RESERVE_CU_0 + i; > + reserve_en_reg = mmSPI_RESOURCE_RESERVE_EN_CU_0 + i; > + > + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, > + TYPE_MASK, type_mask); > + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, > + QUEUE_MASK, queue_mask); > + if (queue_mask) { > + resources = GFX8_CU_RESERVE_RESOURCES; > + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, > + EN, 1); > + } else { > + resources = 0; > + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, > + EN, 0); > + } > + /* Commit */ > + WREG32(reserve_res_reg, resources); > + WREG32(reserve_en_reg, tmp); > + } Should these be programmed via the KIQ rather than MMIO? I think there may even be a special packet for this. John? Felix? > + > + spin_unlock(&adev->gfx.cu_reserve_lock); > +} > + > +static void gfx_v8_0_set_spi_priority(struct amdgpu_device *adev, > + struct amdgpu_ring *ring, > + enum amd_sched_priority priority) > +{ > + spin_lock(&adev->srbm_lock); > + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); > + > + switch (priority) { > + case AMD_SCHED_PRIORITY_NORMAL: > + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x0); > + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0x0); > + break; > + case AMD_SCHED_PRIORITY_HIGH: > + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x2); > + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0xf); > + break; > + default: > + WARN(1, "Attempt to set invalid SPI priority:%d for ring:%d\n", > + priority, ring->idx); > + break; > + } I wonder if it would be better to program these via the KIQ rather than MMIO. > + > + vi_srbm_select(adev, 0, 0, 0, 0); > + spin_unlock(&adev->srbm_lock); > +} > +static void gfx_v8_0_ring_set_priority_compute(struct amdgpu_ring *ring, > + enum amd_sched_priority priority) > +{ > + struct amdgpu_device *adev = ring->adev; > + > + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) > + return; > + > + gfx_v8_0_set_spi_priority(adev, ring, priority); > + gfx_v8_0_cu_reserve(adev, ring, priority == AMD_SCHED_PRIORITY_HIGH); > +} > + > static void gfx_v8_0_ring_emit_fence_compute(struct amdgpu_ring *ring, > u64 addr, u64 seq, > unsigned flags) > { > bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT; > bool int_sel = flags & AMDGPU_FENCE_FLAG_INT; > > /* RELEASE_MEM - flush caches, send int */ > amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 5)); > amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN | > @@ -7140,20 +7233,21 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { > .emit_fence = gfx_v8_0_ring_emit_fence_compute, > .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, > .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush, > .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch, > .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush, > .emit_hdp_invalidate = gfx_v8_0_ring_emit_hdp_invalidate, > .test_ring = gfx_v8_0_ring_test_ring, > .test_ib = gfx_v8_0_ring_test_ib, > .insert_nop = amdgpu_ring_insert_nop, > .pad_ib = amdgpu_ring_generic_pad_ib, > + .set_priority = gfx_v8_0_ring_set_priority_compute, > }; > > static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { > .type = AMDGPU_RING_TYPE_KIQ, > .align_mask = 0xff, > .nop = PACKET3(PACKET3_NOP, 0x3FFF), > .support_64bit_ptrs = false, > .get_rptr = gfx_v8_0_ring_get_rptr, > .get_wptr = gfx_v8_0_ring_get_wptr_compute, > .set_wptr = gfx_v8_0_ring_set_wptr_compute, > -- > 2.9.3 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx