Programming CP_HQD_QUEUE_PRIORITY enables a queue to take priority over other queues on the same pipe. Multiple queues on a pipe are timesliced so this gives us full precedence over other queues. Programming CP_HQD_PIPE_PRIORITY changes the SPI_ARB_PRIORITY of the wave as follows: 0x2: CS_H 0x1: CS_M 0x0: CS_L The SPI block will then dispatch work according to the policy set by SPI_ARB_PRIORITY. In the current policy CS_H is higher priority than gfx. In order to prevent getting stuck in loops of CUs bouncing between GFX and high priority compute and introducing further latency, we reserve CUs 2+ for high priority compute on-demand. v2: fix srbm_select to ring->queue and use ring->funcs->type v3: use AMD_SCHED_PRIORITY_* instead of AMDGPU_CTX_PRIORITY_* v4: switch int to enum amd_sched_priority v5: corresponding changes for srbm_lock Acked-by: Christian König <christian.koenig at amd.com> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 95 ++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 68350ca..4e81a8e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1057,40 +1057,43 @@ struct amdgpu_gfx { const struct firmware *pfp_fw; /* PFP firmware */ uint32_t pfp_fw_version; const struct firmware *ce_fw; /* CE firmware */ uint32_t ce_fw_version; const struct firmware *rlc_fw; /* RLC firmware */ uint32_t rlc_fw_version; const struct firmware *mec_fw; /* MEC firmware */ uint32_t mec_fw_version; const struct firmware *mec2_fw; /* MEC2 firmware */ uint32_t mec2_fw_version; uint32_t me_feature_version; uint32_t ce_feature_version; uint32_t pfp_feature_version; uint32_t rlc_feature_version; uint32_t mec_feature_version; uint32_t mec2_feature_version; struct amdgpu_ring gfx_ring[AMDGPU_MAX_GFX_RINGS]; unsigned num_gfx_rings; struct amdgpu_ring compute_ring[AMDGPU_MAX_COMPUTE_RINGS]; unsigned num_compute_rings; + spinlock_t cu_reserve_lock; + uint32_t cu_reserve_pipe_mask; + uint32_t cu_reserve_queue_mask[AMDGPU_MAX_COMPUTE_RINGS]; struct amdgpu_irq_src eop_irq; struct amdgpu_irq_src priv_reg_irq; struct amdgpu_irq_src priv_inst_irq; /* gfx status */ uint32_t gfx_current_status; /* ce ram size*/ unsigned ce_ram_size; struct amdgpu_cu_info cu_info; const struct amdgpu_gfx_funcs *funcs; /* reset mask */ uint32_t grbm_soft_reset; uint32_t srbm_soft_reset; bool in_reset; /* s3/s4 mask */ bool in_suspend; /* NGG */ struct amdgpu_ngg ngg; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 674256a..971303d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1858,40 +1858,41 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->firmware.mutex); mutex_init(&adev->pm.mutex); mutex_init(&adev->gfx.gpu_clock_mutex); spin_lock_init(&adev->srbm_lock); mutex_init(&adev->grbm_idx_mutex); mutex_init(&adev->mn_lock); hash_init(adev->mn_hash); amdgpu_check_arguments(adev); /* Registers mapping */ /* TODO: block userspace mapping of io register */ spin_lock_init(&adev->mmio_idx_lock); spin_lock_init(&adev->smc_idx_lock); spin_lock_init(&adev->pcie_idx_lock); spin_lock_init(&adev->uvd_ctx_idx_lock); spin_lock_init(&adev->didt_idx_lock); spin_lock_init(&adev->gc_cac_idx_lock); spin_lock_init(&adev->audio_endpt_idx_lock); spin_lock_init(&adev->mm_stats.lock); + spin_lock_init(&adev->gfx.cu_reserve_lock); INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock); INIT_LIST_HEAD(&adev->gtt_list); spin_lock_init(&adev->gtt_list_lock); INIT_LIST_HEAD(&adev->ring_lru_list); spin_lock_init(&adev->ring_lru_list_lock); if (adev->asic_type >= CHIP_BONAIRE) { adev->rmmio_base = pci_resource_start(adev->pdev, 5); adev->rmmio_size = pci_resource_len(adev->pdev, 5); } else { adev->rmmio_base = pci_resource_start(adev->pdev, 2); adev->rmmio_size = pci_resource_len(adev->pdev, 2); } adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); if (adev->rmmio == NULL) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 6b9c7f8..0149b81 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -36,40 +36,44 @@ #include "gmc/gmc_8_2_sh_mask.h" #include "oss/oss_3_0_d.h" #include "oss/oss_3_0_sh_mask.h" #include "bif/bif_5_0_d.h" #include "bif/bif_5_0_sh_mask.h" #include "gca/gfx_8_0_d.h" #include "gca/gfx_8_0_enum.h" #include "gca/gfx_8_0_sh_mask.h" #include "gca/gfx_8_0_enum.h" #include "dce/dce_10_0_d.h" #include "dce/dce_10_0_sh_mask.h" #include "smu/smu_7_1_3_d.h" #define GFX8_NUM_GFX_RINGS 1 #define GFX8_MEC_HPD_SIZE 2048 +#define GFX8_CU_RESERVE_RESOURCES 0x45888 +#define GFX8_CU_NUM 8 +#define GFX8_UNRESERVED_CU_NUM 2 +#define GFX8_CU_RESERVE_PIPE_SHIFT 7 #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001 #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001 #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002 #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003 #define ARRAY_MODE(x) ((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT) #define PIPE_CONFIG(x) ((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT) #define TILE_SPLIT(x) ((x) << GB_TILE_MODE0__TILE_SPLIT__SHIFT) #define MICRO_TILE_MODE_NEW(x) ((x) << GB_TILE_MODE0__MICRO_TILE_MODE_NEW__SHIFT) #define SAMPLE_SPLIT(x) ((x) << GB_TILE_MODE0__SAMPLE_SPLIT__SHIFT) #define BANK_WIDTH(x) ((x) << GB_MACROTILE_MODE0__BANK_WIDTH__SHIFT) #define BANK_HEIGHT(x) ((x) << GB_MACROTILE_MODE0__BANK_HEIGHT__SHIFT) #define MACRO_TILE_ASPECT(x) ((x) << GB_MACROTILE_MODE0__MACRO_TILE_ASPECT__SHIFT) #define NUM_BANKS(x) ((x) << GB_MACROTILE_MODE0__NUM_BANKS__SHIFT) #define RLC_CGTT_MGCG_OVERRIDE__CPF_MASK 0x00000001L #define RLC_CGTT_MGCG_OVERRIDE__RLC_MASK 0x00000002L #define RLC_CGTT_MGCG_OVERRIDE__MGCG_MASK 0x00000004L #define RLC_CGTT_MGCG_OVERRIDE__CGCG_MASK 0x00000008L @@ -6627,40 +6631,130 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring, /* sync PFP to ME, otherwise we might get invalid PFP reads */ amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0)); amdgpu_ring_write(ring, 0x0); } } static u64 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring) { return ring->adev->wb.wb[ring->wptr_offs]; } static void gfx_v8_0_ring_set_wptr_compute(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; /* XXX check if swapping is necessary on BE */ adev->wb.wb[ring->wptr_offs] = lower_32_bits(ring->wptr); WDOORBELL32(ring->doorbell_index, lower_32_bits(ring->wptr)); } +static void gfx_v8_0_cu_reserve(struct amdgpu_device *adev, + struct amdgpu_ring *ring, bool acquire) +{ + int i, resources; + int tmp = 0, queue_mask = 0, type_mask = 0; + int reserve_res_reg, reserve_en_reg; + + /* gfx_v8_0_cu_reserve only supports compute path */ + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) + return; + + spin_lock(&adev->gfx.cu_reserve_lock); + if (acquire) { + adev->gfx.cu_reserve_pipe_mask |= (1 << ring->pipe); + adev->gfx.cu_reserve_queue_mask[ring->pipe] |= (1 << ring->queue); + } else { + adev->gfx.cu_reserve_pipe_mask &= ~(1 << ring->pipe); + adev->gfx.cu_reserve_queue_mask[ring->pipe] &= ~(1 << ring->queue); + } + + /* compute pipe 0 starts at GFX8_CU_RESERVE_PIPE_SHIFT */ + type_mask = (adev->gfx.cu_reserve_pipe_mask << GFX8_CU_RESERVE_PIPE_SHIFT); + + /* HW only has one register for queue mask, so we collaspse them */ + for (i = 0; i < AMDGPU_MAX_COMPUTE_RINGS; i++) + queue_mask |= adev->gfx.cu_reserve_queue_mask[i]; + + /* leave the first CUs for general processing */ + for (i = GFX8_UNRESERVED_CU_NUM; i < GFX8_CU_NUM; i++) { + reserve_res_reg = mmSPI_RESOURCE_RESERVE_CU_0 + i; + reserve_en_reg = mmSPI_RESOURCE_RESERVE_EN_CU_0 + i; + + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, + TYPE_MASK, type_mask); + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, + QUEUE_MASK, queue_mask); + if (queue_mask) { + resources = GFX8_CU_RESERVE_RESOURCES; + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, + EN, 1); + } else { + resources = 0; + tmp = REG_SET_FIELD(tmp, SPI_RESOURCE_RESERVE_EN_CU_0, + EN, 0); + } + /* Commit */ + WREG32(reserve_res_reg, resources); + WREG32(reserve_en_reg, tmp); + } + + spin_unlock(&adev->gfx.cu_reserve_lock); +} + +static void gfx_v8_0_set_spi_priority(struct amdgpu_device *adev, + struct amdgpu_ring *ring, + enum amd_sched_priority priority) +{ + spin_lock(&adev->srbm_lock); + vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0); + + switch (priority) { + case AMD_SCHED_PRIORITY_NORMAL: + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x0); + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0x0); + break; + case AMD_SCHED_PRIORITY_HIGH: + WREG32(mmCP_HQD_PIPE_PRIORITY, 0x2); + WREG32(mmCP_HQD_QUEUE_PRIORITY, 0xf); + break; + default: + WARN(1, "Attempt to set invalid SPI priority:%d for ring:%d\n", + priority, ring->idx); + break; + } + + vi_srbm_select(adev, 0, 0, 0, 0); + spin_unlock(&adev->srbm_lock); +} +static void gfx_v8_0_ring_set_priority_compute(struct amdgpu_ring *ring, + enum amd_sched_priority priority) +{ + struct amdgpu_device *adev = ring->adev; + + if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) + return; + + gfx_v8_0_set_spi_priority(adev, ring, priority); + gfx_v8_0_cu_reserve(adev, ring, priority == AMD_SCHED_PRIORITY_HIGH); +} + static void gfx_v8_0_ring_emit_fence_compute(struct amdgpu_ring *ring, u64 addr, u64 seq, unsigned flags) { bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT; bool int_sel = flags & AMDGPU_FENCE_FLAG_INT; /* RELEASE_MEM - flush caches, send int */ amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 5)); amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN | EOP_TC_WB_ACTION_EN | EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5))); amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0)); amdgpu_ring_write(ring, addr & 0xfffffffc); amdgpu_ring_write(ring, upper_32_bits(addr)); amdgpu_ring_write(ring, lower_32_bits(seq)); amdgpu_ring_write(ring, upper_32_bits(seq)); } @@ -7057,40 +7151,41 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_compute = { .set_wptr = gfx_v8_0_ring_set_wptr_compute, .emit_frame_size = 20 + /* gfx_v8_0_ring_emit_gds_switch */ 7 + /* gfx_v8_0_ring_emit_hdp_flush */ 5 + /* gfx_v8_0_ring_emit_hdp_invalidate */ 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 17 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3 for user fence, vm fence */ .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib = gfx_v8_0_ring_emit_ib_compute, .emit_fence = gfx_v8_0_ring_emit_fence_compute, .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync, .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush, .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch, .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush, .emit_hdp_invalidate = gfx_v8_0_ring_emit_hdp_invalidate, .test_ring = gfx_v8_0_ring_test_ring, .test_ib = gfx_v8_0_ring_test_ib, .insert_nop = amdgpu_ring_insert_nop, .pad_ib = amdgpu_ring_generic_pad_ib, + .set_priority = gfx_v8_0_ring_set_priority_compute, }; static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_kiq = { .type = AMDGPU_RING_TYPE_KIQ, .align_mask = 0xff, .nop = PACKET3(PACKET3_NOP, 0x3FFF), .support_64bit_ptrs = false, .get_rptr = gfx_v8_0_ring_get_rptr, .get_wptr = gfx_v8_0_ring_get_wptr_compute, .set_wptr = gfx_v8_0_ring_set_wptr_compute, .emit_frame_size = 20 + /* gfx_v8_0_ring_emit_gds_switch */ 7 + /* gfx_v8_0_ring_emit_hdp_flush */ 5 + /* gfx_v8_0_ring_emit_hdp_invalidate */ 7 + /* gfx_v8_0_ring_emit_pipeline_sync */ 17 + /* gfx_v8_0_ring_emit_vm_flush */ 7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for user fence, vm fence */ .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */ .emit_ib = gfx_v8_0_ring_emit_ib_compute, .emit_fence = gfx_v8_0_ring_emit_fence_kiq, -- 2.9.3