On 22.01.2017 04:55, Monk Liu wrote:
> Previously we always inserted 128 NOPs after vm_flush, which could
> push the DMA frame size above 256 dw and get it automatically
> aligned to 512 dw.
>
> Now we calculate how many DWs were already inserted after vm_flush
> and pad with the rest up to 128 dws before emit_ib. That way we only
> take 256 dw per submit.
>
> v2:
> drop insert_nops in vm_flush
> re-calculate the estimated frame size for the gfx8 ring
> v3:
> calculate the gap between vm_flush and the IB in cntx_cntl
> via a member field of the ring structure
> v4:
> set last_vm_flush_pos even for the case of no vm flush.
> v5:
> move the pipeline sync out of VM flush and always insert it.

Didn't we already find that always doing the pipeline sync has a
negative impact on performance as well?

I wonder if there isn't a better way of handling this. How about using
a separate frame for the VM flush as well?

If you don't have time to take a deeper look and this really fixes a
bug for you, feel free to add my Acked-by and commit it. We can sort
out the performance problems later on.

Regards,
Christian.

>
> Change-Id: I670adf8c5e7bdfe5f4fd191c24f62a5d7a170d3a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   |  8 +++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 31 -------------------------------
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 19 +++++++++++++++----
>  4 files changed, 23 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index ff39858..a1cd429 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -154,6 +154,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>
>  	alloc_size = ring->funcs->emit_frame_size + num_ibs *
>  		ring->funcs->emit_ib_size;
> +	need_ctx_switch = ring->current_ctx != fence_ctx;
>
>  	r = amdgpu_ring_alloc(ring, alloc_size);
>  	if (r) {
> @@ -164,7 +165,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  	if (ring->funcs->init_cond_exec)
>  		patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> -	need_ctx_switch = ring->current_ctx != fence_ctx;
> +	/* pipeline sync prevents the following DE from changing SH registers
> +	 * before the previous submission is completely done (EOP signaled)
> +	 */
> +	if (ring->funcs->emit_pipeline_sync)
> +		amdgpu_ring_emit_pipeline_sync(ring);
> +
>  	if (vm) {
>  		r = amdgpu_vm_flush(ring, job);
>  		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index e0f8061..4c546fe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -173,6 +173,7 @@ struct amdgpu_ring {
>  #if defined(CONFIG_DEBUG_FS)
>  	struct dentry *ent;
>  #endif
> +	u32 last_vm_flush_pos;
>  };
>
>  int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d05546e..4c1102f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -343,32 +343,6 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>  	return r;
>  }
>
> -static bool amdgpu_vm_ring_has_compute_vm_bug(struct amdgpu_ring *ring)
> -{
> -	struct amdgpu_device *adev = ring->adev;
> -	const struct amdgpu_ip_block *ip_block;
> -
> -	if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
> -		/* only compute rings */
> -		return false;
> -
> -	ip_block = amdgpu_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
> -	if (!ip_block)
> -		return false;
> -
> -	if (ip_block->version->major <= 7) {
> -		/* gfx7 has no workaround */
> -		return true;
> -	} else if (ip_block->version->major == 8) {
> -		if (adev->gfx.mec_fw_version >= 673)
> -			/* gfx8 is fixed in MEC firmware 673 */
> -			return false;
> -		else
> -			return true;
> -	}
> -	return false;
> -}
> -
>  /**
>   * amdgpu_vm_flush - hardware flush the vm
>   *
> @@ -391,11 +365,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
>  		id->oa_size != job->oa_size);
>  	int r;
>
> -	if (ring->funcs->emit_pipeline_sync && (
> -	    job->vm_needs_flush || gds_switch_needed ||
> -	    amdgpu_vm_ring_has_compute_vm_bug(ring)))
> -		amdgpu_ring_emit_pipeline_sync(ring);
> -
>  	if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
>  	    amdgpu_vm_is_gpu_reset(adev, id))) {
>  		struct fence *fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index d5719eb2..8421da2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6578,7 +6578,6 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>  			  DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>  	amdgpu_ring_write(ring, lower_32_bits(seq));
>  	amdgpu_ring_write(ring, upper_32_bits(seq));
> -
>  }
>
>  static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> @@ -6595,6 +6594,8 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
>  	amdgpu_ring_write(ring, seq);
>  	amdgpu_ring_write(ring, 0xffffffff);
> +	if (usepfp)
> +		ring->last_vm_flush_pos = ring->wptr;
>  	amdgpu_ring_write(ring, 4); /* poll interval */
>  }
>
> @@ -6641,9 +6642,9 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>  	if (usepfp) {
>  		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>  		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> +		/* record the last position of vm_flush for padding NOPs before the CE IB */
> +		ring->last_vm_flush_pos = ring->wptr;
>  		amdgpu_ring_write(ring, 0x0);
> -		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
> -		amdgpu_ring_insert_nop(ring, 128);
>  	}
>  }
>
> @@ -6749,6 +6750,16 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  	if (amdgpu_sriov_vf(ring->adev))
>  		gfx_v8_0_ring_emit_de_meta_init(ring,
>  			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
> +
> +	/* TODO: can we use one simple formula to cover the logic below?
> +	 */
> +	if (ring->wptr > ring->last_vm_flush_pos) {
> +		if (ring->wptr - ring->last_vm_flush_pos < 128)
> +			amdgpu_ring_insert_nop(ring, 128 - (ring->wptr - ring->last_vm_flush_pos));
> +	} else {
> +		if (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr < 128)
> +			amdgpu_ring_insert_nop(ring,
> +				128 - (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr));
> +	}
>  }
>
>  static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> @@ -7035,7 +7046,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>  		5 + /* gfx_v8_0_ring_emit_hdp_invalidate */
>  		6 + 6 + 6 +/* gfx_v8_0_ring_emit_fence_gfx x3 for user fence, vm fence */
>  		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> -		128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
> +		128 - 63 + 19 + /* gfx_v8_0_ring_emit_vm_flush, plus padding NOPs prior to the IB */
>  		2 + /* gfx_v8_ring_emit_sb */
>  		3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
>  	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
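
Regarding the TODO in gfx_v8_ring_emit_cntxcntl: if wptr and
last_vm_flush_pos both wrap at ring->ptr_mask + 1, the two branches
should collapse into a single masked subtraction. A minimal sketch,
untested; note that the wrap-around branch in the patch subtracts from
ptr_mask rather than ptr_mask + 1, so the exact off-by-one would need
to be checked against how wptr actually wraps:

	/* dwords emitted since the last VM flush, modulo the ring size */
	u32 gap = (ring->wptr - ring->last_vm_flush_pos) & ring->ptr_mask;

	/* pad so that at least 128 dw separate vm_flush from the CE IB */
	if (gap < 128)
		amdgpu_ring_insert_nop(ring, 128 - gap);

That would keep a single code path whether or not the write pointer has
wrapped since the flush.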
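
As for the separate frame idea: the rough shape would be to emit the VM
flush in its own ring frame instead of padding inside the submission
frame. Purely illustrative and untested, reusing the existing helpers:

	/* hypothetical: give the VM flush its own, separately aligned
	 * frame so that the padding no longer pushes the whole
	 * submission above 256 dw
	 */
	r = amdgpu_ring_alloc(ring, 128);
	if (r)
		return r;
	amdgpu_ring_emit_vm_flush(ring, job->vm_id, job->vm_pd_addr);
	amdgpu_ring_commit(ring);

Each alloc/commit pair is then aligned independently of the frame that
carries the IBs.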