On 18.01.2017 at 12:42, Monk Liu wrote:
> previously we always inserted 128 NOPs behind vm_flush, which
> may lead to a DMA frame size above 256 DW that gets automatically
> aligned to 512 DW.
>
> now we calculate how many DWs are already inserted after vm_flush
> and make up for the rest to pad up to 128 DWs before emit_ib.
> that way we only take 256 DW per submit.
>
> v2:
> drop insert_nops in vm_flush
> re-calculate the estimated frame size for the gfx8 ring
> v3:
> calculate the gap between vm_flush and IB in cntx_cntl
> based on a member field of the ring structure
> v4:
> set last_vm_flush_pos even for the case of no vm flush.
>
> Change-Id: I0a1845de07c0b86ac1b77ac1a007e893144144fd
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  7 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 16 ++++++++++++----
>  3 files changed, 20 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index c813cbe..332969f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -173,6 +173,7 @@ struct amdgpu_ring {
>  #if defined(CONFIG_DEBUG_FS)
>  	struct dentry *ent;
>  #endif
> +	u32 last_vm_flush_pos;
>  };
>
>  int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d05546e..53fc5e0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -396,6 +396,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
>  	    amdgpu_vm_ring_has_compute_vm_bug(ring)))
>  		amdgpu_ring_emit_pipeline_sync(ring);
>
> +	/* even when there is no vm flush this frame, we still need to mark
> +	 * down the position of the tail of hdp_flush, because we still
> +	 * need to make sure there are 128 DWs between the last SWITCH_BUFFER
> +	 * and the emit_ib of this frame. otherwise a VM fault still occurs
> +	 * on the constant engine.
> +	 */
> +	ring->last_vm_flush_pos = ring->wptr;
>  	if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
>  	    amdgpu_vm_is_gpu_reset(adev, id))) {
>  		struct fence *fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 5f37313..dde4177 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6572,7 +6572,6 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>  			 DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>  	amdgpu_ring_write(ring, lower_32_bits(seq));
>  	amdgpu_ring_write(ring, upper_32_bits(seq));
> -
>  }
>
>  static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> @@ -6636,9 +6635,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>  		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>  		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
>  		amdgpu_ring_write(ring, 0x0);
> -		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
> -		amdgpu_ring_insert_nop(ring, 128);
>  	}
> +	ring->last_vm_flush_pos = ring->wptr;
>  }
>
>  static u32 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring)
> @@ -6743,6 +6741,15 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  	if (amdgpu_sriov_vf(ring->adev))
>  		gfx_v8_0_ring_emit_de_meta_init(ring,
>  			(flags & AMDGPU_VM_DOMAIN) ?
>  			AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
> +
> +	/* We need to pad some NOPs before emit_ib to prevent the CE from
> +	 * running ahead of vm_flush, which may trigger a VM fault. */
> +	if (ring->wptr > ring->last_vm_flush_pos) /* no wptr wrapping to RB head */
> +		amdgpu_ring_insert_nop(ring, 128 - (ring->wptr - ring->last_vm_flush_pos));

This can easily result in a negative number, can't it?

> +	else
> +		if (ring->ptr_mask + 1 - ring->last_vm_flush_pos + ring->wptr < 128)
> +			amdgpu_ring_insert_nop(ring,
> +				128 - (ring->ptr_mask + 1 - ring->last_vm_flush_pos + ring->wptr));

I think it would be cleaner to calculate the number of NOPs needed first for
both cases, and then check that the number isn't negative before inserting
(see the sketch after the quoted patch).

>  }
>
>  static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> @@ -7018,7 +7025,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>  		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
>  		128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
>  		2 + /* gfx_v8_ring_emit_sb */
> -		3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
> +		3 + 4 + 29 - /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */

Please put the - on the next line. That is easy to miss, and I asked myself
for a moment why you want to add 20 here.

Apart from that the patch looks good to me,
Christian.

> +		20 - 7 - 6 - 3 - 4 - 29, /* no need to count gds/hdp_flush/vm_flush fence/cntx_cntl/vgt_flush/meta-data anymore */
>  	.emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
>  	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
>  	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
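
For illustration, a minimal sketch of the rework suggested above (untested;
it assumes wptr and last_vm_flush_pos are already masked ring positions, so
that ring->ptr_mask + 1 is the ring size in DWs; the "dist" variable name is
made up and not from the patch):

	/* distance from the vm_flush tail to wptr, with a possible ring
	 * wrap folded in by the mask */
	u32 dist = (ring->wptr - ring->last_vm_flush_pos) & ring->ptr_mask;

	/* only pad when we are still short of the required 128 DWs; this
	 * also avoids a negative NOP count in the non-wrapping case */
	if (dist < 128)
		amdgpu_ring_insert_nop(ring, 128 - dist);

When dist is already 128 or more, nothing is inserted, which is exactly the
case where the current code would compute a negative count.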