On 22.01.2017 04:55, Monk Liu wrote:
> Previously we always inserted 128 NOPs after vm_flush, which could
> push the DMA frame size above 256 dw and get it automatically
> aligned to 512 dw.
>
> Now we calculate how many DWs were already inserted after vm_flush
> and pad with the rest up to 128 dws before emit_ib. That way we only
> take 256 dw per submit.
>
> v2:
> drop insert_nops in vm_flush
> re-calculate the estimated frame size for the gfx8 ring
> v3:
> calculate the gap between vm_flush and the IB in cntx_cntl
> via a member field of the ring structure
> v4:
> set last_vm_flush_pos even for the case of no vm flush.
> v5:
> move the pipeline sync out of VM flush and always insert it.

Didn't we already find that always doing the pipeline sync has a
negative impact on performance as well?

I wonder if there isn't a better way of handling this. How about using
a separate frame for the VM flush as well?

If you don't have time to take a deeper look and this really fixes a
bug for you, feel free to add my Acked-by and commit it. We can sort
out the performance problems later on.

Regards,
Christian.

>
> Change-Id: I670adf8c5e7bdfe5f4fd191c24f62a5d7a170d3a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c   |  8 +++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   | 31 -------------------------------
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 19 +++++++++++++++----
>  4 files changed, 23 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index ff39858..a1cd429 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -154,6 +154,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>
>  	alloc_size = ring->funcs->emit_frame_size + num_ibs *
>  		ring->funcs->emit_ib_size;
> +	need_ctx_switch = ring->current_ctx != fence_ctx;
>
>  	r = amdgpu_ring_alloc(ring, alloc_size);
>  	if (r) {
> @@ -164,7 +165,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>  	if (ring->funcs->init_cond_exec)
>  		patch_offset = amdgpu_ring_init_cond_exec(ring);
>
> -	need_ctx_switch = ring->current_ctx != fence_ctx;
> +	/* pipeline sync prevents the following DE from changing SH registers
> +	 * before the previous submission is completely done (EOP signaled)
> +	 */
> +	if (ring->funcs->emit_pipeline_sync)
> +		amdgpu_ring_emit_pipeline_sync(ring);
> +
>  	if (vm) {
>  		r = amdgpu_vm_flush(ring, job);
>  		if (r) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index e0f8061..4c546fe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -173,6 +173,7 @@ struct amdgpu_ring {
>  #if defined(CONFIG_DEBUG_FS)
>  	struct dentry *ent;
>  #endif
> +	u32 last_vm_flush_pos;
>  };
>
>  int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index d05546e..4c1102f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -343,32 +343,6 @@ int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>  	return r;
>  }
>
> -static bool amdgpu_vm_ring_has_compute_vm_bug(struct amdgpu_ring *ring)
> -{
> -	struct amdgpu_device *adev = ring->adev;
> -	const struct amdgpu_ip_block *ip_block;
> -
> -	if (ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
> -		/* only compute rings */
> -		return false;
> -
> -	ip_block = amdgpu_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
> -	if (!ip_block)
> -		return false;
> -
> -	if (ip_block->version->major <= 7) {
> -		/* gfx7 has no workaround */
> -		return true;
> -	} else if (ip_block->version->major == 8) {
> -		if (adev->gfx.mec_fw_version >= 673)
> -			/* gfx8 is fixed in MEC firmware 673 */
> -			return false;
> -		else
> -			return true;
> -	}
> -	return false;
> -}
> -
>  /**
>   * amdgpu_vm_flush - hardware flush the vm
>   *
> @@ -391,11 +365,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
>  		id->oa_size != job->oa_size);
>  	int r;
>
> -	if (ring->funcs->emit_pipeline_sync && (
> -	    job->vm_needs_flush || gds_switch_needed ||
> -	    amdgpu_vm_ring_has_compute_vm_bug(ring)))
> -		amdgpu_ring_emit_pipeline_sync(ring);
> -
>  	if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
>  	    amdgpu_vm_is_gpu_reset(adev, id))) {
>  		struct fence *fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index d5719eb2..8421da2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6578,7 +6578,6 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>  			  DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
>  	amdgpu_ring_write(ring, lower_32_bits(seq));
>  	amdgpu_ring_write(ring, upper_32_bits(seq));
> -
>  }
>
>  static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> @@ -6595,6 +6594,8 @@ static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>  	amdgpu_ring_write(ring, upper_32_bits(addr) & 0xffffffff);
>  	amdgpu_ring_write(ring, seq);
>  	amdgpu_ring_write(ring, 0xffffffff);
> +	if (usepfp)
> +		ring->last_vm_flush_pos = ring->wptr;
>  	amdgpu_ring_write(ring, 4); /* poll interval */
>  }
>
> @@ -6641,9 +6642,9 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>  	if (usepfp) {
>  		/* sync PFP to ME, otherwise we might get invalid PFP reads */
>  		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
> +		/* record the last position of vm_flush for padding NOPs before the CE IB */
> +		ring->last_vm_flush_pos = ring->wptr;
>  		amdgpu_ring_write(ring, 0x0);
> -		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
> -		amdgpu_ring_insert_nop(ring, 128);
>  	}
>  }
>
> @@ -6749,6 +6750,16 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>  	if (amdgpu_sriov_vf(ring->adev))
>  		gfx_v8_0_ring_emit_de_meta_init(ring,
>  			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
> +
> +	/* TODO: can we use one simple formula to cover the logic below?
> +	 */
> +	if (ring->wptr > ring->last_vm_flush_pos) {
> +		if (ring->wptr - ring->last_vm_flush_pos < 128)
> +			amdgpu_ring_insert_nop(ring, 128 - (ring->wptr - ring->last_vm_flush_pos));
> +	} else {
> +		if (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr < 128)
> +			amdgpu_ring_insert_nop(ring,
> +				128 - (ring->ptr_mask - ring->last_vm_flush_pos + ring->wptr));
> +	}
>  }
>
>  static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
> @@ -7035,7 +7046,7 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
>  		5 + /* gfx_v8_0_ring_emit_hdp_invalidate */
>  		6 + 6 + 6 +/* gfx_v8_0_ring_emit_fence_gfx x3 for user fence, vm fence */
>  		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
> -		128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
> +		128 - 63 + 19 + /* gfx_v8_0_ring_emit_vm_flush, plus padding NOPs prior to the IB */
>  		2 + /* gfx_v8_ring_emit_sb */
>  		3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
>  	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
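
Regarding the TODO in gfx_v8_ring_emit_cntxcntl: if wptr and
last_vm_flush_pos both wrap at ring->ptr_mask + 1, the two branches
should collapse into a single masked subtraction. A minimal sketch,
untested; note that the wrap-around branch in the patch subtracts from
ptr_mask rather than ptr_mask + 1, so the exact off-by-one would need
to be checked against how wptr actually wraps:

	/* dwords emitted since the last VM flush, modulo the ring size */
	u32 gap = (ring->wptr - ring->last_vm_flush_pos) & ring->ptr_mask;

	/* pad so that at least 128 dw separate vm_flush from the CE IB */
	if (gap < 128)
		amdgpu_ring_insert_nop(ring, 128 - gap);

That would keep a single code path whether or not the write pointer has
wrapped since the flush.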
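
As for the separate frame idea: the rough shape would be to emit the VM
flush in its own ring frame instead of padding inside the submission
frame. Purely illustrative and untested, reusing the existing helpers:

	/* hypothetical: give the VM flush its own, separately aligned
	 * frame so that the padding no longer pushes the whole
	 * submission above 256 dw
	 */
	r = amdgpu_ring_alloc(ring, 128);
	if (r)
		return r;
	amdgpu_ring_emit_vm_flush(ring, job->vm_id, job->vm_pd_addr);
	amdgpu_ring_commit(ring);

Each alloc/commit pair is then aligned independently of the frame that
carries the IBs.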