[PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB

deathsimple@xxxxxxxxxxx (Christian König) · Wed, 18 Jan 2017 10:28:15 +0100

Am 18.01.2017 um 06:55 schrieb Monk Liu:
> previously we always insert 128nops behind vm_flush, which
> may lead to DAMframe size above 256 dw and automatially aligned
> to 512 dw.
>
> now we calculate how many DWs already inserted after vm_flush
> and make up for the reset to pad up to 128dws before emit_ib.
>
> that way we only take 256 dw per submit.

First question why do we want to limit the dw per submit to 256? Some 
limitation for SRIOV?

Then for the implementation, please don't add all that counting to the 
different functions. Instead save the current position in the ring in 
emit_vm_flush and then calculate in emit_cntxcntl() how many dw we still 
need to add to have at least 128. E.g. call the variable something like 
last_vm_flush_pos.

That makes the code way more robust towards any changes.

Regards,
Christian.

>
> Change-Id: Iac198e16f35b071476ba7bd48ab338223f6fe650
> Signed-off-by: Monk Liu <Monk.Liu at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 24 ++++++++++++++++++++++--
>   2 files changed, 23 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index c813cbe..1dbe600 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -173,6 +173,7 @@ struct amdgpu_ring {
>   #if defined(CONFIG_DEBUG_FS)
>   	struct dentry *ent;
>   #endif
> +	u32 dws_between_vm_ib;
>   };
>   
>   int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 5f37313..76b1315 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -5670,6 +5670,8 @@ static void gfx_v8_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>   	amdgpu_ring_write(ring, amdgpu_gds_reg_offset[vmid].oa);
>   	amdgpu_ring_write(ring, 0);
>   	amdgpu_ring_write(ring, (1 << (oa_size + oa_base)) - (1 << oa_base));
> +
> +	ring->dws_between_vm_ib += 20;
>   }
>   
>   static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address)
> @@ -6489,6 +6491,8 @@ static void gfx_v8_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, ref_and_mask);
>   	amdgpu_ring_write(ring, ref_and_mask);
>   	amdgpu_ring_write(ring, 0x20); /* poll interval */
> +
> +	ring->dws_between_vm_ib += 7;
>   }
>   
>   static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
> @@ -6500,6 +6504,8 @@ static void gfx_v8_0_ring_emit_vgt_flush(struct amdgpu_ring *ring)
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_EVENT_WRITE, 0));
>   	amdgpu_ring_write(ring, EVENT_TYPE(VGT_FLUSH) |
>   		EVENT_INDEX(0));
> +
> +	ring->dws_between_vm_ib += 4;
>   }
>   
>   
> @@ -6573,6 +6579,7 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
>   	amdgpu_ring_write(ring, lower_32_bits(seq));
>   	amdgpu_ring_write(ring, upper_32_bits(seq));
>   
> +	ring->dws_between_vm_ib += 6;
>   }
>   
>   static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
> @@ -6639,6 +6646,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
>   		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
>   		amdgpu_ring_insert_nop(ring, 128);
>   	}
> +
> +	ring->dws_between_vm_ib = 0; /* clear before recalculate */
>   }
>   
>   static u32 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring)
> @@ -6711,9 +6720,11 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>   {
>   	uint32_t dw2 = 0;
>   
> -	if (amdgpu_sriov_vf(ring->adev))
> +	if (amdgpu_sriov_vf(ring->adev)) {
>   		gfx_v8_0_ring_emit_ce_meta_init(ring,
>   			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
> +		ring->dws_between_vm_ib += 8;
> +	}
>   
>   	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
>   	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
> @@ -6739,10 +6750,19 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
>   	amdgpu_ring_write(ring, dw2);
>   	amdgpu_ring_write(ring, 0);
> +	ring->dws_between_vm_ib += 3;
>   
> -	if (amdgpu_sriov_vf(ring->adev))
> +	if (amdgpu_sriov_vf(ring->adev)) {
>   		gfx_v8_0_ring_emit_de_meta_init(ring,
>   			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
> +		ring->dws_between_vm_ib += 21;
> +	}
> +
> +	/* emit_de_meta_init should be the last package right before emi_ib,
> +	 * and we need to pad some NOPs before emit_ib to prevent CE run ahead of
> +	 * vm_flush, which may trigger VM fault.
> +	 */
> +	amdgpu_ring_insert_nop(ring, 128 - ring->dws_between_vm_ib);
>   }
>   
>   static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)