[PATCH 09/26] drm/amdgpu: allow split of queues with kfd at queue granularity v3

felix.kuehling@xxxxxxx (Felix Kuehling) · Tue, 11 Apr 2017 17:36:23 -0400

Comments inline.

On 17-04-06 02:21 AM, Andres Rodriguez wrote:
> Previously the queue/pipe split with kfd operated with pipe
> granularity. This patch allows amdgpu to take ownership of an arbitrary
> set of queues.
>
> It also consolidates the last few magic numbers in the compute
> initialization process into mec_init.
>
> v2: support for gfx9
> v3: renamed AMDGPU_MAX_QUEUES to AMDGPU_MAX_COMPUTE_QUEUES
>
> Reviewed-by: Edward O'Callaghan <funfunctor at folklore1984.net>
> Acked-by: Christian KÃ¶nig <christian.koenig at amd.com>
> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  7 +++
>  drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c           | 83 +++++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c           | 81 +++++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c           | 84 +++++++++++++++++++++++--
>  drivers/gpu/drm/amd/include/kgd_kfd_interface.h |  1 +
>  5 files changed, 212 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 8187587..b765aca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
[snip]
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 3340012..55e61a9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -42,21 +42,20 @@
>  #include "gca/gfx_7_2_enum.h"
>  #include "gca/gfx_7_2_sh_mask.h"
>  
>  #include "gmc/gmc_7_0_d.h"
>  #include "gmc/gmc_7_0_sh_mask.h"
>  
>  #include "oss/oss_2_0_d.h"
>  #include "oss/oss_2_0_sh_mask.h"
>  
>  #define GFX7_NUM_GFX_RINGS     1
> -#define GFX7_NUM_COMPUTE_RINGS 8
>  #define GFX7_MEC_HPD_SIZE      2048
>  
>  
>  static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev);
>  static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev);
>  static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev);
>  
>  MODULE_FIRMWARE("radeon/bonaire_pfp.bin");
>  MODULE_FIRMWARE("radeon/bonaire_me.bin");
>  MODULE_FIRMWARE("radeon/bonaire_ce.bin");
> @@ -2817,47 +2816,79 @@ static void gfx_v7_0_mec_fini(struct amdgpu_device *adev)
>  		if (unlikely(r != 0))
>  			dev_warn(adev->dev, "(%d) reserve HPD EOP bo failed\n", r);
>  		amdgpu_bo_unpin(adev->gfx.mec.hpd_eop_obj);
>  		amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
>  
>  		amdgpu_bo_unref(&adev->gfx.mec.hpd_eop_obj);
>  		adev->gfx.mec.hpd_eop_obj = NULL;
>  	}
>  }
>  
> +static void gfx_v7_0_compute_queue_acquire(struct amdgpu_device *adev)
> +{
> +	int i, queue, pipe, mec;
> +
> +	/* policy for amdgpu compute queue ownership */
> +	for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
> +		queue = i % adev->gfx.mec.num_queue_per_pipe;
> +		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
> +			% adev->gfx.mec.num_pipe_per_mec;
> +		mec = (i / adev->gfx.mec.num_queue_per_pipe)
> +			/ adev->gfx.mec.num_pipe_per_mec;
> +
> +		/* we've run out of HW */
> +		if (mec > adev->gfx.mec.num_mec)

[FK] Shouldn't this be >=?

> +			break;
> +
> +		/* policy: amdgpu owns all queues in the first pipe */
> +		if (mec == 0 && pipe == 0)
> +			set_bit(i, adev->gfx.mec.queue_bitmap);
> +	}
> +
> +	/* update the number of active compute rings */
> +	adev->gfx.num_compute_rings =
> +		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
> +
> +	/* If you hit this case and edited the policy, you probably just
> +	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
> +	WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS);
> +	if (adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS)

[FK] You can write this shorter:

    if (WARN_ON(condition))
        statement;

> +		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
> +}
> +
[snip]
> @@ -4769,42 +4800,52 @@ static int gfx_v7_0_sw_init(void *handle)
>  		ring = &adev->gfx.gfx_ring[i];
>  		ring->ring_obj = NULL;
>  		sprintf(ring->name, "gfx");
>  		r = amdgpu_ring_init(adev, ring, 1024,
>  				     &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP);
>  		if (r)
>  			return r;
>  	}
>  
>  	/* set up the compute queues */
> -	for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> +	for (i = 0, ring_id = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; i++) {
>  		unsigned irq_type;
>  
> -		/* max 32 queues per MEC */
> -		if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
> -			DRM_ERROR("Too many (%d) compute rings!\n", i);
> -			break;
> -		}
> -		ring = &adev->gfx.compute_ring[i];
> +		if (!test_bit(i, adev->gfx.mec.queue_bitmap))
> +			continue;
> +
> +		ring = &adev->gfx.compute_ring[ring_id];
> +
> +		/* mec0 is me1 */
> +		ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
> +				/ adev->gfx.mec.num_pipe_per_mec)
> +				+ 1;
> +		ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
> +				% adev->gfx.mec.num_pipe_per_mec;
> +		ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
> +
>  		ring->ring_obj = NULL;
>  		ring->use_doorbell = true;
> -		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
> -		ring->me = 1; /* first MEC */
> -		ring->pipe = i / 8;
> -		ring->queue = i % 8;
> +		ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
>  		sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
> -		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
> +
> +		irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
> +			+ ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
> +			+ ring->pipe;
> +
>  		/* type-2 packets are deprecated on MEC, use type-3 instead */
>  		r = amdgpu_ring_init(adev, ring, 1024,
>  				     &adev->gfx.eop_irq, irq_type);
>  		if (r)
>  			return r;
> +
> +		ring_id++;

[FK] Is there any case where ring_id != i? I'm just wondering what's the
purpose of a separate variable for this. I'm guessing it's for a later
change in the queue allocation policy ...

>  	}
>  
>  	/* reserve GDS, GWS and OA resource for gfx */
>  	r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
>  				    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>  				    &adev->gds.gds_gfx_bo, NULL, NULL);
>  	if (r)
>  		return r;
>  
>  	r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index b34e597..c010b5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -45,21 +45,20 @@
>  #include "gca/gfx_8_0_enum.h"
>  #include "gca/gfx_8_0_sh_mask.h"
>  #include "gca/gfx_8_0_enum.h"
>  
>  #include "dce/dce_10_0_d.h"
>  #include "dce/dce_10_0_sh_mask.h"
>  
>  #include "smu/smu_7_1_3_d.h"
>  
>  #define GFX8_NUM_GFX_RINGS     1
> -#define GFX8_NUM_COMPUTE_RINGS 8
>  #define GFX8_MEC_HPD_SIZE 2048
>  
>  
>  #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001
>  #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001
>  #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002
>  #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003
>  
>  #define ARRAY_MODE(x)					((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT)
>  #define PIPE_CONFIG(x)					((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT)
> @@ -1410,47 +1409,82 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
>  }
>  static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
>  				   struct amdgpu_irq_src *irq)
>  {
>  	amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
>  	amdgpu_ring_fini(ring);
>  }
>  
>  #define GFX8_MEC_HPD_SIZE 2048
>  
> +static void gfx_v8_0_compute_queue_acquire(struct amdgpu_device *adev)
> +{
> +	int i, queue, pipe, mec;
> +
> +	/* policy for amdgpu compute queue ownership */
> +	for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
> +		queue = i % adev->gfx.mec.num_queue_per_pipe;
> +		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
> +			% adev->gfx.mec.num_pipe_per_mec;
> +		mec = (i / adev->gfx.mec.num_queue_per_pipe)
> +			/ adev->gfx.mec.num_pipe_per_mec;
> +
> +		/* we've run out of HW */
> +		if (mec > adev->gfx.mec.num_mec)

[FK] >=

> +			break;
> +
> +		/* policy: amdgpu owns all queues in the first pipe */
> +		if (mec == 0 && pipe == 0)
> +			set_bit(i, adev->gfx.mec.queue_bitmap);
> +	}
[snip]
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5dcc1b0..72eeda3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -31,21 +31,20 @@
>  #include "vega10/GC/gc_9_0_offset.h"
>  #include "vega10/GC/gc_9_0_sh_mask.h"
>  #include "vega10/vega10_enum.h"
>  #include "vega10/HDP/hdp_4_0_offset.h"
>  
>  #include "soc15_common.h"
>  #include "clearstate_gfx9.h"
>  #include "v9_structs.h"
>  
>  #define GFX9_NUM_GFX_RINGS     1
> -#define GFX9_NUM_COMPUTE_RINGS 8
>  #define GFX9_NUM_SE		4
>  #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000
>  
>  MODULE_FIRMWARE("amdgpu/vega10_ce.bin");
>  MODULE_FIRMWARE("amdgpu/vega10_pfp.bin");
>  MODULE_FIRMWARE("amdgpu/vega10_me.bin");
>  MODULE_FIRMWARE("amdgpu/vega10_mec.bin");
>  MODULE_FIRMWARE("amdgpu/vega10_mec2.bin");
>  MODULE_FIRMWARE("amdgpu/vega10_rlc.bin");
>  
> @@ -469,45 +468,79 @@ static void gfx_v9_0_mec_fini(struct amdgpu_device *adev)
>  		amdgpu_bo_unpin(adev->gfx.mec.mec_fw_obj);
>  		amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
>  
>  		amdgpu_bo_unref(&adev->gfx.mec.mec_fw_obj);
>  		adev->gfx.mec.mec_fw_obj = NULL;
>  	}
>  }
>  
>  #define MEC_HPD_SIZE 2048
>  
> +static void gfx_v9_0_compute_queue_acquire(struct amdgpu_device *adev)
> +{
> +	int i, queue, pipe, mec;
> +
> +	/* policy for amdgpu compute queue ownership */
> +	for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
> +		queue = i % adev->gfx.mec.num_queue_per_pipe;
> +		pipe = (i / adev->gfx.mec.num_queue_per_pipe)
> +			% adev->gfx.mec.num_pipe_per_mec;
> +		mec = (i / adev->gfx.mec.num_queue_per_pipe)
> +			/ adev->gfx.mec.num_pipe_per_mec;
> +
> +		/* we've run out of HW */
> +		if (mec > adev->gfx.mec.num_mec)
> +			break;

[FK] >=

> +
> +		/* policy: amdgpu owns all queues in the first pipe */
> +		if (mec == 0 && pipe == 0)
> +			set_bit(i, adev->gfx.mec.queue_bitmap);
> +	}
> +
> +	/* update the number of active compute rings */
> +	adev->gfx.num_compute_rings =
> +		bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
> +
> +	/* If you hit this case and edited the policy, you probably just
> +	 * need to increase AMDGPU_MAX_COMPUTE_RINGS */
> +	if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
> +		adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
> +}
> +
[snip]