[PATCH 09/25] drm/amdgpu: allow split of queues with kfd at queue granularity v2

andresx7@xxxxxxxxx (Andres Rodriguez) · Wed, 5 Apr 2017 00:09:21 -0400

On Tue, Apr 4, 2017 at 11:16 PM, Alex Deucher <alexdeucher at gmail.com> wrote:
> On Tue, Apr 4, 2017 at 6:05 PM, Andres Rodriguez <andresx7 at gmail.com> wrote:
>> Previously the queue/pipe split with kfd operated with pipe
>> granularity. This patch allows amdgpu to take ownership of an arbitrary
>> set of queues.
>>
>> It also consolidates the last few magic numbers in the compute
>> initialization process into mec_init.
>>
>> v2: support for gfx9
>>
>> Reviewed-by: Edward O'Callaghan <funfunctor at folklore1984.net>
>> Acked-by: Christian König <christian.koenig at amd.com>
>> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h             |  7 +++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c           | 83 +++++++++++++++++-------
>>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c           | 81 +++++++++++++++++++-----
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c           | 84 +++++++++++++++++++++++--
>>  drivers/gpu/drm/amd/include/kgd_kfd_interface.h |  1 +
>>  5 files changed, 212 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index b92f6cb..e2d8243 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -39,20 +39,22 @@
>>  #include <ttm/ttm_bo_api.h>
>>  #include <ttm/ttm_bo_driver.h>
>>  #include <ttm/ttm_placement.h>
>>  #include <ttm/ttm_module.h>
>>  #include <ttm/ttm_execbuf_util.h>
>>
>>  #include <drm/drmP.h>
>>  #include <drm/drm_gem.h>
>>  #include <drm/amdgpu_drm.h>
>>
>> +#include <kgd_kfd_interface.h>
>> +
>>  #include "amd_shared.h"
>>  #include "amdgpu_mode.h"
>>  #include "amdgpu_ih.h"
>>  #include "amdgpu_irq.h"
>>  #include "amdgpu_ucode.h"
>>  #include "amdgpu_ttm.h"
>>  #include "amdgpu_psp.h"
>>  #include "amdgpu_gds.h"
>>  #include "amdgpu_sync.h"
>>  #include "amdgpu_ring.h"
>> @@ -899,29 +901,34 @@ struct amdgpu_rlc {
>>         u32 reg_list_format_start;
>>         u32 reg_list_format_separate_start;
>>         u32 starting_offsets_start;
>>         u32 reg_list_format_size_bytes;
>>         u32 reg_list_size_bytes;
>>
>>         u32 *register_list_format;
>>         u32 *register_restore;
>>  };
>>
>> +#define AMDGPU_MAX_QUEUES KGD_MAX_QUEUES
>
> Can we rename this to AMDGPU_MAX_COMPUTE_QUEUES or better yet, unify
> it with AMDGPU_MAX_COMPUTE_RINGS?  I don't like having two defines for
> the same thing.
>

I'm okay with the rename here.

Since their meanings are slightly different, unifying them would have
some undesired consequences. AMDGPU_MAX_QUEUES is the highest number of
queues an ASIC may contain (with some wiggle room). On the other hand,
AMDGPU_MAX_COMPUTE_RINGS is the maximum number of queues we expect
amdgpu to acquire. If we unified them, some arrays would increase in
size unnecessarily, e.g. mqd_backup[8 + 1] would become
mqd_backup[128 + 1].

You are pointing at a real problem here, though: the names don't
convey that difference.

>> +
>>  struct amdgpu_mec {
>>         struct amdgpu_bo        *hpd_eop_obj;
>>         u64                     hpd_eop_gpu_addr;
>>         struct amdgpu_bo        *mec_fw_obj;
>>         u64                     mec_fw_gpu_addr;
>>         u32 num_mec;
>>         u32 num_pipe_per_mec;
>>         u32 num_queue_per_pipe;
>>         void                    *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS + 1];
>> +
>> +       /* These are the resources for which amdgpu takes ownership */
>> +       DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_QUEUES);
>>  };
>>
>>  struct amdgpu_kiq {
>>         u64                     eop_gpu_addr;
>>         struct amdgpu_bo        *eop_obj;
>>         struct amdgpu_ring      ring;
>>         struct amdgpu_irq_src   irq;
>>  };
>>
>>  /*
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> index 3340012..0586f1c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
>> @@ -42,21 +42,20 @@
>>  #include "gca/gfx_7_2_enum.h"
>>  #include "gca/gfx_7_2_sh_mask.h"
>>
>>  #include "gmc/gmc_7_0_d.h"
>>  #include "gmc/gmc_7_0_sh_mask.h"
>>
>>  #include "oss/oss_2_0_d.h"
>>  #include "oss/oss_2_0_sh_mask.h"
>>
>>  #define GFX7_NUM_GFX_RINGS     1
>> -#define GFX7_NUM_COMPUTE_RINGS 8
>
> Can we keep the local compute queue NUM defines?  This way we can
> change the number of queues per gfx version without affecting
> everything.
>
> Alex
>

These defines don't affect how many queues amdgpu will grab. Instead,
this can be configured per gfx version in
gfx_vX_Y_compute_queue_acquire(). If a maximum is desired, it can be
configured within the policy.

Andres

>>  #define GFX7_MEC_HPD_SIZE      2048
>>
>>
>>  static void gfx_v7_0_set_ring_funcs(struct amdgpu_device *adev);
>>  static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev);
>>  static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev);
>>
>>  MODULE_FIRMWARE("radeon/bonaire_pfp.bin");
>>  MODULE_FIRMWARE("radeon/bonaire_me.bin");
>>  MODULE_FIRMWARE("radeon/bonaire_ce.bin");
>> @@ -2817,47 +2816,79 @@ static void gfx_v7_0_mec_fini(struct amdgpu_device *adev)
>>                 if (unlikely(r != 0))
>>                         dev_warn(adev->dev, "(%d) reserve HPD EOP bo failed\n", r);
>>                 amdgpu_bo_unpin(adev->gfx.mec.hpd_eop_obj);
>>                 amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
>>
>>                 amdgpu_bo_unref(&adev->gfx.mec.hpd_eop_obj);
>>                 adev->gfx.mec.hpd_eop_obj = NULL;
>>         }
>>  }
>>
>> +static void gfx_v7_0_compute_queue_acquire(struct amdgpu_device *adev)
>> +{
>> +       int i, queue, pipe, mec;
>> +
>> +       /* policy for amdgpu compute queue ownership */
>> +       for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
>> +               queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +               pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       % adev->gfx.mec.num_pipe_per_mec;
>> +               mec = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       / adev->gfx.mec.num_pipe_per_mec;
>> +
>> +               /* we've run out of HW */
>> +               if (mec > adev->gfx.mec.num_mec)
>> +                       break;
>> +
>> +               /* policy: amdgpu owns all queues in the first pipe */
>> +               if (mec == 0 && pipe == 0)
>> +                       set_bit(i, adev->gfx.mec.queue_bitmap);
>> +       }
>> +
>> +       /* update the number of active compute rings */
>> +       adev->gfx.num_compute_rings =
>> +               bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>> +       /* If you hit this case and edited the policy, you probably just
>> +        * need to increase AMDGPU_MAX_COMPUTE_RINGS */
>> +       WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS);
>> +       if (adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS)
>> +               adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>> +}
>> +
>>  static int gfx_v7_0_mec_init(struct amdgpu_device *adev)
>>  {
>>         int r;
>>         u32 *hpd;
>>         size_t mec_hpd_size;
>>
>> -       /*
>> -        * KV:    2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total
>> -        * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total
>> -        * Nonetheless, we assign only 1 pipe because all other pipes will
>> -        * be handled by KFD
>> -        */
>> +       bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>>         switch (adev->asic_type) {
>>         case CHIP_KAVERI:
>>                 adev->gfx.mec.num_mec = 2;
>>                 break;
>>         case CHIP_BONAIRE:
>>         case CHIP_HAWAII:
>>         case CHIP_KABINI:
>>         case CHIP_MULLINS:
>>         default:
>>                 adev->gfx.mec.num_mec = 1;
>>                 break;
>>         }
>>         adev->gfx.mec.num_pipe_per_mec = 4;
>>         adev->gfx.mec.num_queue_per_pipe = 8;
>>
>> +       /* take ownership of the relevant compute queues */
>> +       gfx_v7_0_compute_queue_acquire(adev);
>> +
>> +       /* allocate space for ALL pipes (even the ones we don't own) */
>>         mec_hpd_size = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe_per_mec
>>                 * GFX7_MEC_HPD_SIZE * 2;
>>         if (adev->gfx.mec.hpd_eop_obj == NULL) {
>>                 r = amdgpu_bo_create(adev,
>>                                      mec_hpd_size,
>>                                      PAGE_SIZE, true,
>>                                      AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
>>                                      &adev->gfx.mec.hpd_eop_obj);
>>                 if (r) {
>>                         dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
>> @@ -4522,21 +4553,21 @@ static const struct amdgpu_gfx_funcs gfx_v7_0_gfx_funcs = {
>>  static const struct amdgpu_rlc_funcs gfx_v7_0_rlc_funcs = {
>>         .enter_safe_mode = gfx_v7_0_enter_rlc_safe_mode,
>>         .exit_safe_mode = gfx_v7_0_exit_rlc_safe_mode
>>  };
>>
>>  static int gfx_v7_0_early_init(void *handle)
>>  {
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>         adev->gfx.num_gfx_rings = GFX7_NUM_GFX_RINGS;
>> -       adev->gfx.num_compute_rings = GFX7_NUM_COMPUTE_RINGS;
>> +       adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>>         adev->gfx.funcs = &gfx_v7_0_gfx_funcs;
>>         adev->gfx.rlc.funcs = &gfx_v7_0_rlc_funcs;
>>         gfx_v7_0_set_ring_funcs(adev);
>>         gfx_v7_0_set_irq_funcs(adev);
>>         gfx_v7_0_set_gds_init(adev);
>>
>>         return 0;
>>  }
>>
>>  static int gfx_v7_0_late_init(void *handle)
>> @@ -4718,21 +4749,21 @@ static void gfx_v7_0_gpu_early_init(struct amdgpu_device *adev)
>>                 gb_addr_config |= (2 << GB_ADDR_CONFIG__ROW_SIZE__SHIFT);
>>                 break;
>>         }
>>         adev->gfx.config.gb_addr_config = gb_addr_config;
>>  }
>>
>>  static int gfx_v7_0_sw_init(void *handle)
>>  {
>>         struct amdgpu_ring *ring;
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>> -       int i, r;
>> +       int i, r, ring_id;
>>
>>         /* EOP Event */
>>         r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 181, &adev->gfx.eop_irq);
>>         if (r)
>>                 return r;
>>
>>         /* Privileged reg */
>>         r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 184,
>>                               &adev->gfx.priv_reg_irq);
>>         if (r)
>> @@ -4769,42 +4800,52 @@ static int gfx_v7_0_sw_init(void *handle)
>>                 ring = &adev->gfx.gfx_ring[i];
>>                 ring->ring_obj = NULL;
>>                 sprintf(ring->name, "gfx");
>>                 r = amdgpu_ring_init(adev, ring, 1024,
>>                                      &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP);
>>                 if (r)
>>                         return r;
>>         }
>>
>>         /* set up the compute queues */
>> -       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>> +       for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
>>                 unsigned irq_type;
>>
>> -               /* max 32 queues per MEC */
>> -               if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
>> -                       DRM_ERROR("Too many (%d) compute rings!\n", i);
>> -                       break;
>> -               }
>> -               ring = &adev->gfx.compute_ring[i];
>> +               if (!test_bit(i, adev->gfx.mec.queue_bitmap))
>> +                       continue;
>> +
>> +               ring = &adev->gfx.compute_ring[ring_id];
>> +
>> +               /* mec0 is me1 */
>> +               ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
>> +                               / adev->gfx.mec.num_pipe_per_mec)
>> +                               + 1;
>> +               ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                               % adev->gfx.mec.num_pipe_per_mec;
>> +               ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +
>>                 ring->ring_obj = NULL;
>>                 ring->use_doorbell = true;
>> -               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
>> -               ring->me = 1; /* first MEC */
>> -               ring->pipe = i / 8;
>> -               ring->queue = i % 8;
>> +               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
>>                 sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
>> -               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
>> +
>> +               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
>> +                       + ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
>> +                       + ring->pipe;
>> +
>>                 /* type-2 packets are deprecated on MEC, use type-3 instead */
>>                 r = amdgpu_ring_init(adev, ring, 1024,
>>                                      &adev->gfx.eop_irq, irq_type);
>>                 if (r)
>>                         return r;
>> +
>> +               ring_id++;
>>         }
>>
>>         /* reserve GDS, GWS and OA resource for gfx */
>>         r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
>>                                     PAGE_SIZE, AMDGPU_GEM_DOMAIN_GDS,
>>                                     &adev->gds.gds_gfx_bo, NULL, NULL);
>>         if (r)
>>                 return r;
>>
>>         r = amdgpu_bo_create_kernel(adev, adev->gds.gws.gfx_partition_size,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> index 86cdcb8..177992c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
>> @@ -45,21 +45,20 @@
>>  #include "gca/gfx_8_0_enum.h"
>>  #include "gca/gfx_8_0_sh_mask.h"
>>  #include "gca/gfx_8_0_enum.h"
>>
>>  #include "dce/dce_10_0_d.h"
>>  #include "dce/dce_10_0_sh_mask.h"
>>
>>  #include "smu/smu_7_1_3_d.h"
>>
>>  #define GFX8_NUM_GFX_RINGS     1
>> -#define GFX8_NUM_COMPUTE_RINGS 8
>>  #define GFX8_MEC_HPD_SIZE 2048
>>
>>
>>  #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001
>>  #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001
>>  #define POLARIS11_GB_ADDR_CONFIG_GOLDEN 0x22011002
>>  #define TONGA_GB_ADDR_CONFIG_GOLDEN 0x22011003
>>
>>  #define ARRAY_MODE(x)                                  ((x) << GB_TILE_MODE0__ARRAY_MODE__SHIFT)
>>  #define PIPE_CONFIG(x)                                 ((x) << GB_TILE_MODE0__PIPE_CONFIG__SHIFT)
>> @@ -1410,47 +1409,82 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
>>  }
>>  static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
>>                                    struct amdgpu_irq_src *irq)
>>  {
>>         amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
>>         amdgpu_ring_fini(ring);
>>  }
>>
>>  #define GFX8_MEC_HPD_SIZE 2048
>>
>> +static void gfx_v8_0_compute_queue_acquire(struct amdgpu_device *adev)
>> +{
>> +       int i, queue, pipe, mec;
>> +
>> +       /* policy for amdgpu compute queue ownership */
>> +       for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
>> +               queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +               pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       % adev->gfx.mec.num_pipe_per_mec;
>> +               mec = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       / adev->gfx.mec.num_pipe_per_mec;
>> +
>> +               /* we've run out of HW */
>> +               if (mec > adev->gfx.mec.num_mec)
>> +                       break;
>> +
>> +               /* policy: amdgpu owns all queues in the first pipe */
>> +               if (mec == 0 && pipe == 0)
>> +                       set_bit(i, adev->gfx.mec.queue_bitmap);
>> +       }
>> +
>> +       /* update the number of active compute rings */
>> +       adev->gfx.num_compute_rings =
>> +               bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>> +       /* If you hit this case and edited the policy, you probably just
>> +        * need to increase AMDGPU_MAX_COMPUTE_RINGS */
>> +       if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
>> +               adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>> +}
>> +
>>  static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
>>  {
>>         int r;
>>         u32 *hpd;
>>         size_t mec_hpd_size;
>>
>> +       bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>>         switch (adev->asic_type) {
>>         case CHIP_FIJI:
>>         case CHIP_TONGA:
>>         case CHIP_POLARIS11:
>>         case CHIP_POLARIS12:
>>         case CHIP_POLARIS10:
>>         case CHIP_CARRIZO:
>>                 adev->gfx.mec.num_mec = 2;
>>                 break;
>>         case CHIP_TOPAZ:
>>         case CHIP_STONEY:
>>         default:
>>                 adev->gfx.mec.num_mec = 1;
>>                 break;
>>         }
>>
>>         adev->gfx.mec.num_pipe_per_mec = 4;
>>         adev->gfx.mec.num_queue_per_pipe = 8;
>>
>> -       /* only 1 pipe of the first MEC is owned by amdgpu */
>> -       mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * GFX8_MEC_HPD_SIZE;
>> +       /* take ownership of the relevant compute queues */
>> +       gfx_v8_0_compute_queue_acquire(adev);
>> +
>> +       mec_hpd_size = adev->gfx.num_compute_rings * GFX8_MEC_HPD_SIZE;
>>
>>         if (adev->gfx.mec.hpd_eop_obj == NULL) {
>>                 r = amdgpu_bo_create(adev,
>>                                      mec_hpd_size,
>>                                      PAGE_SIZE, true,
>>                                      AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
>>                                      &adev->gfx.mec.hpd_eop_obj);
>>                 if (r) {
>>                         dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
>>                         return r;
>> @@ -2083,21 +2117,21 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev)
>>                 gb_addr_config = REG_SET_FIELD(gb_addr_config, GB_ADDR_CONFIG, ROW_SIZE, 2);
>>                 break;
>>         }
>>         adev->gfx.config.gb_addr_config = gb_addr_config;
>>
>>         return 0;
>>  }
>>
>>  static int gfx_v8_0_sw_init(void *handle)
>>  {
>> -       int i, r;
>> +       int i, r, ring_id;
>>         struct amdgpu_ring *ring;
>>         struct amdgpu_kiq *kiq;
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>         /* KIQ event */
>>         r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 178, &adev->gfx.kiq.irq);
>>         if (r)
>>                 return r;
>>
>>         /* EOP Event */
>> @@ -2150,43 +2184,56 @@ static int gfx_v8_0_sw_init(void *handle)
>>                         ring->doorbell_index = AMDGPU_DOORBELL_GFX_RING0;
>>                 }
>>
>>                 r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
>>                                      AMDGPU_CP_IRQ_GFX_EOP);
>>                 if (r)
>>                         return r;
>>         }
>>
>>         /* set up the compute queues */
>> -       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>> +       for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
>>                 unsigned irq_type;
>>
>> -               /* max 32 queues per MEC */
>> -               if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
>> -                       DRM_ERROR("Too many (%d) compute rings!\n", i);
>> +               if (!test_bit(i, adev->gfx.mec.queue_bitmap))
>> +                       continue;
>> +
>> +               if (WARN_ON(ring_id >= AMDGPU_MAX_COMPUTE_RINGS))
>>                         break;
>> -               }
>> -               ring = &adev->gfx.compute_ring[i];
>> +
>> +               ring = &adev->gfx.compute_ring[ring_id];
>> +
>> +               /* mec0 is me1 */
>> +               ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
>> +                               / adev->gfx.mec.num_pipe_per_mec)
>> +                               + 1;
>> +               ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                               % adev->gfx.mec.num_pipe_per_mec;
>> +               ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +
>>                 ring->ring_obj = NULL;
>>                 ring->use_doorbell = true;
>> -               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
>> -               ring->me = 1; /* first MEC */
>> -               ring->pipe = i / 8;
>> -               ring->queue = i % 8;
>> -               ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (i * GFX8_MEC_HPD_SIZE);
>> +               ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (ring_id * GFX8_MEC_HPD_SIZE);
>> +               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
>>                 sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
>> -               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
>> +
>> +               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
>> +                       + ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
>> +                       + ring->pipe;
>> +
>>                 /* type-2 packets are deprecated on MEC, use type-3 instead */
>>                 r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
>>                                      irq_type);
>>                 if (r)
>>                         return r;
>> +
>> +               ring_id++;
>>         }
>>
>>         r = gfx_v8_0_kiq_init(adev);
>>         if (r) {
>>                 DRM_ERROR("Failed to init KIQ BOs!\n");
>>                 return r;
>>         }
>>
>>         kiq = &adev->gfx.kiq;
>>         r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
>> @@ -5686,21 +5733,21 @@ static const struct amdgpu_gfx_funcs gfx_v8_0_gfx_funcs = {
>>         .select_se_sh = &gfx_v8_0_select_se_sh,
>>         .read_wave_data = &gfx_v8_0_read_wave_data,
>>         .read_wave_sgprs = &gfx_v8_0_read_wave_sgprs,
>>  };
>>
>>  static int gfx_v8_0_early_init(void *handle)
>>  {
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>         adev->gfx.num_gfx_rings = GFX8_NUM_GFX_RINGS;
>> -       adev->gfx.num_compute_rings = GFX8_NUM_COMPUTE_RINGS;
>> +       adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>>         adev->gfx.funcs = &gfx_v8_0_gfx_funcs;
>>         gfx_v8_0_set_ring_funcs(adev);
>>         gfx_v8_0_set_irq_funcs(adev);
>>         gfx_v8_0_set_gds_init(adev);
>>         gfx_v8_0_set_rlc_funcs(adev);
>>
>>         return 0;
>>  }
>>
>>  static int gfx_v8_0_late_init(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 1a7b743..de6e537 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -31,21 +31,20 @@
>>  #include "vega10/GC/gc_9_0_offset.h"
>>  #include "vega10/GC/gc_9_0_sh_mask.h"
>>  #include "vega10/vega10_enum.h"
>>  #include "vega10/HDP/hdp_4_0_offset.h"
>>
>>  #include "soc15_common.h"
>>  #include "clearstate_gfx9.h"
>>  #include "v9_structs.h"
>>
>>  #define GFX9_NUM_GFX_RINGS     1
>> -#define GFX9_NUM_COMPUTE_RINGS 8
>>  #define GFX9_NUM_SE            4
>>  #define RLCG_UCODE_LOADING_START_ADDRESS 0x2000
>>
>>  MODULE_FIRMWARE("amdgpu/vega10_ce.bin");
>>  MODULE_FIRMWARE("amdgpu/vega10_pfp.bin");
>>  MODULE_FIRMWARE("amdgpu/vega10_me.bin");
>>  MODULE_FIRMWARE("amdgpu/vega10_mec.bin");
>>  MODULE_FIRMWARE("amdgpu/vega10_mec2.bin");
>>  MODULE_FIRMWARE("amdgpu/vega10_rlc.bin");
>>
>> @@ -469,45 +468,79 @@ static void gfx_v9_0_mec_fini(struct amdgpu_device *adev)
>>                 amdgpu_bo_unpin(adev->gfx.mec.mec_fw_obj);
>>                 amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj);
>>
>>                 amdgpu_bo_unref(&adev->gfx.mec.mec_fw_obj);
>>                 adev->gfx.mec.mec_fw_obj = NULL;
>>         }
>>  }
>>
>>  #define MEC_HPD_SIZE 2048
>>
>> +static void gfx_v9_0_compute_queue_acquire(struct amdgpu_device *adev)
>> +{
>> +       int i, queue, pipe, mec;
>> +
>> +       /* policy for amdgpu compute queue ownership */
>> +       for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
>> +               queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +               pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       % adev->gfx.mec.num_pipe_per_mec;
>> +               mec = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                       / adev->gfx.mec.num_pipe_per_mec;
>> +
>> +               /* we've run out of HW */
>> +               if (mec > adev->gfx.mec.num_mec)
>> +                       break;
>> +
>> +               /* policy: amdgpu owns all queues in the first pipe */
>> +               if (mec == 0 && pipe == 0)
>> +                       set_bit(i, adev->gfx.mec.queue_bitmap);
>> +       }
>> +
>> +       /* update the number of active compute rings */
>> +       adev->gfx.num_compute_rings =
>> +               bitmap_weight(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>> +       /* If you hit this case and edited the policy, you probably just
>> +        * need to increase AMDGPU_MAX_COMPUTE_RINGS */
>> +       if (WARN_ON(adev->gfx.num_compute_rings > AMDGPU_MAX_COMPUTE_RINGS))
>> +               adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>> +}
>> +
>>  static int gfx_v9_0_mec_init(struct amdgpu_device *adev)
>>  {
>>         int r;
>>         u32 *hpd;
>>         const __le32 *fw_data;
>>         unsigned fw_size;
>>         u32 *fw;
>>         size_t mec_hpd_size;
>>
>>         const struct gfx_firmware_header_v1_0 *mec_hdr;
>>
>> +       bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_QUEUES);
>> +
>>         switch (adev->asic_type) {
>>         case CHIP_VEGA10:
>>                 adev->gfx.mec.num_mec = 2;
>>                 break;
>>         default:
>>                 adev->gfx.mec.num_mec = 1;
>>                 break;
>>         }
>>
>>         adev->gfx.mec.num_pipe_per_mec = 4;
>>         adev->gfx.mec.num_queue_per_pipe = 8;
>>
>> -       /* only 1 pipe of the first MEC is owned by amdgpu */
>> -       mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * MEC_HPD_SIZE;
>> +       /* take ownership of the relevant compute queues */
>> +       gfx_v9_0_compute_queue_acquire(adev);
>> +       mec_hpd_size = adev->gfx.num_compute_rings * MEC_HPD_SIZE;
>>
>>         if (adev->gfx.mec.hpd_eop_obj == NULL) {
>>                 r = amdgpu_bo_create(adev,
>>                                      mec_hpd_size,
>>                                      PAGE_SIZE, true,
>>                                      AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
>>                                      &adev->gfx.mec.hpd_eop_obj);
>>                 if (r) {
>>                         dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r);
>>                         return r;
>> @@ -1024,21 +1057,21 @@ static int gfx_v9_0_ngg_en(struct amdgpu_device *adev)
>>         gfx_v9_0_write_data_to_reg(ring, 0, false,
>>                                    amdgpu_gds_reg_offset[0].mem_size, 0);
>>
>>         amdgpu_ring_commit(ring);
>>
>>         return 0;
>>  }
>>
>>  static int gfx_v9_0_sw_init(void *handle)
>>  {
>> -       int i, r;
>> +       int i, r, ring_id;
>>         struct amdgpu_ring *ring;
>>         struct amdgpu_kiq *kiq;
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>         /* KIQ event */
>>         r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_GRBM_CP, 178, &adev->gfx.kiq.irq);
>>         if (r)
>>                 return r;
>>
>>         /* EOP Event */
>> @@ -1081,21 +1114,60 @@ static int gfx_v9_0_sw_init(void *handle)
>>                 sprintf(ring->name, "gfx");
>>                 ring->use_doorbell = true;
>>                 ring->doorbell_index = AMDGPU_DOORBELL64_GFX_RING0 << 1;
>>                 r = amdgpu_ring_init(adev, ring, 1024,
>>                                      &adev->gfx.eop_irq, AMDGPU_CP_IRQ_GFX_EOP);
>>                 if (r)
>>                         return r;
>>         }
>>
>>         /* set up the compute queues */
>> -       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>> +       for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
>> +               unsigned irq_type;
>> +
>> +               if (!test_bit(i, adev->gfx.mec.queue_bitmap))
>> +                       continue;
>> +
>> +               if (WARN_ON(ring_id >= AMDGPU_MAX_COMPUTE_RINGS))
>> +                       break;
>> +
>> +               ring = &adev->gfx.compute_ring[ring_id];
>> +
>> +               /* mec0 is me1 */
>> +               ring->me = ((i / adev->gfx.mec.num_queue_per_pipe)
>> +                               / adev->gfx.mec.num_pipe_per_mec)
>> +                               + 1;
>> +               ring->pipe = (i / adev->gfx.mec.num_queue_per_pipe)
>> +                               % adev->gfx.mec.num_pipe_per_mec;
>> +               ring->queue = i % adev->gfx.mec.num_queue_per_pipe;
>> +
>> +               ring->ring_obj = NULL;
>> +               ring->use_doorbell = true;
>> +               ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (ring_id * MEC_HPD_SIZE);
>> +               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
>> +               sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
>> +
>> +               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
>> +                       + ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
>> +                       + ring->pipe;
>> +
>> +               /* type-2 packets are deprecated on MEC, use type-3 instead */
>> +               r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
>> +                                    irq_type);
>> +               if (r)
>> +                       return r;
>> +
>> +               ring_id++;
>> +       }
>> +
>> +       /* set up the compute queues */
>> +       for (i = 0, ring_id = 0; i < AMDGPU_MAX_QUEUES; i++) {
>>                 unsigned irq_type;
>>
>>                 /* max 32 queues per MEC */
>>                 if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
>>                         DRM_ERROR("Too many (%d) compute rings!\n", i);
>>                         break;
>>                 }
>>                 ring = &adev->gfx.compute_ring[i];
>>                 ring->ring_obj = NULL;
>>                 ring->use_doorbell = true;
>> @@ -2655,21 +2727,21 @@ static void gfx_v9_0_ring_emit_gds_switch(struct amdgpu_ring *ring,
>>         gfx_v9_0_write_data_to_reg(ring, 0, false,
>>                                    amdgpu_gds_reg_offset[vmid].oa,
>>                                    (1 << (oa_size + oa_base)) - (1 << oa_base));
>>  }
>>
>>  static int gfx_v9_0_early_init(void *handle)
>>  {
>>         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>         adev->gfx.num_gfx_rings = GFX9_NUM_GFX_RINGS;
>> -       adev->gfx.num_compute_rings = GFX9_NUM_COMPUTE_RINGS;
>> +       adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
>>         gfx_v9_0_set_ring_funcs(adev);
>>         gfx_v9_0_set_irq_funcs(adev);
>>         gfx_v9_0_set_gds_init(adev);
>>         gfx_v9_0_set_rlc_funcs(adev);
>>
>>         return 0;
>>  }
>>
>>  static int gfx_v9_0_late_init(void *handle)
>>  {
>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> index a09d9f3..67f6d19 100644
>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
>> @@ -26,20 +26,21 @@
>>   */
>>
>>  #ifndef KGD_KFD_INTERFACE_H_INCLUDED
>>  #define KGD_KFD_INTERFACE_H_INCLUDED
>>
>>  #include <linux/types.h>
>>
>>  struct pci_dev;
>>
>>  #define KFD_INTERFACE_VERSION 1
>> +#define KGD_MAX_QUEUES 128
>>
>>  struct kfd_dev;
>>  struct kgd_dev;
>>
>>  struct kgd_mem;
>>
>>  enum kgd_memory_pool {
>>         KGD_POOL_SYSTEM_CACHEABLE = 1,
>>         KGD_POOL_SYSTEM_WRITECOMBINE = 2,
>>         KGD_POOL_FRAMEBUFFER = 3,
>> --
>> 2.9.3
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx