RE: [PATCH v2] drm/amdkfd: Account for SH/SE count when setting up cu masks.

"Keely, Sean" <Sean.Keely@xxxxxxx> · Wed, 25 Aug 2021 01:11:41 +0000

[AMD Official Use Only]

Right, sorry.  These were two separate branches until checkpatch complained about the nesting level.  Then I broke it.

-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
Sent: Tuesday, August 24, 2021 7:54 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Keely, Sean <Sean.Keely@xxxxxxx>
Subject: Re: [PATCH v2] drm/amdkfd: Account for SH/SE count when setting up cu masks.

Am 2021-08-23 um 8:36 p.m. schrieb Sean Keely:
> On systems with multiple SH per SE compute_static_thread_mgmt_se#
> is split into independent masks, one for each SH, in the upper and
> lower 16 bits.  We need to detect this and apply cu masking to each
> SH.  The cu mask bits are assigned first to each SE, then to
> alternate SHs, then finally to higher CU id.  This ensures that
> the maximum number of SPIs are engaged as early as possible while
> balancing CU assignment to each SH.
>
> v2: Use max SH/SE rather than max SH in cu_per_sh.
>
> Signed-off-by: Sean Keely <Sean.Keely@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 79 ++++++++++++++------
>  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h |  1 +
>  2 files changed, 59 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> index 88813dad731f..5d7016928d24 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c
> @@ -98,36 +98,73 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
>               uint32_t *se_mask)
>  {
>       struct kfd_cu_info cu_info;
> -     uint32_t cu_per_se[KFD_MAX_NUM_SE] = {0};
> -     int i, se, sh, cu = 0;
> -
> +     uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
> +     int i, se, sh, cu;
>       amdgpu_amdkfd_get_cu_info(mm->dev->kgd, &cu_info);
>
>       if (cu_mask_count > cu_info.cu_active_number)
>               cu_mask_count = cu_info.cu_active_number;
>
> +     // Exceeding these bounds corrupts the stack and indicates a coding error.
> +     // Returning with no CU's enabled will hang the queue, which should be
> +     // attention grabbing.
> +     if (cu_info.num_shader_engines > KFD_MAX_NUM_SE) {
> +             pr_err("Exceeded KFD_MAX_NUM_SE, chip reports %d\n", cu_info.num_shader_engines);
> +             return;
> +     }
> +     if (cu_info.num_shader_arrays_per_engine > KFD_MAX_NUM_SH_PER_SE) {
> +             pr_err("Exceeded KFD_MAX_NUM_SH, chip reports %d\n",
> +                     cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
> +             return;
> +     }
> +
> +     /* Count active CUs per SH.
> +      *
> +      * Some CUs in an SH may be disabled.   HW expects disabled CUs to be
> +      * represented in the high bits of each SH's enable mask (the upper and lower
> +      * 16 bits of se_mask) and will take care of the actual distribution of
> +      * disabled CUs within each SH automatically.
> +      * Each half of se_mask must be filled compactly, LSB first.
> +      *
> +      * See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
> +      */
>       for (se = 0; se < cu_info.num_shader_engines; se++)
>               for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
> -                     cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> -
> -     /* Symmetrically map cu_mask to all SEs:
> -      * cu_mask[0] bit0 -> se_mask[0] bit0;
> -      * cu_mask[0] bit1 -> se_mask[1] bit0;
> -      * ... (if # SE is 4)
> -      * cu_mask[0] bit4 -> se_mask[0] bit1;
> +                     cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
> +
> +     /* Symmetrically map cu_mask to all SEs & SHs:
> +      * se_mask programs up to 2 SH in the upper and lower 16 bits.
> +      *
> +      * Examples
> +      * Assuming 1 SH/SE, 4 SEs:
> +      * cu_mask[0] bit0 -> se_mask[0] bit0
> +      * cu_mask[0] bit1 -> se_mask[1] bit0
> +      * ...
> +      * cu_mask[0] bit4 -> se_mask[0] bit1
> +      * ...
> +      *
> +      * Assuming 2 SH/SE, 4 SEs
> +      * cu_mask[0] bit0 -> se_mask[0] bit0 (SE0,SH0,CU0)
> +      * cu_mask[0] bit1 -> se_mask[1] bit0 (SE1,SH0,CU0)
> +      * ...
> +      * cu_mask[0] bit4 -> se_mask[0] bit16 (SE0,SH1,CU0)
> +      * cu_mask[0] bit5 -> se_mask[1] bit16 (SE1,SH1,CU0)
> +      * ...
> +      * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1)
>        * ...
>        */
> -     se = 0;
> -     for (i = 0; i < cu_mask_count; i++) {
> -             if (cu_mask[i / 32] & (1 << (i % 32)))
> -                     se_mask[se] |= 1 << cu;
> -
> -             do {
> -                     se++;
> -                     if (se == cu_info.num_shader_engines) {
> -                             se = 0;
> -                             cu++;
> +     i = 0;
> +     for (cu = 0; cu < 16; cu++) {
> +             for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++) {
> +                     for (se = 0; se < cu_info.num_shader_engines; se++) {
> +                             if ((cu_mask[i / 32] & (1 << (i % 32))) &&
> +                                     (cu_per_sh[se][sh] > cu)) {

You need to increment i when cu_mask[i/32] & (1 << (i % 32)) is 0.
Otherwise you'll get stuck on any 0 bit in the CU mask.

Regards,
  Felix

> +                                     se_mask[se] |= 1 << (cu + sh * 16);
> +                                     i++;
> +                                     if (i == cu_mask_count)
> +                                             return;
> +                             }
>                       }
> -             } while (cu >= cu_per_se[se] && cu < 32);
> +             }
>       }
>  }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> index b5e2ea7550d4..6e6918ccedfd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h
> @@ -27,6 +27,7 @@
>  #include "kfd_priv.h"
>
>  #define KFD_MAX_NUM_SE 8
> +#define KFD_MAX_NUM_SH_PER_SE 2
>
>  /**
>   * struct mqd_manager