RE: [PATCHv2 2/4] drm/amdkfd: Update cache info reporting for GFX v9.4.3

"Joshi, Mukul" <Mukul.Joshi@xxxxxxx> · Thu, 7 Sep 2023 22:54:04 +0000



[AMD Official Use Only - General]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
> Sent: Thursday, September 7, 2023 4:51 PM
> To: Joshi, Mukul <Mukul.Joshi@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Subject: Re: [PATCHv2 2/4] drm/amdkfd: Update cache info reporting for GFX
> v9.4.3
>
>
> On 2023-09-06 11:44, Mukul Joshi wrote:
> > Update cache info reporting in sysfs to report the correct number of
> > CUs and associated cache information based on different spatial
> > partitioning modes.
> >
> > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx>
> > ---
> > v1->v2:
> > - Revert the change in kfd_crat.c
> > - Add a comment to not change value of CRAT_SIBLINGMAP_SIZE.
> >
> >   drivers/gpu/drm/amd/amdkfd/kfd_crat.h     |  4 ++
> >   drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 82 +++++++++++++-------
> ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_topology.h |  2 +-
> >   3 files changed, 51 insertions(+), 37 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > index 387a8ef49385..74c2d7a0d628 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > @@ -79,6 +79,10 @@ struct crat_header {
> >   #define CRAT_SUBTYPE_IOLINK_AFFINITY              5
> >   #define CRAT_SUBTYPE_MAX                  6
> >
> > +/*
> > + * Do not change the value of CRAT_SIBLINGMAP_SIZE from 32
> > + * as it breaks the ABI.
> > + */
> >   #define CRAT_SIBLINGMAP_SIZE      32
> >
> >   /*
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > index c54795682dfb..b98cc7930e4c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > @@ -1596,14 +1596,17 @@ static int fill_in_l1_pcache(struct
> kfd_cache_properties **props_ext,
> >   static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> >                             struct kfd_gpu_cache_info *pcache_info,
> >                             struct kfd_cu_info *cu_info,
> > -                           int cache_type, unsigned int cu_processor_id)
> > +                           int cache_type, unsigned int cu_processor_id,
> > +                           struct kfd_node *knode)
> >   {
> >     unsigned int cu_sibling_map_mask;
> >     int first_active_cu;
> > -   int i, j, k;
> > +   int i, j, k, xcc, start, end;
> >     struct kfd_cache_properties *pcache = NULL;
> >
> > -   cu_sibling_map_mask = cu_info->cu_bitmap[0][0][0];
> > +   start = ffs(knode->xcc_mask) - 1;
> > +   end = start + NUM_XCC(knode->xcc_mask);
> > +   cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
> >     cu_sibling_map_mask &=
> >             ((1 << pcache_info[cache_type].num_cu_shared) - 1);
> >     first_active_cu = ffs(cu_sibling_map_mask); @@ -1638,16 +1641,18
> @@
> > static int fill_in_l2_l3_pcache(struct kfd_cache_properties **props_ext,
> >             cu_sibling_map_mask = cu_sibling_map_mask >>
> (first_active_cu - 1);
> >             k = 0;
> >
> > -           for (i = 0; i < cu_info->num_shader_engines; i++) {
> > -                   for (j = 0; j < cu_info-
> >num_shader_arrays_per_engine; j++) {
> > -                           pcache->sibling_map[k] =
> (uint8_t)(cu_sibling_map_mask & 0xFF);
> > -                           pcache->sibling_map[k+1] =
> (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> > -                           pcache->sibling_map[k+2] =
> (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> > -                           pcache->sibling_map[k+3] =
> (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> > -                           k += 4;
> > -
> > -                           cu_sibling_map_mask = cu_info-
> >cu_bitmap[0][i % 4][j + i / 4];
> > -                           cu_sibling_map_mask &= ((1 <<
> pcache_info[cache_type].num_cu_shared) - 1);
> > +           for (xcc = start; xcc < end; xcc++) {
> > +                   for (i = 0; i < cu_info->num_shader_engines; i++) {
> > +                           for (j = 0; j < cu_info-
> >num_shader_arrays_per_engine; j++) {
> > +                                   pcache->sibling_map[k] =
> (uint8_t)(cu_sibling_map_mask & 0xFF);
> > +                                   pcache->sibling_map[k+1] =
> (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF);
> > +                                   pcache->sibling_map[k+2] =
> (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF);
> > +                                   pcache->sibling_map[k+3] =
> (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF);
> > +                                   k += 4;
> > +
> > +                                   cu_sibling_map_mask = cu_info-
> >cu_bitmap[start][i % 4][j + i /
> > +4];
>
> Shouldn't this use "xcc" as index instead of "start"?
>
Yes that’s correct. Thanks for the catch. I will fix this before committing.

> Other than that, this patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
>
> P.S.: This whole function looks suspicious to me the way it exposes
> active/inactive CUs to user mode. I think user mode only wants to count active
> CUs in the sibling mask because it has no other information about inactive
> CUs. And reading the cu_info into cu_sibling_map_mask and then only using it
> in the next loop iteration is probably also wrong. It basically throws away
> cu_info from the last xcc,SE,SA. Or maybe I just don't understand what this
> function is trying to do.
>
Yes the function is a bit convoluted to understand. It starts with the following code before
entering the loop.
cu_sibling_map_mask = cu_info->cu_bitmap[start][0][0];
cu_sibling_map_mask &=
                ((1 << pcache_info[cache_type].num_cu_shared) - 1);
 ...
cu_sibling_map_mask = cu_sibling_map_mask >> (first_active_cu - 1);
...
It again re-uses cu_info->cu_bitmap[start][0][0]; in the second iteration. I am not sure
if it is doing the right thing either. I will try to take a look at it and understand what
exactly we want to achieve with this function.

Regards,
Mukul

>
> > +                                   cu_sibling_map_mask &= ((1 <<
> pcache_info[cache_type].num_cu_shared) - 1);
> > +                           }
> >                     }
> >             }
> >             pcache->sibling_map_size = k;
> > @@ -1665,7 +1670,7 @@ static int fill_in_l2_l3_pcache(struct
> kfd_cache_properties **props_ext,
> >   static void kfd_fill_cache_non_crat_info(struct kfd_topology_device *dev,
> struct kfd_node *kdev)
> >   {
> >     struct kfd_gpu_cache_info *pcache_info = NULL;
> > -   int i, j, k;
> > +   int i, j, k, xcc, start, end;
> >     int ct = 0;
> >     unsigned int cu_processor_id;
> >     int ret;
> > @@ -1699,37 +1704,42 @@ static void kfd_fill_cache_non_crat_info(struct
> kfd_topology_device *dev, struct
> >      *                      then it will consider only one CU from
> >      *                      the shared unit
> >      */
> > +   start = ffs(kdev->xcc_mask) - 1;
> > +   end = start + NUM_XCC(kdev->xcc_mask);
> > +
> >     for (ct = 0; ct < num_of_cache_types; ct++) {
> >             cu_processor_id = gpu_processor_id;
> >             if (pcache_info[ct].cache_level == 1) {
> > -                   for (i = 0; i < pcu_info->num_shader_engines; i++) {
> > -                           for (j = 0; j < pcu_info-
> >num_shader_arrays_per_engine; j++) {
> > -                                   for (k = 0; k < pcu_info-
> >num_cu_per_sh; k += pcache_info[ct].num_cu_shared) {
> > -
> > -                                           ret =
> fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> > -
>       pcu_info->cu_bitmap[0][i % 4][j + i / 4], ct,
> > -
>       cu_processor_id, k);
> > -
> > -                                           if (ret < 0)
> > -                                                   break;
> > -
> > -                                           if (!ret) {
> > -                                                   num_of_entries++;
> > -
>       list_add_tail(&props_ext->list, &dev->cache_props);
> > +                   for (xcc = start; xcc < end; xcc++) {
> > +                           for (i = 0; i < pcu_info->num_shader_engines;
> i++) {
> > +                                   for (j = 0; j < pcu_info-
> >num_shader_arrays_per_engine; j++) {
> > +                                           for (k = 0; k < pcu_info-
> >num_cu_per_sh; k +=
> > +pcache_info[ct].num_cu_shared) {
> > +
> > +                                                   ret =
> fill_in_l1_pcache(&props_ext, pcache_info, pcu_info,
> > +
>       pcu_info->cu_bitmap[xcc][i % 4][j + i / 4], ct,
> > +
>       cu_processor_id, k);
> > +
> > +                                                   if (ret < 0)
> > +                                                           break;
> > +
> > +                                                   if (!ret) {
> > +
>       num_of_entries++;
> > +
>       list_add_tail(&props_ext->list, &dev->cache_props);
> > +                                                   }
> > +
> > +                                                   /* Move to next CU
> block */
> > +                                                   num_cu_shared = ((k
> + pcache_info[ct].num_cu_shared) <=
> > +                                                           pcu_info-
> >num_cu_per_sh) ?
> > +
>       pcache_info[ct].num_cu_shared :
> > +                                                           (pcu_info-
> >num_cu_per_sh - k);
> > +                                                   cu_processor_id +=
> num_cu_shared;
> >                                             }
> > -
> > -                                           /* Move to next CU block */
> > -                                           num_cu_shared = ((k +
> pcache_info[ct].num_cu_shared) <=
> > -                                                   pcu_info-
> >num_cu_per_sh) ?
> > -
>       pcache_info[ct].num_cu_shared :
> > -                                                   (pcu_info-
> >num_cu_per_sh - k);
> > -                                           cu_processor_id +=
> num_cu_shared;
> >                                     }
> >                             }
> >                     }
> >             } else {
> >                     ret = fill_in_l2_l3_pcache(&props_ext, pcache_info,
> > -                                                           pcu_info, ct,
> cu_processor_id);
> > +                                   pcu_info, ct, cu_processor_id, kdev);
> >
> >                     if (ret < 0)
> >                             break;
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > index dea32a9e5506..27386ce9a021 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > @@ -89,7 +89,7 @@ struct kfd_mem_properties {
> >     struct attribute        attr;
> >   };
> >
> > -#define CACHE_SIBLINGMAP_SIZE 64
> > +#define CACHE_SIBLINGMAP_SIZE 128
> >
> >   struct kfd_cache_properties {
> >     struct list_head        list;