RE: [PATCH] drm/amdkfd: allow users to target recommended SDMA engines

"Kim, Jonathan" <Jonathan.Kim@xxxxxxx> · Fri, 19 Jul 2024 20:27:26 +0000

[Public]

> -----Original Message-----
> From: Kuehling, Felix <Felix.Kuehling@xxxxxxx>
> Sent: Friday, July 19, 2024 2:34 PM
> To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Subject: Re: [PATCH] drm/amdkfd: allow users to target recommended SDMA
> engines
>
> On 2024-07-18 19:05, Jonathan Kim wrote:
> > Certain GPUs have better copy performance over xGMI on specific
> > SDMA engines depending on the source and destination GPU.
> > Allow users to create SDMA queues on these recommended engines.
> > Close to 2x overall performance has been observed with this
> > optimization.
> >
> > Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
> > ---
> >   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 16 ++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_crat.h         |  3 +-
> >   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 39 +++++++++++++-
> >   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  5 +-
> >   .../amd/amdkfd/kfd_process_queue_manager.c    |  1 +
> >   drivers/gpu/drm/amd/amdkfd/kfd_topology.c     | 52
> +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdkfd/kfd_topology.h     |  1 +
> >   include/uapi/linux/kfd_ioctl.h                |  6 ++-
> >   8 files changed, 119 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index 32e5db509560..9610cb90a47e 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -255,6 +255,7 @@ static int set_queue_properties_from_user(struct
> queue_properties *q_properties,
> >                     args->ctx_save_restore_address;
> >     q_properties->ctx_save_restore_area_size = args-
> >ctx_save_restore_size;
> >     q_properties->ctl_stack_size = args->ctl_stack_size;
> > +   q_properties->sdma_engine_id = args->sdma_engine_id;
> >     if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
> >             args->queue_type ==
> KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
> >             q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
> > @@ -262,6 +263,8 @@ static int set_queue_properties_from_user(struct
> queue_properties *q_properties,
> >             q_properties->type = KFD_QUEUE_TYPE_SDMA;
> >     else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA_XGMI)
> >             q_properties->type = KFD_QUEUE_TYPE_SDMA_XGMI;
> > +   else if (args->queue_type ==
> KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID)
> > +           q_properties->type = KFD_QUEUE_TYPE_SDMA_BY_ENG_ID;
> >     else
> >             return -ENOTSUPP;
> >
> > @@ -334,6 +337,18 @@ static int kfd_ioctl_create_queue(struct file *filep,
> struct kfd_process *p,
> >             goto err_bind_process;
> >     }
> >
> > +   if (q_properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) {
> > +           int max_sdma_eng_id = kfd_get_num_sdma_engines(dev) +
> > +                                 kfd_get_num_xgmi_sdma_engines(dev) -
> 1;
> > +
> > +           if (q_properties.sdma_engine_id > max_sdma_eng_id) {
> > +                   err = -EINVAL;
> > +                   pr_err("sdma_engine_id %i exceeds maximum id
> of %i\n",
> > +                          q_properties.sdma_engine_id,
> max_sdma_eng_id);
> > +                   goto err_sdma_engine_id;
> > +           }
> > +   }
> > +
> >     if (!pdd->qpd.proc_doorbells) {
> >             err = kfd_alloc_process_doorbells(dev->kfd, pdd);
> >             if (err) {
> > @@ -425,6 +440,7 @@ static int kfd_ioctl_create_queue(struct file *filep,
> struct kfd_process *p,
> >     if (wptr_bo)
> >             amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
> >   err_wptr_map_gart:
> > +err_sdma_engine_id:
> >   err_bind_process:
> >   err_pdd:
> >     mutex_unlock(&p->mutex);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > index a8ca7ecb6d27..e880a71837bc 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h
> > @@ -259,7 +259,7 @@ struct crat_subtype_ccompute {
> >   #define CRAT_IOLINK_TYPE_OTHER            16
> >   #define CRAT_IOLINK_TYPE_MAX              255
> >
> > -#define CRAT_IOLINK_RESERVED_LENGTH        24
> > +#define CRAT_IOLINK_RESERVED_LENGTH        20
> >
> >   struct crat_subtype_iolink {
> >     uint8_t         type;
> > @@ -276,6 +276,7 @@ struct crat_subtype_iolink {
> >     uint32_t        minimum_bandwidth_mbs;
> >     uint32_t        maximum_bandwidth_mbs;
> >     uint32_t        recommended_transfer_size;
> > +   uint32_t        recommended_sdma_eng_id_mask;
>
> This seems completely unnecessary because your code in kfd_topology
> doesn't depend on this info being in the CRAT table.

Ack'd.

>
>
> >     uint8_t         reserved2[CRAT_IOLINK_RESERVED_LENGTH - 1];
> >     uint8_t         weight_xgmi;
> >   };
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > index 4f48507418d2..58d7710ebb30 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> > @@ -1534,6 +1534,42 @@ static int allocate_sdma_queue(struct
> device_queue_manager *dqm,
> >                     q->sdma_id %
> kfd_get_num_xgmi_sdma_engines(dqm->dev);
> >             q->properties.sdma_queue_id = q->sdma_id /
> >                     kfd_get_num_xgmi_sdma_engines(dqm->dev);
> > +   } else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID)
> {
> > +           int i, num_queues, num_engines, eng_offset = 0;
> > +           bool free_bit_found = false, is_xgmi = false;
> > +
> > +           if (q->properties.sdma_engine_id <
> kfd_get_num_sdma_engines(dqm->dev)) {
> > +                   num_queues = get_num_sdma_queues(dqm);
> > +                   num_engines = kfd_get_num_sdma_engines(dqm-
> >dev);
> > +                   q->properties.type = KFD_QUEUE_TYPE_SDMA;
> > +           } else {
> > +                   num_queues = get_num_xgmi_sdma_queues(dqm);
> > +                   num_engines =
> kfd_get_num_xgmi_sdma_engines(dqm->dev);
> > +                   eng_offset = kfd_get_num_sdma_engines(dqm->dev);
> > +                   q->properties.type = KFD_QUEUE_TYPE_SDMA_XGMI;
> > +                   is_xgmi = true;
> > +           }
> > +
> > +           /* Scan available bit based on target engine ID. */
> > +           for (i = 0; i < num_queues; i++) {
> > +                   int tmp_eng_id = eng_offset + i % num_engines;
>
> This could be more efficient:
>
>       for (i = q->properties.sdma_engine_id - eng_offset; i < num_queues; i
> += num_engines) {
>               if (test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm-
> >sdma_bitmap))
>                       continue;
>               ...
>       }

Ack'd.

>
> > +
> > +                   if (!(q->properties.sdma_engine_id == tmp_eng_id &&
> > +                       test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap :
> dqm->sdma_bitmap)))
> > +                           continue;
> > +
> > +                   clear_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm-
> >sdma_bitmap);
> > +                   q->sdma_id = i;
> > +                   q->properties.sdma_queue_id = q->sdma_id /
> num_engines;
> > +                   free_bit_found = true;
> > +                   break;
> > +           }
> > +
> > +           if (!free_bit_found) {
> > +                   dev_err(dev, "No more SDMA queue to allocate for
> target ID %i\n",
> > +                           q->properties.sdma_engine_id);
> > +                   return -ENOMEM;
> > +           }
> >     }
> >
> >     pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
> > @@ -1786,7 +1822,8 @@ static int create_queue_cpsch(struct
> device_queue_manager *dqm, struct queue *q,
> >     }
> >
> >     if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
> > -           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
> > +           q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI ||
> > +           q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID)
> {
> >             dqm_lock(dqm);
> >             retval = allocate_sdma_queue(dqm, q, qd ? &qd->sdma_id :
> NULL);
> >             dqm_unlock(dqm);
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > index 2b3ec92981e8..7d26e71dfd04 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> > @@ -414,13 +414,16 @@ enum kfd_unmap_queues_filter {
> >    * @KFD_QUEUE_TYPE_DIQ: DIQ queue type.
> >    *
> >    * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI
> interface.
> > + *
> > + * @KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:  SDMA user mode queue with
> target SDMA engine ID.
> >    */
> >   enum kfd_queue_type  {
> >     KFD_QUEUE_TYPE_COMPUTE,
> >     KFD_QUEUE_TYPE_SDMA,
> >     KFD_QUEUE_TYPE_HIQ,
> >     KFD_QUEUE_TYPE_DIQ,
> > -   KFD_QUEUE_TYPE_SDMA_XGMI
> > +   KFD_QUEUE_TYPE_SDMA_XGMI,
> > +   KFD_QUEUE_TYPE_SDMA_BY_ENG_ID
> >   };
> >
> >   enum kfd_queue_format {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > index 21f5a1fb3bf8..8adf20760e67 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> > @@ -345,6 +345,7 @@ int pqm_create_queue(struct
> process_queue_manager *pqm,
> >     switch (type) {
> >     case KFD_QUEUE_TYPE_SDMA:
> >     case KFD_QUEUE_TYPE_SDMA_XGMI:
> > +   case KFD_QUEUE_TYPE_SDMA_BY_ENG_ID:
> >             /* SDMA queues are always allocated statically no matter
> >              * which scheduler mode is used. We also do not need to
> >              * check whether a SDMA queue can be allocated here,
> because
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > index 6f89b06f89d3..f6effaabd4b0 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> > @@ -292,6 +292,8 @@ static ssize_t iolink_show(struct kobject *kobj,
> struct attribute *attr,
> >                           iolink->max_bandwidth);
> >     sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size",
> >                           iolink->rec_transfer_size);
> > +   sysfs_show_32bit_prop(buffer, offs,
> "recommended_sdma_engine_id_mask",
> > +                         iolink->rec_sdma_eng_id_mask);
> >     sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags);
> >
> >     return offs;
> > @@ -1265,6 +1267,55 @@ static void kfd_set_iolink_non_coherent(struct
> kfd_topology_device *to_dev,
> >     }
> >   }
> >
> > +#define REC_SDMA_NUM_GPU   8
> > +static const int
> rec_sdma_eng_map[REC_SDMA_NUM_GPU][REC_SDMA_NUM_GPU] = {
> > +                                                   { -1, 14, 12, 2, 4, 8,
> 10, 6 },
> > +                                                   { 14, -1, 2, 10, 8, 4, 6,
> 12 },
> > +                                                   { 10, 2, -1, 12, 14, 6,
> 4, 8 },
> > +                                                   { 2, 12, 10, -1, 6, 14,
> 8, 4 },
> > +                                                   { 4, 8, 14, 6, -1, 10,
> 12, 2 },
> > +                                                   { 8, 4, 6, 14, 12, -1, 2,
> 10 },
> > +                                                   { 10, 6, 4, 8, 12, 2, -1,
> 14 },
> > +                                                   { 6, 12, 8, 4, 2, 10, 14,
> -1 }};
> This matrix is mostly symmetrical, but not quite. Is that a mistake or
> intentional?

This is intentional.  This is a heuristic supported by empirical data supplied by RCCL/IO perf team studies.
It achieves close to theoretical HW limit performance in their all-to-all copy stress test.

>
> It seems it's only using even-numbered engines. An application that
> follows these recommendations can only use half the SDMA engines. Is
> that intentional? See below.

This is also intentional.  Without giving too much away, there's an unfortunate limit to how many SDMA engines can get loaded per AID at a time before performance starts to suffer.
FW fixes can't get around this as it increases too much complexity and bugginess.

ROCm in general will bind a single SDMA queue to every GPU/GPU pair during the process lifetime anyways so there's no worries with unused SDMA engines in this case because this recommendation only applies to dGPU products wired in an 8 GPU hive that happens to have AIDs in SPX mode.
So at most we end up using 7 SDMAs max per gpu anyways.  2 are for host-device leaving 14 in total.  So using even engines only should be sufficient for ROCm usage.

For users that don't user ROCm's runtime, they are free to ignore this recommendation as the name of the io link field suggests it is "recommended" and not "mandatory".

>
>
> > +
> > +static void kfd_set_recommended_sdma_engines(struct
> kfd_topology_device *to_dev,
> > +                                        struct kfd_iolink_properties
> *outbound_link,
> > +                                        struct kfd_iolink_properties
> *inbound_link)
> > +{
> > +   struct kfd_node *gpu = outbound_link->gpu;
> > +   struct amdgpu_device *adev = gpu->adev;
> > +   int num_xgmi_nodes = adev->gmc.xgmi.num_physical_nodes;
> > +   bool support_rec_eng = !amdgpu_sriov_vf(adev) && to_dev->gpu &&
> > +           adev->aid_mask && num_xgmi_nodes &&
> > +           (amdgpu_xcp_query_partition_mode(adev->xcp_mgr,
> AMDGPU_XCP_FL_NONE) ==
> > +                 AMDGPU_SPX_PARTITION_MODE) &&
> > +           (!(adev->flags & AMD_IS_APU) && num_xgmi_nodes == 8);
> > +
> > +   if (support_rec_eng) {
> > +           int src_socket_id = adev->gmc.xgmi.physical_node_id;
> > +           int dst_socket_id = to_dev->gpu->adev-
> >gmc.xgmi.physical_node_id;
> > +
> > +           outbound_link->rec_sdma_eng_id_mask =
> > +                   1 <<
> rec_sdma_eng_map[src_socket_id][dst_socket_id];
> > +           inbound_link->rec_sdma_eng_id_mask =
> > +                   1 <<
> rec_sdma_eng_map[dst_socket_id][src_socket_id];
>
> Was the intention to set two different bits for the inbound and outbound
> links, so an application can get full-duplex bandwidth?
>
> I also find it strange that the outbound link on GPU A uses the same
> engine as the inbound link on GPU B. That means the wiring of each XGMI
> link has the same SDMA engine affinity on both ends of the link?

I'm not sure on whether link affinity is roughly associated with AID or specifically to a given SDMA engine wrt to AID.
My best guess is that there's some complicated heuristics in DF we're not clear on and the time required to do that study is not be feasible.
So the full duplex vs AID/SDMA affinity trade off was never considered..
Basically, an all-to-all async copy stress test was done and the table above gave the best results.

Thanks,

Jon

>
> Regards,
>    Felix
>
>
> > +   } else {
> > +           int num_sdma_eng = kfd_get_num_sdma_engines(gpu);
> > +           int i, eng_offset = 0;
> > +
> > +           if (outbound_link->iolink_type == CRAT_IOLINK_TYPE_XGMI
> &&
> > +               kfd_get_num_xgmi_sdma_engines(gpu) && to_dev->gpu) {
> > +                   eng_offset = num_sdma_eng;
> > +                   num_sdma_eng =
> kfd_get_num_xgmi_sdma_engines(gpu);
> > +           }
> > +
> > +           for (i = 0; i < num_sdma_eng; i++) {
> > +                   outbound_link->rec_sdma_eng_id_mask |= (1 << (i +
> eng_offset));
> > +                   inbound_link->rec_sdma_eng_id_mask |= (1 << (i +
> eng_offset));
> > +           }
> > +   }
> > +}
> > +
> >   static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
> >   {
> >     struct kfd_iolink_properties *link, *inbound_link;
> > @@ -1303,6 +1354,7 @@ static void kfd_fill_iolink_non_crat_info(struct
> kfd_topology_device *dev)
> >                     inbound_link->flags = CRAT_IOLINK_FLAGS_ENABLED;
> >                     kfd_set_iolink_no_atomics(peer_dev, dev,
> inbound_link);
> >                     kfd_set_iolink_non_coherent(peer_dev, link,
> inbound_link);
> > +                   kfd_set_recommended_sdma_engines(peer_dev, link,
> inbound_link);
> >             }
> >     }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > index 2d1c9d771bef..43ba67890f2c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h
> > @@ -121,6 +121,7 @@ struct kfd_iolink_properties {
> >     uint32_t                min_bandwidth;
> >     uint32_t                max_bandwidth;
> >     uint32_t                rec_transfer_size;
> > +   uint32_t                rec_sdma_eng_id_mask;
> >     uint32_t                flags;
> >     struct kfd_node         *gpu;
> >     struct kobject          *kobj;
> > diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
> > index 285a36601dc9..71a7ce5f2d4c 100644
> > --- a/include/uapi/linux/kfd_ioctl.h
> > +++ b/include/uapi/linux/kfd_ioctl.h
> > @@ -42,9 +42,10 @@
> >    * - 1.14 - Update kfd_event_data
> >    * - 1.15 - Enable managing mappings in compute VMs with GEM_VA ioctl
> >    * - 1.16 - Add contiguous VRAM allocation flag
> > + * - 1.17 - Add SDMA queue creation with target SDMA engine ID
> >    */
> >   #define KFD_IOCTL_MAJOR_VERSION 1
> > -#define KFD_IOCTL_MINOR_VERSION 16
> > +#define KFD_IOCTL_MINOR_VERSION 17
> >
> >   struct kfd_ioctl_get_version_args {
> >     __u32 major_version;    /* from KFD */
> > @@ -56,6 +57,7 @@ struct kfd_ioctl_get_version_args {
> >   #define KFD_IOC_QUEUE_TYPE_SDMA                   0x1
> >   #define KFD_IOC_QUEUE_TYPE_COMPUTE_AQL            0x2
> >   #define KFD_IOC_QUEUE_TYPE_SDMA_XGMI              0x3
> > +#define KFD_IOC_QUEUE_TYPE_SDMA_BY_ENG_ID  0x4
> >
> >   #define KFD_MAX_QUEUE_PERCENTAGE  100
> >   #define KFD_MAX_QUEUE_PRIORITY            15
> > @@ -78,6 +80,8 @@ struct kfd_ioctl_create_queue_args {
> >     __u64 ctx_save_restore_address; /* to KFD */
> >     __u32 ctx_save_restore_size;    /* to KFD */
> >     __u32 ctl_stack_size;           /* to KFD */
> > +   __u32 sdma_engine_id;           /* to KFD */
> > +   __u32 pad;
> >   };
> >
> >   struct kfd_ioctl_destroy_queue_args {