On 2/14/2025 9:54 PM, Kim, Jonathan wrote:
> [Public]
>
> We could be talking about 2 types of bandwidth here.
>
> 1. Bandwidth per link
> 2. Bandwidth per peer, i.e. multiple xgmi links that are used for SDMA gang submissions for an effective max bandwidth * num_link copy speed. This is currently used by the runtime, i.e. max divided by min. The number of links per peer can be variable.
>
> The peerless request is asking for #1 because there should be no speed variability of links based on peer, i.e. it requests the max bandwidth per link for 1 link.
>
> The interface could look like amdgpu_xgmi_get_bandwidth(adev, peer, enum unit_type, int *min, int *max) then.
>
> Unit_type could be defined for illustration:
>
> #define AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_LINK 0
> #define AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_PEER 1
>
> Where if unit_type == AMDGPU_XGMI_BW_*_MIN_MAX_PER_LINK, the call would ignore peer and populate *min/*max with the per-link min/max (this keeps it open for a powerplay range per link).
>
> While if unit_type == AMDGPU_XGMI_BW_*_MIN_MAX_PER_PEER, the call would populate *min/*max per peer, where the min/max is the max_bw * num_link range.

Thanks, Jon. I'm aligned with this approach.

Thanks,
Lijo

> Jon
>
> *From:* Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> *Sent:* Friday, February 14, 2025 10:39 AM
> *To:* Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> *Subject:* Re: [PATCH] drm/amdgpu: simplify xgmi peer info calls
>
> [Public]
>
> For minimum bandwidth, we should keep the possibility of going to FW to get the data when XGMI DPM is in place. So it is all wrapped inside the API when the devices passed are connected. The caller doesn't need to know.
>
> BTW, what is the real requirement of bandwidth data without any peer device? In what way is that useful?
>
> Thanks,
> Lijo
>
> ------------------------------------------------------------------------
>
> *From:* Kim, Jonathan <Jonathan.Kim@xxxxxxx>
> *Sent:* Friday, February 14, 2025 8:27:28 PM
> *To:* Lazar, Lijo <Lijo.Lazar@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> *Subject:* RE: [PATCH] drm/amdgpu: simplify xgmi peer info calls
>
> [Public]
>
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
>> Sent: Friday, February 14, 2025 12:58 AM
>> To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>> Subject: Re: [PATCH] drm/amdgpu: simplify xgmi peer info calls
>>
>> On 2/13/2025 9:20 PM, Kim, Jonathan wrote:
>> > [Public]
>> >
>> >> -----Original Message-----
>> >> From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
>> >> Sent: Thursday, February 13, 2025 1:35 AM
>> >> To: Kim, Jonathan <Jonathan.Kim@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>> >> Subject: Re: [PATCH] drm/amdgpu: simplify xgmi peer info calls
>> >>
>> >> On 2/12/2025 9:27 PM, Jonathan Kim wrote:
>> >>> Deprecate KFD XGMI peer info calls in favour of calling directly from
>> >>> simplified XGMI peer info functions.
>> >>>
>> >>> Signed-off-by: Jonathan Kim <jonathan.kim@xxxxxxx>
>> >>> ---
>> >>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 42 ------------------
>> >>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  5 ---
>> >>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 51 +++++++++++++++++-----
>> >>>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  6 +--
>> >>>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c      | 11 +++--
>> >>>  5 files changed, 48 insertions(+), 67 deletions(-)
>> >>>
>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> >>> index 0312231b703e..4cec3a873995 100644
>> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> >>> @@ -555,48 +555,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>> >>>  	return r;
>> >>>  }
>> >>>
>> >>> -uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
>> >>> -					  struct amdgpu_device *src)
>> >>> -{
>> >>> -	struct amdgpu_device *peer_adev = src;
>> >>> -	struct amdgpu_device *adev = dst;
>> >>> -	int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
>> >>> -
>> >>> -	if (ret < 0) {
>> >>> -		DRM_ERROR("amdgpu: failed to get xgmi hops count between node %d and %d. ret = %d\n",
>> >>> -			  adev->gmc.xgmi.physical_node_id,
>> >>> -			  peer_adev->gmc.xgmi.physical_node_id, ret);
>> >>> -		ret = 0;
>> >>> -	}
>> >>> -	return (uint8_t)ret;
>> >>> -}
>> >>> -
>> >>> -int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
>> >>> -					    struct amdgpu_device *src,
>> >>> -					    bool is_min)
>> >>> -{
>> >>> -	struct amdgpu_device *adev = dst, *peer_adev;
>> >>> -	int num_links;
>> >>> -
>> >>> -	if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
>> >>> -		return 0;
>> >>> -
>> >>> -	if (src)
>> >>> -		peer_adev = src;
>> >>> -
>> >>> -	/* num links returns 0 for indirect peers since indirect route is unknown. */
>> >>> -	num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
>> >>> -	if (num_links < 0) {
>> >>> -		DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
>> >>> -			  adev->gmc.xgmi.physical_node_id,
>> >>> -			  peer_adev->gmc.xgmi.physical_node_id, num_links);
>> >>> -		num_links = 0;
>> >>> -	}
>> >>> -
>> >>> -	/* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
>> >>> -	return (num_links * 16 * 25000)/BITS_PER_BYTE;
>> >>> -}
>> >>> -
>> >>>  int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min)
>> >>>  {
>> >>>  	int num_lanes_shift = (is_min ? ffs(adev->pm.pcie_mlw_mask) :
>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> >>> index 092dbd8bec97..28eb1cd0eb5a 100644
>> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> >>> @@ -255,11 +255,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>> >>>  				  uint64_t *bo_size, void *metadata_buffer,
>> >>>  				  size_t buffer_size, uint32_t *metadata_size,
>> >>>  				  uint32_t *flags, int8_t *xcp_id);
>> >>> -uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
>> >>> -					  struct amdgpu_device *src);
>> >>> -int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
>> >>> -					    struct amdgpu_device *src,
>> >>> -					    bool is_min);
>> >>>  int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min);
>> >>>  int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
>> >>>  					uint32_t *payload);
>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> >>> index 74b4349e345a..d18d2a26cc91 100644
>> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
>> >>> @@ -818,28 +818,59 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>> >>>   * num_hops[2:0] = number of hops
>> >>>   */
>> >>>  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> >>> -		struct amdgpu_device *peer_adev)
>> >>> +			       struct amdgpu_device *peer_adev)
>> >>>  {
>> >>>  	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
>> >>>  	uint8_t num_hops_mask = 0x7;
>> >>>  	int i;
>> >>>
>> >>> +	if (!adev->gmc.xgmi.supported)
>> >>> +		return 0;
>> >>> +
>> >>>  	for (i = 0 ; i < top->num_nodes; ++i)
>> >>>  		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
>> >>>  			return top->nodes[i].num_hops & num_hops_mask;
>> >>> -	return -EINVAL;
>> >>> +
>> >>> +	DRM_ERROR("amdgpu: failed to get xgmi hops count between node %d and %d.\n",
>> >>
>> >> Suggest to use dev_ function to identify the device pci number. Since the function still passes, maybe info level is good enough.
>> >
>> > Ack'd. Will change.
>> >
>> >>
>> >>> +		  adev->gmc.xgmi.physical_node_id,
>> >>> +		  peer_adev->gmc.xgmi.physical_node_id);
>> >>> +
>> >>> +	return 0;
>> >>>  }
>> >>>
>> >>> -int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
>> >>> -		struct amdgpu_device *peer_adev)
>> >>> +int amdgpu_xgmi_get_bandwidth_mbytes(struct amdgpu_device *adev,
>> >>> +				     struct amdgpu_device *peer_adev)
>> >>>  {
>> >>> -	struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
>> >>> -	int i;
>> >>> +	int num_links = !peer_adev ? 1 : 0;
>> >>>
>> >>> -	for (i = 0 ; i < top->num_nodes; ++i)
>> >>> -		if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
>> >>> -			return top->nodes[i].num_links;
>> >>> -	return -EINVAL;
>> >>> +	if (!adev->gmc.xgmi.supported)
>> >>> +		return 0;
>> >>> +
>> >>> +	if (peer_adev) {
>> >>> +		struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
>> >>> +		int i;
>> >>> +
>> >>> +		for (i = 0 ; i < top->num_nodes; ++i) {
>> >>> +			if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id)
>> >>> +				continue;
>> >>> +
>> >>> +			num_links = top->nodes[i].num_links;
>> >>> +			break;
>> >>> +		}
>> >>> +	}
>> >>> +
>> >>> +	/* num links returns 0 for indirect peers since indirect route is unknown. */
>> >>> +	if (!num_links) {
>> >>
>> >> This looks problematic. I guess it is better to keep the old way of passing min/max. Otherwise, there is a chance that min reports some value and max could report this error.
>> >
>> > I don't think this is a problem. The old way of passing is_min == true is tied to peer_dev == NULL, which made it a redundant argument.
>> > is_min == false doesn't prevent the issue you've mentioned from happening because the old code effectively sets num_link = 0 if the peer_dev search fails anyways.
>>
>> The problem I was thinking about was with respect to usage in KFD properties. With this way, min_bandwidth will report some value even if no link is present between the devices. Max bandwidth will report 0. Ideally, we should report both min/max = 0 when there is no link present between the devices. I haven't checked if KFD does some other check to ensure that there are links between devices before calling the API.
>
> The KFD only queries for links within a hive.
> Checking for max first implies a link on non-zero return.
> We could change the KFD to set min to zero if max is zero then.
>
>>
>> > With the line at the top of the proposed function:
>> >>> +	int num_links = !peer_adev ? 1 : 0;
>> > The semantics are as follows:
>> > - NULL peer_dev indicates that the caller doesn't want peer-to-peer data to factor num_links into bandwidth reporting, so assume a single XGMI link in the bandwidth calculation.
>>
>> I think these semantics don't provide enough clarity. XGMI is an interconnect between devices and ideally we expect two devices to be passed to the API. If either one is NULL, then it's better to reject.
>>
>> > - If a failed peer_dev search ends up with num_links == 0, that means the caller passed in an invalid node (i.e. it's not one of the nodes that's been registered to the hive).
>> > Currently, get_hops_count (old and proposed) operates in the same fashion too.
>> > So the functionality between the proposed changes and the old changes should have remained the same.
>> >
>>
>> The API usage is clear and the general expectation is that the two devices are not NULL. That is clear in the case of get_hops_count.
>
> I don't see where that expectation is set in the proposal.
> peer_dev == NULL is asking for a peerless check of potential bandwidth.
> If you're saying we need a redundant is_min bool to ignore peer_dev num_links checks, I'd say then both are a waste of time for clarity's sake.
>
> In that case, if we want to be strict with both peers being non-NULL, change the following:
> - expose amdgpu_xgmi_get_num_links to KFD to mirror get_num_hops
> - KFD calls amdgpu_xgmi_get_num_links to do some handling and later passes the return into a new bandwidth call if the return is non-zero
> - change amdgpu_xgmi_get_bandwidth_mbytes(peer1, peer2) to amdgpu_xgmi_get_bandwidth(adev, const int num_links, enum unit_type)
>   - adev -> used for future IP checks as a dynamic internal speed constant setter
>   - const int num_links -> caller passes in a constant link multiplier. So in this case KFD can pass back the num_links it finds from the previous amdgpu_xgmi_get_num_links call
>   - enum unit_type -> some enum we can define in amdgpu_xgmi.h to give back to the caller whatever units it's looking for (GT/s, Mbytes/sec, Mbps, etc.)
>
> Then this way:
> - KFD enforces min/max bandwidth to be consistent, i.e. both are 0 or both are non-zero
> - the bandwidth call is now more general purpose and flexible and provides various bandwidth return types based on caller needs.
>
> Jon
>
>>
>> > I'm open to adjusting get_num_hops & get_bandwidth_mbytes to return either -ENODEV (device not found in hive) or -EINVAL (bad peer_dev request) on a failed search.
>>
>> On a failed search, does returning 0 work - zero hop/bandwidth considered as no link found between the provided devices?
>>
>> Thanks,
>> Lijo
>>
>> > That would require a change to the KFD CRAT logic to itself return an error.
>> > However, this would change the behaviour from a bandwidth reporting error, where ROCm could still function, to the KFD not loading at all.
>> > Currently, the consequence of passing an incorrect peer_dev to the bandwidth call is that the ROCm runtime would not issue an SDMA ganged copy, i.e. a potential peer-to-peer performance drop.
>> >
>> > Jon
>> >
>> >>
>> >> Thanks,
>> >> Lijo
>> >>
>> >>> +		DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d.\n",
>> >>> +			  adev->gmc.xgmi.physical_node_id,
>> >>> +			  peer_adev->gmc.xgmi.physical_node_id);
>> >>> +	}
>> >>> +
>> >>> +	/*
>> >>> +	 * TBD - will update IP based bandwidth later.
>> >>> +	 * Bandwidth currently assumed to be x16 lanes x 25Gbps.
>> >>> +	 */
>> >>> +	return (num_links * 16 * 25000)/BITS_PER_BYTE;
>> >>>  }
>> >>>
>> >>>  bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> >>> index d1282b4c6348..325e7972469d 100644
>> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
>> >>> @@ -62,10 +62,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>> >>>  int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>> >>>  int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>> >>>  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
>> >>> -int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
>> >>> -		struct amdgpu_device *peer_adev);
>> >>> -int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
>> >>> -		struct amdgpu_device *peer_adev);
>> >>> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, struct amdgpu_device *peer_adev);
>> >>> +int amdgpu_xgmi_get_bandwidth_mbytes(struct amdgpu_device *adev, struct amdgpu_device *peer_adev);
>> >>>  bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
>> >>>  					struct amdgpu_device *peer_adev);
>> >>>  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
>> >>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> >>> index 70b3ae0b74fe..a787d192390c 100644
>> >>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> >>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
>> >>> @@ -2133,8 +2133,8 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
>> >>>  	bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
>> >>>  	int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
>> >>>  						KFD_CRAT_INTRA_SOCKET_WEIGHT;
>> >>> -	uint32_t bandwidth = ext_cpu ? amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
>> >>> -						kdev->adev, NULL, true) : mem_bw;
>> >>> +	uint32_t bandwidth = ext_cpu ? amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, NULL) :
>> >>> +						mem_bw;
>> >>>
>> >>>  	/*
>> >>>  	 * with host gpu xgmi link, host can access gpu memory whether
>> >>> @@ -2198,12 +2198,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
>> >>>
>> >>>  	if (use_ta_info) {
>> >>>  		sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
>> >>> -			amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
>> >>> +			amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
>> >>>  		sub_type_hdr->maximum_bandwidth_mbs =
>> >>> -			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev,
>> >>> -							peer_kdev->adev, false);
>> >>> +			amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, peer_kdev->adev);
>> >>>  		sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
>> >>> -			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
>> >>> +			amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, NULL) : 0;
>> >>>  	} else {
>> >>>  		bool is_single_hop = kdev->kfd == peer_kdev->kfd;
>> >>>  		int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
>> >
>
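------------------------------------------------------------------------

For reference, below is a rough sketch of how the interface agreed on at the top of this thread (amdgpu_xgmi_get_bandwidth with a unit_type selector and min/max out-parameters) might look. The names, the enum, the reuse of amdgpu_xgmi_get_num_links and the fixed x16 @ 25 Gbps per-link constant are assumptions drawn from this discussion, not the final implementation; with XGMI DPM in place the min value would come from FW/powerplay instead of being equal to the max.

enum amdgpu_xgmi_bw_mode {
	AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_LINK = 0,
	AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_PEER = 1,
};

int amdgpu_xgmi_get_bandwidth(struct amdgpu_device *adev,
			      struct amdgpu_device *peer_adev,
			      enum amdgpu_xgmi_bw_mode mode,
			      int *min_bw, int *max_bw)
{
	/*
	 * Placeholder per-link speed (x16 lanes x 25 Gbps) until IP/FW
	 * based reporting is wired up.
	 */
	int per_link = (16 * 25000) / BITS_PER_BYTE;
	int num_links;

	*min_bw = 0;
	*max_bw = 0;

	if (!adev->gmc.xgmi.supported)
		return 0;

	switch (mode) {
	case AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_LINK:
		/* Peer is ignored; report the range for a single link. */
		*min_bw = per_link;
		*max_bw = per_link;
		return 0;
	case AMDGPU_XGMI_BW_MBYTES_MIN_MAX_PER_PEER:
		if (!peer_adev)
			return -EINVAL;
		/* num_links is 0 for indirect peers, keeping min/max at 0. */
		num_links = amdgpu_xgmi_get_num_links(adev, peer_adev);
		if (num_links > 0) {
			*min_bw = num_links * per_link;
			*max_bw = num_links * per_link;
		}
		return 0;
	default:
		return -EINVAL;
	}
}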