On 11/28/2024 3:55 PM, Neil Armstrong wrote: > The Adreno GPU Management Unit (GMU) can also scale DDR Bandwidth along > the Frequency and Power Domain level, but by default we leave the > OPP core scale the interconnect ddr path. > > While scaling via the interconnect path was sufficient, newer GPUs > like the A750 requires specific vote paremeters and bandwidth to > achieve full functionality. > > In order to calculate vote values used by the GPU Management > Unit (GMU), we need to parse all the possible OPP Bandwidths and > create a vote value to be sent to the appropriate Bus Control > Modules (BCMs) declared in the GPU info struct. > > This vote value is called IB, while on the othe side the GMU also > takes another vote called AB which is a 16bit quantized value > of the bandwidth against the maximum supported bandwidth. > > The vote array will then be used to dynamically generate the GMU > bw_table sent during the GMU power-up. > > Signed-off-by: Neil Armstrong <neil.armstrong@xxxxxxxxxx> > --- > drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 174 ++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 14 +++ > drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + > drivers/gpu/drm/msm/adreno/a6xx_hfi.h | 5 + > 4 files changed, 194 insertions(+) > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > index 14db7376c712d19446b38152e480bd5a1e0a5198..ee2010a01186721dd377f1655fcf05ddaff77131 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c > @@ -9,6 +9,7 @@ > #include <linux/pm_domain.h> > #include <linux/pm_opp.h> > #include <soc/qcom/cmd-db.h> > +#include <soc/qcom/tcs.h> > #include <drm/drm_gem.h> > > #include "a6xx_gpu.h" > @@ -1287,6 +1288,131 @@ static int a6xx_gmu_memory_probe(struct a6xx_gmu *gmu) > return 0; > } > > +/** > + * struct bcm_db - Auxiliary data pertaining to each Bus Clock Manager (BCM) > + * @unit: divisor used to convert bytes/sec bw value to an RPMh msg > + * @width: multiplier used to convert bytes/sec bw value to an RPMh msg > + * @vcd: virtual clock domain that this bcm belongs to > + * @reserved: reserved field > + */ > +struct bcm_db { > + __le32 unit; > + __le16 width; > + u8 vcd; > + u8 reserved; > +}; > + > +static u64 bcm_div(u64 num, u32 base) > +{ > + /* Ensure that small votes aren't lost. */ > + if (num && num < base) > + return 1; > + > + do_div(num, base); > + > + return num; > +} > + > +static int a6xx_gmu_rpmh_bw_votes_init(const struct a6xx_info *info, > + struct a6xx_gmu *gmu) > +{ > + const struct bcm_db *bcm_data[GMU_MAX_BCMS] = { 0 }; > + unsigned int bcm_index, bw_index, bcm_count = 0; > + > + if (!info->bcms) > + return 0; > + > + /* Retrieve BCM data from cmd-db */ > + for (bcm_index = 0; bcm_index < GMU_MAX_BCMS; bcm_index++) { > + size_t count; > + > + /* Stop at first unconfigured bcm */ > + if (!info->bcms[bcm_index].name) > + break; > + > + bcm_data[bcm_index] = cmd_db_read_aux_data( > + info->bcms[bcm_index].name, > + &count); > + if (IS_ERR(bcm_data[bcm_index])) > + return PTR_ERR(bcm_data[bcm_index]); > + > + if (!count) > + return -EINVAL; > + > + ++bcm_count; > + } > + > + /* Generate BCM votes values for each bandwidth & BCM */ > + for (bw_index = 0; bw_index < gmu->nr_gpu_bws; bw_index++) { > + u32 *data = gmu->gpu_ib_votes[bw_index]; > + u32 bw = gmu->gpu_bw_table[bw_index]; > + > + /* Calculations loosely copied from bcm_aggregate() & tcs_cmd_gen() */ > + for (bcm_index = 0; bcm_index < bcm_count; bcm_index++) { > + bool commit = false; > + u64 peak, vote; > + u16 width; > + u32 unit; > + > + /* Skip unconfigured BCM */ > + if (!bcm_data[bcm_index]) > + continue; > + > + if (bcm_index == bcm_count - 1 || > + (bcm_data[bcm_index + 1] && > + bcm_data[bcm_index]->vcd != bcm_data[bcm_index + 1]->vcd)) > + commit = true; > + > + if (!bw) { > + data[bcm_index] = BCM_TCS_CMD(commit, false, 0, 0); > + continue; > + } > + > + if (info->bcms[bcm_index].fixed) { > + u32 perfmode = 0; > + > + if (bw >= info->bcms[bcm_index].perfmode_bw) > + perfmode = info->bcms[bcm_index].perfmode; > + > + data[bcm_index] = BCM_TCS_CMD(commit, true, 0, perfmode); > + continue; > + } > + > + /* Multiply the bandwidth by the width of the connection */ > + width = le16_to_cpu(bcm_data[bcm_index]->width); > + peak = bcm_div((u64)bw * width, info->bcms[bcm_index].buswidth); > + > + /* Input bandwidth value is in KBps, scale the value to BCM unit */ > + unit = le32_to_cpu(bcm_data[bcm_index]->unit); > + vote = bcm_div(peak * 1000ULL, unit); > + > + if (vote > BCM_TCS_CMD_VOTE_MASK) > + vote = BCM_TCS_CMD_VOTE_MASK; use clamp()? > + > + data[bcm_index] = BCM_TCS_CMD(commit, true, vote, vote); > + } > + } > + > + /* Generate AB votes which are a quantitized bandwidth value */ > + for (bw_index = 0; bw_index < gmu->nr_gpu_bws; bw_index++) { > + u64 tmp; > + > + /* > + * The AB vote consists of a 16 bit wide quantized level > + * against the maximum supported bandwidth. > + * Quantization can be calculated as below: > + * vote = (bandwidth * 2^16) / max bandwidth > + */ > + tmp = gmu->gpu_bw_table[bw_index] * MAX_AB_VOTE; > + > + /* Divide by the maximum bandwidth to get a quantized value */ > + gmu->gpu_ab_votes[bw_index] = > + bcm_div(tmp, gmu->gpu_bw_table[gmu->nr_gpu_bws - 1]); > + } So I suppose you are trying to vote AB equal to IB. Aggregation logic for both are different. So this will make DDR scale very aggressively. A more reasonable approach would be to vote a % of IB vote (25%?). Ideally we should measure GPU's bandwidth usage and vote that if you are really concerned about stability issues. IB is just a floor vote, GPU can generate way higher traffic. > + > + return 0; > +} > + > /* Return the 'arc-level' for the given frequency */ > static unsigned int a6xx_gmu_get_arc_level(struct device *dev, > unsigned long freq) > @@ -1390,12 +1516,15 @@ static int a6xx_gmu_rpmh_arc_votes_init(struct device *dev, u32 *votes, > * The GMU votes with the RPMh for itself and on behalf of the GPU but we need > * to construct the list of votes on the CPU and send it over. Query the RPMh > * voltage levels and build the votes > + * The GMU can also vote for DDR interconnects, use the OPP bandwidth entries > + * and BCM parameters to build the votes. > */ > > static int a6xx_gmu_rpmh_votes_init(struct a6xx_gmu *gmu) > { > struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); > struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; > + const struct a6xx_info *info = adreno_gpu->info->a6xx; > struct msm_gpu *gpu = &adreno_gpu->base; > int ret; > > @@ -1407,6 +1536,10 @@ static int a6xx_gmu_rpmh_votes_init(struct a6xx_gmu *gmu) > ret |= a6xx_gmu_rpmh_arc_votes_init(gmu->dev, gmu->cx_arc_votes, > gmu->gmu_freqs, gmu->nr_gmu_freqs, "cx.lvl"); > > + /* Build the interconnect votes */ > + if (info->bcms && gmu->nr_gpu_bws > 1) > + ret |= a6xx_gmu_rpmh_bw_votes_init(info, gmu); > + > return ret; > } > > @@ -1442,10 +1575,43 @@ static int a6xx_gmu_build_freq_table(struct device *dev, unsigned long *freqs, > return index; > } > > +static int a6xx_gmu_build_bw_table(struct device *dev, unsigned long *bandwidths, > + u32 size) > +{ > + int count = dev_pm_opp_get_opp_count(dev); I am less concerned about this now since you are not voting real AB BW. -Akhil. > + struct dev_pm_opp *opp; > + int i, index = 0; > + unsigned int bandwidth = 1; > + > + /* > + * The OPP table doesn't contain the "off" bandwidth level so we need to > + * add 1 to the table size to account for it > + */ > + > + if (WARN(count + 1 > size, > + "The GMU bandwidth table is being truncated\n")) > + count = size - 1; > + > + /* Set the "off" bandwidth */ > + bandwidths[index++] = 0; > + > + for (i = 0; i < count; i++) { > + opp = dev_pm_opp_find_bw_ceil(dev, &bandwidth, 0); > + if (IS_ERR(opp)) > + break; > + > + dev_pm_opp_put(opp); > + bandwidths[index++] = bandwidth++; > + } > + > + return index; > +} > + > static int a6xx_gmu_pwrlevels_probe(struct a6xx_gmu *gmu) > { > struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, gmu); > struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; > + const struct a6xx_info *info = adreno_gpu->info->a6xx; > struct msm_gpu *gpu = &adreno_gpu->base; > > int ret = 0; > @@ -1472,6 +1638,14 @@ static int a6xx_gmu_pwrlevels_probe(struct a6xx_gmu *gmu) > > gmu->current_perf_index = gmu->nr_gpu_freqs - 1; > > + /* > + * The GMU also handles GPU Interconnect Votes so build a list > + * of DDR bandwidths from the GPU OPP table > + */ > + if (info->bcms) > + gmu->nr_gpu_bws = a6xx_gmu_build_bw_table(&gpu->pdev->dev, > + gmu->gpu_bw_table, ARRAY_SIZE(gmu->gpu_bw_table)); > + > /* Build the list of RPMh votes that we'll send to the GMU */ > return a6xx_gmu_rpmh_votes_init(gmu); > } > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > index 88f18ea6a38a08b5b171709e5020010947a5d347..bdfc106cb3a578c90d7cd84f7d4fe228d761a994 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h > @@ -21,6 +21,15 @@ struct a6xx_gmu_bo { > > #define GMU_MAX_GX_FREQS 16 > #define GMU_MAX_CX_FREQS 4 > +#define GMU_MAX_BCMS 3 > + > +struct a6xx_bcm { > + char *name; > + unsigned int buswidth; > + bool fixed; > + unsigned int perfmode; > + unsigned int perfmode_bw; > +}; > > /* > * These define the different GMU wake up options - these define how both the > @@ -85,6 +94,11 @@ struct a6xx_gmu { > unsigned long gpu_freqs[GMU_MAX_GX_FREQS]; > u32 gx_arc_votes[GMU_MAX_GX_FREQS]; > > + int nr_gpu_bws; > + unsigned long gpu_bw_table[GMU_MAX_GX_FREQS]; > + u32 gpu_ib_votes[GMU_MAX_GX_FREQS][GMU_MAX_BCMS]; > + u16 gpu_ab_votes[GMU_MAX_GX_FREQS]; > + > int nr_gmu_freqs; > unsigned long gmu_freqs[GMU_MAX_CX_FREQS]; > u32 cx_arc_votes[GMU_MAX_CX_FREQS]; > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > index 4aceffb6aae89c781facc2a6e4a82b20b341b6cb..9201a53dd341bf432923ffb44947e015208a3d02 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h > @@ -44,6 +44,7 @@ struct a6xx_info { > u32 gmu_chipid; > u32 gmu_cgc_mode; > u32 prim_fifo_threshold; > + const struct a6xx_bcm *bcms; > }; > > struct a6xx_gpu { > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h > index 528110169398f69f16443a29a1594d19c36fb595..52ba4a07d7b9a709289acd244a751ace9bdaab5d 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_hfi.h > +++ b/drivers/gpu/drm/msm/adreno/a6xx_hfi.h > @@ -173,6 +173,11 @@ struct a6xx_hfi_gx_bw_perf_vote_cmd { > u32 bw; > }; > > +#define AB_VOTE_MASK GENMASK(31, 16) > +#define MAX_AB_VOTE (FIELD_MAX(AB_VOTE_MASK) - 1) > +#define AB_VOTE(vote) FIELD_PREP(AB_VOTE_MASK, (vote)) > +#define AB_VOTE_ENABLE BIT(8) > + > #define HFI_H2F_MSG_PREPARE_SLUMBER 33 > > struct a6xx_hfi_prep_slumber_cmd { >