On 11/27/2024 9:17 PM, neil.armstrong@xxxxxxxxxx wrote: > On 27/11/2024 16:29, Akhil P Oommen wrote: >> On 11/25/2024 1:42 PM, Neil Armstrong wrote: >>> On 23/11/2024 23:46, Akhil P Oommen wrote: >>>> On Sun, Nov 24, 2024 at 02:52:46AM +0530, Akhil P Oommen wrote: >>>>> On Tue, Nov 19, 2024 at 06:56:40PM +0100, Neil Armstrong wrote: >>>>>> The Adreno GPU Management Unit (GMU) can also scale DDR Bandwidth >>>>>> along >>>>>> the Frequency and Power Domain level, but by default we let the >>>>>> OPP core scale the interconnect ddr path. >>>>>> >>>>>> In order to calculate vote values used by the GPU Management >>>>>> Unit (GMU), we need to parse all the possible OPP Bandwidths and >>>>> >>>>> GMU expects a table of votes for each DDR frequency corner. Can we >>>>> please try to figure out a way to do that? Generally, we should >>>>> ensure the >>>>> data that is sent to GMU firmware matches downstream exactly. Because, >>>>> when something breaks in firmware or worse, at SoC level, it will be >>>>> pretty >>>>> hard to narrow down the issue. So, I prefer to be very conservative >>>>> about >>>>> this. >>>>> >>>>> KGSL keeps the ddr frequency table in the devicetree. That helps to >>>>> keep >>>>> the driver lean, but I am not sure if that is viable upstream. >>> >>> No it's not, opp table is here for that, and we can reproduce the same >>> behaviour by parsing all the bandwidths in the opp table. >>> >>>>> >>>>> -Akhil. >>>>> >>>>>> create a vote value to be sent to the appropriate Bus Control >>>>>> Modules (BCMs) declared in the GPU info struct. >>>>>> >>>>>> The vote array will then be used to dynamically generate the GMU >>>>>> bw_table sent during the GMU power-up. 
>>>>>> >>>>>> Signed-off-by: Neil Armstrong <neil.armstrong@xxxxxxxxxx> >>>>>> --- >>>>>> drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 153 ++++++++++++++++++++++ >>>>>> ++++++++++++ >>>>>> drivers/gpu/drm/msm/adreno/a6xx_gmu.h | 14 ++++ >>>>>> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + >>>>>> 3 files changed, 168 insertions(+) >>>>>> >>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/ >>>>>> drm/msm/adreno/a6xx_gmu.c >>>>>> index >>>>>> 14db7376c712d19446b38152e480bd5a1e0a5198..f6814d92a4edb29ba8a34a34aabb8b2324e9c6a4 100644 >>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c >>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c >>>>>> @@ -9,6 +9,7 @@ >>>>>> #include <linux/pm_domain.h> >>>>>> #include <linux/pm_opp.h> >>>>>> #include <soc/qcom/cmd-db.h> >>>>>> +#include <soc/qcom/tcs.h> >>>>>> #include <drm/drm_gem.h> >>>>>> #include "a6xx_gpu.h" >>>>>> @@ -1287,6 +1288,109 @@ static int a6xx_gmu_memory_probe(struct >>>>>> a6xx_gmu *gmu) >>>>>> return 0; >>>>>> } >>>>>> +/** >>>>>> + * struct bcm_db - Auxiliary data pertaining to each Bus Clock >>>>>> Manager (BCM) >>>>>> + * @unit: divisor used to convert bytes/sec bw value to an RPMh msg >>>>>> + * @width: multiplier used to convert bytes/sec bw value to an RPMh >>>>>> msg >>>>>> + * @vcd: virtual clock domain that this bcm belongs to >>>>>> + * @reserved: reserved field >>>>>> + */ >>>>>> +struct bcm_db { >>>>>> + __le32 unit; >>>>>> + __le16 width; >>>>>> + u8 vcd; >>>>>> + u8 reserved; >>>>>> +}; >>>> >>>> Shouldn't this be a packed struct? It is naturally aligned, but still! >>> >>> It's used as-is in the interconnect driver, so I assume it's fine. >>> >>>> >>>>>> + >>>>>> +static u64 bcm_div(u64 num, u32 base) >>>>>> +{ >>>>>> + /* Ensure that small votes aren't lost. 
*/ >>>>>> + if (num && num < base) >>>>>> + return 1; >>>>>> + >>>>>> + do_div(num, base); >>>>>> + >>>>>> + return num; >>>>>> +} >>>>>> + >>>>>> +static int a6xx_gmu_rpmh_bw_votes_init(const struct a6xx_info *info, >>>>>> + struct a6xx_gmu *gmu) >>>>>> +{ >>>>>> + const struct bcm_db *bcm_data[GMU_MAX_BCMS] = { 0 }; >>>>>> + unsigned int bcm_index, bw_index; >>>>>> + >>>>>> + /* Retrieve BCM data from cmd-db */ >>>>>> + for (bcm_index = 0; bcm_index < GMU_MAX_BCMS; bcm_index++) { >>>>>> + size_t count; >>>>>> + >>>>>> + /* Skip unconfigured BCM */ >>>>>> + if (!info->bcm[bcm_index].name) >>>>>> + continue; >>>>>> + >>>>>> + bcm_data[bcm_index] = cmd_db_read_aux_data( >>>>>> + info->bcm[bcm_index].name, >>>>>> + &count); >>>>>> + if (IS_ERR(bcm_data[bcm_index])) >>>>>> + return PTR_ERR(bcm_data[bcm_index]); >>>>>> + >>>>>> + if (!count) >>>>>> + return -EINVAL; >>>>>> + } >>>>>> + >>>>>> + /* Generate BCM votes values for each bandwidth & BCM */ >>>>>> + for (bw_index = 0; bw_index < gmu->nr_gpu_bws; bw_index++) { >>>>>> + u32 *data = gmu->gpu_bw_votes[bw_index]; >>>>>> + u32 bw = gmu->gpu_bw_table[bw_index]; >>>>>> + >>>>>> + /* Calculations loosely copied from bcm_aggregate() & >>>>>> tcs_cmd_gen() */ >>>>>> + for (bcm_index = 0; bcm_index < GMU_MAX_BCMS; bcm_index++) { >>>>>> + bool commit = false; >>>>>> + u64 peak, vote; >>>>>> + u16 width; >>>>>> + u32 unit; >>>>>> + >>>>>> + /* Skip unconfigured BCM */ >>>>>> + if (!info->bcm[bcm_index].name || !bcm_data[bcm_index]) >>>>>> + continue; >>>>>> + >>>>>> + if (bcm_index == GMU_MAX_BCMS - 1 || >>>>>> + (bcm_data[bcm_index + 1] && >>>>>> + bcm_data[bcm_index]->vcd != bcm_data[bcm_index + >>>>>> 1]->vcd)) >>>>>> + commit = true; >>>>>> + >>>>>> + if (!bw) { >>>>>> + data[bcm_index] = BCM_TCS_CMD(commit, false, 0, 0); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + if (info->bcm[bcm_index].fixed) { >>>>>> + u32 perfmode = 0; >>>>>> + >>>>>> + if (bw >= info->bcm[bcm_index].perfmode_bw) >>>>>> + perfmode = 
info->bcm[bcm_index].perfmode; >>>>>> + >>>>>> + data[bcm_index] = BCM_TCS_CMD(commit, true, 0, >>>>>> perfmode); >>>>>> + continue; >>>>>> + } >>>>>> + >>>>>> + /* Multiply the bandwidth by the width of the >>>>>> connection */ >>>>>> + width = le16_to_cpu(bcm_data[bcm_index]->width); >>>>>> + peak = bcm_div((u64)bw * width, info- >>>>>>> bcm[bcm_index].buswidth); >>>>>> + >>>>>> + /* Input bandwidth value is in KBps, scale the value to >>>>>> BCM unit */ >>>>>> + unit = le32_to_cpu(bcm_data[bcm_index]->unit); >>>>>> + vote = bcm_div(peak * 1000ULL, unit); >>>>>> + >>>>>> + if (vote > BCM_TCS_CMD_VOTE_MASK) >>>>>> + vote = BCM_TCS_CMD_VOTE_MASK; >>>>>> + >>>>>> + data[bcm_index] = BCM_TCS_CMD(commit, true, vote, vote); >>>>>> + } >>>>>> + } >>>>>> + >>>>>> + return 0; >>>>>> +} >>>>>> + >>>>>> /* Return the 'arc-level' for the given frequency */ >>>>>> static unsigned int a6xx_gmu_get_arc_level(struct device *dev, >>>>>> unsigned long freq) >>>>>> @@ -1390,12 +1494,15 @@ static int >>>>>> a6xx_gmu_rpmh_arc_votes_init(struct device *dev, u32 *votes, >>>>>> * The GMU votes with the RPMh for itself and on behalf of the GPU >>>>>> but we need >>>>>> * to construct the list of votes on the CPU and send it over. >>>>>> Query the RPMh >>>>>> * voltage levels and build the votes >>>>>> + * The GMU can also vote for DDR interconnects, use the OPP >>>>>> bandwidth entries >>>>>> + * and BCM parameters to build the votes. 
>>>>>> */ >>>>>> static int a6xx_gmu_rpmh_votes_init(struct a6xx_gmu *gmu) >>>>>> { >>>>>> struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, >>>>>> gmu); >>>>>> struct adreno_gpu *adreno_gpu = &a6xx_gpu->base; >>>>>> + const struct a6xx_info *info = adreno_gpu->info->a6xx; >>>>>> struct msm_gpu *gpu = &adreno_gpu->base; >>>>>> int ret; >>>>>> @@ -1407,6 +1514,10 @@ static int a6xx_gmu_rpmh_votes_init(struct >>>>>> a6xx_gmu *gmu) >>>>>> ret |= a6xx_gmu_rpmh_arc_votes_init(gmu->dev, gmu- >>>>>> >cx_arc_votes, >>>>>> gmu->gmu_freqs, gmu->nr_gmu_freqs, "cx.lvl"); >>>>>> + /* Build the interconnect votes */ >>>>>> + if (adreno_gpu->info->features & ADRENO_FEAT_GMU_BW_VOTE) >>>>>> + ret |= a6xx_gmu_rpmh_bw_votes_init(info, gmu); >>>>>> + >>>>>> return ret; >>>>>> } >>>>>> @@ -1442,6 +1553,38 @@ static int a6xx_gmu_build_freq_table(struct >>>>>> device *dev, unsigned long *freqs, >>>>>> return index; >>>>>> } >>>>>> +static int a6xx_gmu_build_bw_table(struct device *dev, unsigned >>>>>> long *bandwidths, >>>>>> + u32 size) >>>>>> +{ >>>>>> + int count = dev_pm_opp_get_opp_count(dev); >>>> >>>> I suppose this doesn't count the opps which are not supported by the >>>> SKU. If we can go through *all* OPPs in the opp table irrespective of >>>> the SKU, we will get something close to a full DDR bw table I mentioned >>>> in the previous mail. >>> >>> It parses _all_ bandwidths declared in the opp table, without any >>> discard, >>> so yes we're close to a full DDR table. It only lacks a few low >>> bandwidths. >>> >> Isn't opp_table->opp_list created after filtering with supported_hw >> bitmask? > > Sure, but opp filtering is not supported on a7xx, and nevertheless only > the high OPPs would be filtered out, so I don't understand why it would > matter ? 
> > As of today, the gx_arc_votes are already constructed the exact same way > I build the BW vote table, so why should the BW table be the ____exact____ > same as downstream while we don't care about the gx_arc_votes ? > Sorry to comment on the older revision, but how is the arc votes table different from KGSL's? Just want to check if we have a problem. It should match the downstream driver exactly. -Akhil > Neil > >> >> -Akhil. >>>> >>>>>> + struct dev_pm_opp *opp; >>>>>> + int i, index = 0; >>>>>> + unsigned int bandwidth = 1; >>>>>> + >>>>>> + /* >>>>>> + * The OPP table doesn't contain the "off" bandwidth level so >>>>>> we need to >>>>>> + * add 1 to the table size to account for it >>>>>> + */ >>>>>> + >>>>>> + if (WARN(count + 1 > size, >>>>>> + "The GMU bandwidth table is being truncated\n")) >>>>>> + count = size - 1; >>>>>> + >>>>>> + /* Set the "off" bandwidth */ >>>>>> + bandwidths[index++] = 0; >>>>>> + >>>>>> + for (i = 0; i < count; i++) { >>>>>> + opp = dev_pm_opp_find_bw_ceil(dev, &bandwidth, 0); >>>>>> + if (IS_ERR(opp)) >>>>>> + break; >>>>>> + >>>>>> + dev_pm_opp_put(opp); >>>>>> + bandwidths[index++] = bandwidth++; >>>>>> + } >>>>>> + >>>>>> + return index; >>>>>> +} >>>>>> + >>>>>> static int a6xx_gmu_pwrlevels_probe(struct a6xx_gmu *gmu) >>>>>> { >>>>>> struct a6xx_gpu *a6xx_gpu = container_of(gmu, struct a6xx_gpu, >>>>>> gmu); >>>>>> @@ -1472,6 +1615,16 @@ static int a6xx_gmu_pwrlevels_probe(struct >>>>>> a6xx_gmu *gmu) >>>>>> gmu->current_perf_index = gmu->nr_gpu_freqs - 1; >>>>>> + /* >>>>>> + * The GMU also handles GPU Interconnect Votes so build a list >>>>>> + * of DDR bandwidths from the GPU OPP table >>>>>> + */ >>>>>> + if (adreno_gpu->info->features & ADRENO_FEAT_GMU_BW_VOTE) >>>>>> + gmu->nr_gpu_bws = a6xx_gmu_build_bw_table(&gpu->pdev->dev, >>>>>> + gmu->gpu_bw_table, ARRAY_SIZE(gmu->gpu_bw_table)); >>>>>> + >>>>>> + gmu->current_perf_index = gmu->nr_gpu_freqs - 1; >>>> >>>> duplicate line. 
>>>> >>>>>> + >>>>>> /* Build the list of RPMh votes that we'll send to the GMU */ >>>>>> return a6xx_gmu_rpmh_votes_init(gmu); >>>>>> } >>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h b/drivers/gpu/ >>>>>> drm/msm/adreno/a6xx_gmu.h >>>>>> index >>>>>> b4a79f88ccf45cfe651c86d2a9da39541c5772b3..03603eadc0f9ed866899c95e99f333a511ebc3c1 100644 >>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.h >>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.h >>>>>> @@ -19,6 +19,16 @@ struct a6xx_gmu_bo { >>>>>> u64 iova; >>>>>> }; >>>>>> +#define GMU_MAX_BCMS 3 >>>>>> + >>>>>> +struct a6xx_bcm { >>>>>> + char *name; >>>>>> + unsigned int buswidth; >>>>>> + bool fixed; >>>>>> + unsigned int perfmode; >>>>>> + unsigned int perfmode_bw; >>>>>> +}; >>>>>> + >>>>>> /* >>>>>> * These define the different GMU wake up options - these define >>>>>> how both the >>>>>> * CPU and the GMU bring up the hardware >>>>>> @@ -82,6 +92,10 @@ struct a6xx_gmu { >>>>>> unsigned long gpu_freqs[16]; >>>>>> u32 gx_arc_votes[16]; >>>>>> + int nr_gpu_bws; >>>>>> + unsigned long gpu_bw_table[16]; >>>>>> + u32 gpu_bw_votes[16][GMU_MAX_BCMS]; >>>>>> + >>>>>> int nr_gmu_freqs; >>>>>> unsigned long gmu_freqs[4]; >>>>>> u32 cx_arc_votes[4]; >>>>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/ >>>>>> drm/msm/adreno/a6xx_gpu.h >>>>>> index >>>>>> 4aceffb6aae89c781facc2a6e4a82b20b341b6cb..5b80919e595fa1ba0a3afcca55feb89e60870cb1 100644 >>>>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h >>>>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h >>>>>> @@ -44,6 +44,7 @@ struct a6xx_info { >>>>>> u32 gmu_chipid; >>>>>> u32 gmu_cgc_mode; >>>>>> u32 prim_fifo_threshold; >>>>>> + const struct a6xx_bcm bcm[GMU_MAX_BCMS]; >>>> >>>> This table is duplicated a lot. Let's keep a pointer instead. We can >>>> probably use >>>> this pointer as a flag to check for GMU_IB_VOTE support too. >>> >>> It's partially duplicated, basically only the perfmode bits and level are >>> different. 
>>> >>> We can move it out when we support more GPUs with this feature. >>> >>> Neil >>> >>>> >>>> -Akhil >>>> >>>>>> }; >>>>>> struct a6xx_gpu { >>>>>> >>>>>> -- >>>>>> 2.34.1 >>>>>> >>> >> >