On 7/31/19 22:06, Evan Green wrote: > On Tue, Jul 30, 2019 at 5:37 PM David Dai <daidavid1@xxxxxxxxxxxxxx> wrote: >> >> >> On 7/30/2019 3:54 PM, Evan Green wrote: >>> On Thu, Jul 18, 2019 at 10:59 AM David Dai <daidavid1@xxxxxxxxxxxxxx> wrote: >>>> On 7/16/2019 1:15 PM, Evan Green wrote: >>>>> On Mon, Jul 15, 2019 at 4:34 PM David Dai <daidavid1@xxxxxxxxxxxxxx> wrote: >>>>>> Hi Evan, >>>>>> >>>>>> Thanks for the continued help in reviewing these patches! >>>>> No problem. I want to do more, but haven't found time to do the >>>>> prerequisite research before jumping into some of the other >>>>> discussions yet. >>>>> >>>>>> On 7/11/2019 10:06 AM, Evan Green wrote: >>>>>>> Hi Georgi and David, >>>>>>> >>>>>>> On Tue, Jun 18, 2019 at 2:17 AM Georgi Djakov <georgi.djakov@xxxxxxxxxx> wrote: >>>>>>>> From: David Dai <daidavid1@xxxxxxxxxxxxxx> >>>>>>>> >>>>>>>> Add support for wake and sleep commands by using a tag to indicate >>>>>>>> whether or not the aggregate and set requests fall into execution >>>>>>>> state specific bucket. >>>>>>>> >>>>>>>> Signed-off-by: David Dai <daidavid1@xxxxxxxxxxxxxx> >>>>>>>> Signed-off-by: Georgi Djakov <georgi.djakov@xxxxxxxxxx> >>>>>>>> --- >>>>>>>> drivers/interconnect/qcom/sdm845.c | 129 ++++++++++++++++++++++------- >>>>>>>> 1 file changed, 98 insertions(+), 31 deletions(-) >>>>>>>> >>>>>>>> diff --git a/drivers/interconnect/qcom/sdm845.c b/drivers/interconnect/qcom/sdm845.c >>>>>>>> index fb526004c82e..c100aab39415 100644 >>>>>>>> --- a/drivers/interconnect/qcom/sdm845.c >>>>>>>> +++ b/drivers/interconnect/qcom/sdm845.c >>>>>>>> @@ -66,6 +66,17 @@ struct bcm_db { >>>>>>>> #define SDM845_MAX_BCM_PER_NODE 2 >>>>>>>> #define SDM845_MAX_VCD 10 >>>>>>>> >>>>>>>> +#define QCOM_ICC_BUCKET_AMC 0 >>>>>>> What is AMC again? Is it the "right now" bucket? Maybe a comment on >>>>>>> the meaning of this bucket would be helpful. >>>>>> That's correct. Will add a comment for this. >>>>>>>> +#define QCOM_ICC_BUCKET_WAKE 1 >>>>>>>> +#define QCOM_ICC_BUCKET_SLEEP 2 >>>>>>>> +#define QCOM_ICC_NUM_BUCKETS 3 >>>>>>>> +#define QCOM_ICC_TAG_AMC BIT(QCOM_ICC_BUCKET_AMC) >>>>>>>> +#define QCOM_ICC_TAG_WAKE BIT(QCOM_ICC_BUCKET_WAKE) >>>>>>>> +#define QCOM_ICC_TAG_SLEEP BIT(QCOM_ICC_BUCKET_SLEEP) >>>>>>>> +#define QCOM_ICC_TAG_ACTIVE_ONLY (QCOM_ICC_TAG_AMC | QCOM_ICC_TAG_WAKE) >>>>>>>> +#define QCOM_ICC_TAG_ALWAYS (QCOM_ICC_TAG_AMC | QCOM_ICC_TAG_WAKE |\ >>>>>>>> + QCOM_ICC_TAG_SLEEP) >>>>>>>> + >>>>>>>> /** >>>>>>>> * struct qcom_icc_node - Qualcomm specific interconnect nodes >>>>>>>> * @name: the node name used in debugfs >>>>>>>> @@ -75,7 +86,9 @@ struct bcm_db { >>>>>>>> * @channels: num of channels at this node >>>>>>>> * @buswidth: width of the interconnect between a node and the bus >>>>>>>> * @sum_avg: current sum aggregate value of all avg bw requests >>>>>>>> + * @sum_avg_cached: previous sum aggregate value of all avg bw requests >>>>>>>> * @max_peak: current max aggregate value of all peak bw requests >>>>>>>> + * @max_peak_cached: previous max aggregate value of all peak bw requests >>>>>>>> * @bcms: list of bcms associated with this logical node >>>>>>>> * @num_bcms: num of @bcms >>>>>>>> */ >>>>>>>> @@ -86,8 +99,10 @@ struct qcom_icc_node { >>>>>>>> u16 num_links; >>>>>>>> u16 channels; >>>>>>>> u16 buswidth; >>>>>>>> - u64 sum_avg; >>>>>>>> - u64 max_peak; >>>>>>>> + u64 sum_avg[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> + u64 sum_avg_cached[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> + u64 max_peak[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> + u64 max_peak_cached[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> struct qcom_icc_bcm *bcms[SDM845_MAX_BCM_PER_NODE]; >>>>>>>> size_t num_bcms; >>>>>>>> }; >>>>>>>> @@ -112,8 +127,8 @@ struct qcom_icc_bcm { >>>>>>>> const char *name; >>>>>>>> u32 type; >>>>>>>> u32 addr; >>>>>>>> - u64 vote_x; >>>>>>>> - u64 vote_y; >>>>>>>> + u64 vote_x[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> + u64 vote_y[QCOM_ICC_NUM_BUCKETS]; >>>>>>>> bool dirty; >>>>>>>> bool keepalive; >>>>>>>> struct bcm_db aux_data; >>>>>>>> @@ -555,7 +570,7 @@ inline void tcs_cmd_gen(struct tcs_cmd *cmd, u64 vote_x, u64 vote_y, >>>>>>>> cmd->wait = true; >>>>>>>> } >>>>>>>> >>>>>>>> -static void tcs_list_gen(struct list_head *bcm_list, >>>>>>>> +static void tcs_list_gen(struct list_head *bcm_list, int bucket, >>>>>>>> struct tcs_cmd tcs_list[SDM845_MAX_VCD], >>>>>>>> int n[SDM845_MAX_VCD]) >>>>>>>> { >>>>>>>> @@ -573,8 +588,8 @@ static void tcs_list_gen(struct list_head *bcm_list, >>>>>>>> commit = true; >>>>>>>> cur_vcd_size = 0; >>>>>>>> } >>>>>>>> - tcs_cmd_gen(&tcs_list[idx], bcm->vote_x, bcm->vote_y, >>>>>>>> - bcm->addr, commit); >>>>>>>> + tcs_cmd_gen(&tcs_list[idx], bcm->vote_x[bucket], >>>>>>>> + bcm->vote_y[bucket], bcm->addr, commit); >>>>>>>> idx++; >>>>>>>> n[batch]++; >>>>>>>> /* >>>>>>>> @@ -595,32 +610,39 @@ static void tcs_list_gen(struct list_head *bcm_list, >>>>>>>> >>>>>>>> static void bcm_aggregate(struct qcom_icc_bcm *bcm) >>>>>>>> { >>>>>>>> - size_t i; >>>>>>>> - u64 agg_avg = 0; >>>>>>>> - u64 agg_peak = 0; >>>>>>>> + size_t i, bucket; >>>>>>>> + u64 agg_avg[QCOM_ICC_NUM_BUCKETS] = {0}; >>>>>>>> + u64 agg_peak[QCOM_ICC_NUM_BUCKETS] = {0}; >>>>>>>> u64 temp; >>>>>>>> >>>>>>>> - for (i = 0; i < bcm->num_nodes; i++) { >>>>>>>> - temp = bcm->nodes[i]->sum_avg * bcm->aux_data.width; >>>>>>>> - do_div(temp, bcm->nodes[i]->buswidth * bcm->nodes[i]->channels); >>>>>>>> - agg_avg = max(agg_avg, temp); >>>>>>>> + for (bucket = 0; bucket < QCOM_ICC_NUM_BUCKETS; bucket++) { >>>>>>>> + for (i = 0; i < bcm->num_nodes; i++) { >>>>>>>> + temp = bcm->nodes[i]->sum_avg_cached[bucket] * bcm->aux_data.width; >>>>>>>> + do_div(temp, bcm->nodes[i]->buswidth * bcm->nodes[i]->channels); >>>>>>>> + agg_avg[bucket] = max(agg_avg[bucket], temp); >>>>>>>> >>>>>>>> - temp = bcm->nodes[i]->max_peak * bcm->aux_data.width; >>>>>>>> - do_div(temp, bcm->nodes[i]->buswidth); >>>>>>> Why is it that this one doesn't have the multiply by >>>>>>> bcm->nodes[i]->channels again? I can't recall if there was a reason. >>>>>>> If it's correct maybe it deserves a comment. >>>>>> I think the rationale behind this is generally for consumers to target a >>>>>> certain minimum threshold to satisfy some structural latency >>>>>> requirements as opposed to strictly throughput, and it may be easier for >>>>>> consumers to reuse certain values to support hitting some minimum NoC >>>>>> frequencies without having to be concerned with the number of channels >>>>>> that may change from platform to platform. >>>>> I was mostly pointing out that sum_avg seems to have the multiply, but >>>>> max_peak does not. I would have expected those two things to be of the >>>>> same units, and get the same treatment. Maybe the hardware is taking >>>>> in different final units for that field, one that is per-channel and >>>>> one that isn't? >>>> The hardware isn't treating the values differently. I couldn't find any >>>> justification other than the intuition mentioned above for the ease of >>>> voting from the consumer perspective. The consumer would know that this >>>> peak_bw value results in some floor performance from the system to >>>> satisfy its latency requirements. The same approach would work if we >>>> accounted for the number of channels as well, but given that channels >>>> may vary from platform to platform or even on the same platform that >>>> shares multiple channel configurations(DDR), it can be difficult for >>>> consumers to keep track of and have to adjust their votes constantly(to >>>> try to hit some frequency/latency requirement, this intuition doesn't >>>> apply for avg_bw since we're concerned with throughput in that case). >>>> >>>>>>>> - agg_peak = max(agg_peak, temp); >>>>>>>> - } >>>>>>>> + temp = bcm->nodes[i]->max_peak_cached[bucket] * bcm->aux_data.width; >>>>>>>> + do_div(temp, bcm->nodes[i]->buswidth); >>>>>>>> + agg_peak[bucket] = max(agg_peak[bucket], temp); >>>>>>>> >>>>>>>> - temp = agg_avg * 1000ULL; >>>>>>>> - do_div(temp, bcm->aux_data.unit); >>>>>>>> - bcm->vote_x = temp; >>>>>>>> + bcm->nodes[i]->sum_avg[bucket] = 0; >>>>>>>> + bcm->nodes[i]->max_peak[bucket] = 0; >>>>>>> I don't understand the sum_avg vs sum_avg_cached. Here's what I understand: >>>>>>> 1. qcom_icc_aggregate() does the math from the incoming values on >>>>>>> sum_avg, and then clobbers sum_avg_cached with those values. >>>>>>> 2. bcm_aggregate() uses sum_avg_cached in its calculations, then clears sum_avg. >>>>>>> >>>>>>> But I don't get why that's needed. Why not just have sum_avg? Wouldn't >>>>>>> it work the same? Ok, it wouldn't if you ended up calling >>>>>>> bcm_aggregate() multiple times on the same bcm. But you have a dirty >>>>>>> flag that prevents this from happening. So I think it's safe to remove >>>>>>> the cached arrays, and just clear out the sum_avg when you aggregate. >>>>>> You are correct in that the dirty flag would prevent another repeat of >>>>>> the bcm_aggregate() call in the same icc_set request. But consider a >>>>>> following icc_set request on a different node that shares the same BCM, >>>>>> the next bcm_aggregate() would result in an incorrect aggregate sum_avg >>>>>> for the BCM since the avg_sum from the previous node(from the previous >>>>>> icc_set) was cleared out. We need a way to retain the current state of >>>>>> all nodes to accurately aggregate the bw values for the BCM. >>>>> I don't get it. qcom_icc_aggregate() clobbers sum_avg_cached. So >>>>> they're only ever a) equal, like after qcom_icc_aggregate(), or b) >>>>> sum_avg is zeroed, and sum_avg_cached is its old value. A new >>>>> icc_set_bw() would call aggregate_requests(), which would clobber >>>>> sum_avg_cached to sum_avg for every BCM involved. Then the core would >>>>> call apply_constraints(), then qcom_icc_set(), which would use >>>>> sum_avg_cached, and clear out sum_avg, being sure with the dirty flag >>>>> that bcm_aggregate() is only called once per BCM. This all happens >>>>> under the mutex held in the core. A new request would start the whole >>>>> thing over, since sum_avg is cleared. It seems to me that flow would >>>>> work the same with one array as it does with two. Maybe you can walk >>>>> me through a scenario? >>>>> -Evan >>>> Let's walk through the scenario you've just described with the >>>> assumption that there's only one avg_sum value per node with two >>>> icc_set_bw() requests on two different nodes(say 2MB for node 1 and 1MB >>>> for node 2) under the same BCM(say BCM A). The first >>>> qcom_icc_aggregate() aggregates to a 2MB avg_sum at the node1 followed >>>> by apply_constraints(), qcom_icc_set(), bcm_aggregate() which causes BCM >>>> A to aggregate to max(node1->avg_sum, node2->avg_sum) and reach a vote_x >>>> of 2MB(for simplicity let's ignore unit). We then clear out >>>> node1->avg_sum before we start the next icc_set_bw(). In the following >>>> icc_set_bw(), the qcom_icc_aggregate() aggregates to 1MB in node2 >>>> followed by apply_constraints(), qcom_icc_set(), bcm_aggregate(), but >>>> now incorrectly aggregates BCM A to 1MB by looking at >>>> max(node1->avg_sum, node2->avg_sum) because node1->avg_sum was cleared >>>> out when in reality BCM A should have a vote_x value of 2MB at this >>>> point. The subsequent bcm_aggregate do not re-aggregate all of the >>>> requests for each of its nodes, but assumes that the aggregated results >>>> at the nodes are correct. >>> Ah, I finally get it. Thanks for the detailed explanation. It's pretty >>> confusing that there are essentially two connected graphs laid on top >>> of each other, one graph consisting of nodes the framework deals with, >>> and another graph that groups those nodes together into BCMs. I was >>> failing to understand that bcm_aggregate loops over nodes that have >>> nothing to do with the current request, and so it needs to remember >>> the old totals from former requests. You've got the two arrays >>> basically to differentiate between "add together all requests for this >>> node", and "max all nodes into a BCM", since you need to reset sum_avg >>> at the start of the first call to qcom_icc_aggregate(). >> Well it's not really two graphs since the BCMs aren't really connected >> to each other, they only have association with some groups of physical >> nodes that share a clock domain(There's some nuances here, but let's >> assume for the sake of simplicity). Their only job is to aggregate to >> some threshold value and select a performance point and they don't >> contain any information about the connectivity of the nodes. > > Right ok, I see. > >>> I had suggested a callback in the core earlier to tell the providers >>> "I'm about to start aggregating on these nodes", which would have >>> allowed you to clear sum_avg in that callback and reduce down to one >>> array. IMO that's a lot easier to understand than these double arrays, >>> but maybe it's just me that gets confused. >> I do admit looking at this is somewhat confusing. I'm not totally >> against the idea of adding another callback in the framework, maybe we >> can re-evaluate this when there are other providers using the >> interconnect framework. I'd prefer to have the justification of needing >> additional ops in the core if somehow there's some hardware out there >> that dictates that we need some pre or post aggregation stage as opposed >> to easier book keeping? Though I do like the idea of reducing complexity >> overall, any thoughts on this Georgi? > > Sure. I suppose any other SoC that does this same grouping thing in > the hardware will end up duplicating this same complexity. We'll see > if anybody has anything like this. It also might end up being useful > even if it's just for QC SoCs if we find ourselves copy/pasting a lot > of this logic in sdm845.c for sdm-next.c. Generally we should aim to > keep the providers as dumb as we can, but I'm fine waiting until > there's something to refactor down. If this same logic would be re-used in the upcoming SoCs and adding a single callback would simplify the providers significantly, then let's do it and try to keep the complexity at minimum from the beginning. Will give it a try. Thanks, Georgi >>> >>> Why do we bother with the individual nodes at all, why don't we just >>> build a graph out of the BCMs themselves and pass that to the >>> framework? I guess you can't do that because of .channels and >>> .bus_width, you wouldn't know what to multiply/divide by to translate >>> to a vote value? Hm... it would be great to make this simpler, but I'm >>> out of suggestions for now. >> >> I appreciate the thought, but not only do the nodes provide the >> width/channel, they provide all the connectivity data and an accurate >> representation of the NoC topology. There's no way to aggregate the >> nodes and the paths properly if we lose out on the granularity that the >> current graph provides(Imagine the example of two nodes on some mutually >> exclusive path under the same BCM again using avg_bw, 1MBps on node1 and >> 1MBps node2 should result in an aggregate BCM node of 1MBps since they >> physically don't share the same port where as if we clobbered the nodes >> together and represent them under a single BCM, it would suggest that >> they share the same physical port and aggregate 2MBps when in reality >> they don't need to be since they are parallel). > > Oh right, that makes sense. I'm on board. > -Evan > >> >>> -Evan >>> >>>>>>>> + } >>>>>>>> >>>>>>>> - temp = agg_peak * 1000ULL; >>>>>>>> - do_div(temp, bcm->aux_data.unit); >>>>>>>> - bcm->vote_y = temp; >>>>>>>> + temp = agg_avg[bucket] * 1000ULL; >>>>>>>> + do_div(temp, bcm->aux_data.unit); >>>>>>>> + bcm->vote_x[bucket] = temp; >>>>>>>> >>>>>>>> - if (bcm->keepalive && bcm->vote_x == 0 && bcm->vote_y == 0) { >>>>>>>> - bcm->vote_x = 1; >>>>>>>> - bcm->vote_y = 1; >>>>>>>> + temp = agg_peak[bucket] * 1000ULL; >>>>>>>> + do_div(temp, bcm->aux_data.unit); >>>>>>>> + bcm->vote_y[bucket] = temp; >>>>>>>> + } >>>>>>>> + >>>>>>>> + if (bcm->keepalive && bcm->vote_x[0] == 0 && bcm->vote_y[0] == 0) { >>>>>>>> + bcm->vote_x[QCOM_ICC_BUCKET_AMC] = 1; >>>>>>>> + bcm->vote_x[QCOM_ICC_BUCKET_WAKE] = 1; >>>>>>>> + bcm->vote_y[QCOM_ICC_BUCKET_AMC] = 1; >>>>>>>> + bcm->vote_y[QCOM_ICC_BUCKET_WAKE] = 1; >>>>>>>> } >>>>>>>> >>>>>>>> bcm->dirty = false; >>>>>>>> @@ -631,15 +653,25 @@ static int qcom_icc_aggregate(struct icc_node *node, u32 tag, u32 avg_bw, >>>>>>>> { >>>>>>>> size_t i; >>>>>>>> struct qcom_icc_node *qn; >>>>>>>> + unsigned long tag_word = (unsigned long)tag; >>>>>>>> >>>>>>>> qn = node->data; >>>>>>>> >>>>>>>> + if (!tag) >>>>>>>> + tag_word = QCOM_ICC_TAG_ALWAYS; >>>>>>>> + >>>>>>>> + for (i = 0; i < QCOM_ICC_NUM_BUCKETS; i++) { >>>>>>>> + if (test_bit(i, &tag_word)) { >>>>>>> I guess all this extra business with tag_word and casting is so that >>>>>>> you can use test_bit, which is presumably a tiny bit faster? Does this >>>>>>> actually make a measurable difference? Maybe in the name of simplicity >>>>>>> we just do if (tag & BIT(i)), and then optimize if we find that >>>>>>> conditional to be a hotspot? >>>>>> Using (tag & BIT(i)) as opposed to test_bit seems reasonable to me. >>>>>>>> + qn->sum_avg[i] += avg_bw; >>>>>>>> + qn->max_peak[i] = max_t(u32, qn->max_peak[i], peak_bw); >>>>>>>> + qn->sum_avg_cached[i] = qn->sum_avg[i]; >>>>>>>> + qn->max_peak_cached[i] = qn->max_peak[i]; >>>>>>>> + } >>>>>>>> + } >>>>>>>> + >>>>>>>> *agg_avg += avg_bw; >>>>>>>> *agg_peak = max_t(u32, *agg_peak, peak_bw); >>>>>>>> >>>>>>>> - qn->sum_avg = *agg_avg; >>>>>>>> - qn->max_peak = *agg_peak; >>>>>>>> - >>>>>>>> for (i = 0; i < qn->num_bcms; i++) >>>>>>>> qn->bcms[i]->dirty = true; >>>>>>>> >>>>>>>> @@ -675,7 +707,7 @@ static int qcom_icc_set(struct icc_node *src, struct icc_node *dst) >>>>>>>> * Construct the command list based on a pre ordered list of BCMs >>>>>>>> * based on VCD. >>>>>>>> */ >>>>>>>> - tcs_list_gen(&commit_list, cmds, commit_idx); >>>>>>>> + tcs_list_gen(&commit_list, QCOM_ICC_BUCKET_AMC, cmds, commit_idx); >>>>>>>> >>>>>>>> if (!commit_idx[0]) >>>>>>>> return ret; >>>>>>>> @@ -693,6 +725,41 @@ static int qcom_icc_set(struct icc_node *src, struct icc_node *dst) >>>>>>>> return ret; >>>>>>>> } >>>>>>>> >>>>>>>> + INIT_LIST_HEAD(&commit_list); >>>>>>>> + >>>>>>>> + for (i = 0; i < qp->num_bcms; i++) { >>>>>>>> + /* >>>>>>>> + * Only generate WAKE and SLEEP commands if a resource's >>>>>>>> + * requirements change as the execution environment transitions >>>>>>>> + * between different power states. >>>>>>>> + */ >>>>>>>> + if (qp->bcms[i]->vote_x[QCOM_ICC_BUCKET_WAKE] != >>>>>>>> + qp->bcms[i]->vote_x[QCOM_ICC_BUCKET_SLEEP] || >>>>>>>> + qp->bcms[i]->vote_y[QCOM_ICC_BUCKET_WAKE] != >>>>>>>> + qp->bcms[i]->vote_y[QCOM_ICC_BUCKET_SLEEP]) { >>>>>>>> + list_add_tail(&qp->bcms[i]->list, &commit_list); >>>>>>>> + } >>>>>>>> + } >>>>>>>> + >>>>>>>> + if (list_empty(&commit_list)) >>>>>>>> + return ret; >>>>>>>> + >>>>>>>> + tcs_list_gen(&commit_list, QCOM_ICC_BUCKET_WAKE, cmds, commit_idx); >>>>>>>> + >>>>>>>> + ret = rpmh_write_batch(qp->dev, RPMH_WAKE_ONLY_STATE, cmds, commit_idx); >>>>>>>> + if (ret) { >>>>>>>> + pr_err("Error sending WAKE RPMH requests (%d)\n", ret); >>>>>>>> + return ret; >>>>>>>> + } >>>>>>>> + >>>>>>>> + tcs_list_gen(&commit_list, QCOM_ICC_BUCKET_SLEEP, cmds, commit_idx); >>>>>>>> + >>>>>>>> + ret = rpmh_write_batch(qp->dev, RPMH_SLEEP_STATE, cmds, commit_idx); >>>>>>>> + if (ret) { >>>>>>>> + pr_err("Error sending SLEEP RPMH requests (%d)\n", ret); >>>>>>>> + return ret; >>>>>>>> + } >>>>>>>> + >>>>>>>> return ret; >>>>>>>> } >>>>>>>> >>>>>> -- >>>>>> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, >>>>>> a Linux Foundation Collaborative Project >>>>>> >>>> -- >>>> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, >>>> a Linux Foundation Collaborative Project >>>> >> -- >> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, >> a Linux Foundation Collaborative Project >>