Hi Krzysztof, On 9/27/19 11:19 AM, Krzysztof Kozlowski wrote: > On Wed, Sep 25, 2019 at 06:18:13PM +0200, Lukasz Luba wrote: >> Introduce a new interrupt driven mechanism for managing speed of the >> memory controller. The interrupts are generated due to performance >> counters overflow. The performance counters might track memory reads, >> writes, transfers, page misses, etc. In the basic algorithm tracking >> read transfers and calculating memory pressure should be enough to >> skip polling mode in devfreq. >> >> Signed-off-by: Lukasz Luba <l.luba@xxxxxxxxxxxxxxxxxxx> >> --- >> drivers/memory/samsung/exynos5422-dmc.c | 297 ++++++++++++++++++++++-- >> 1 file changed, 272 insertions(+), 25 deletions(-) >> >> diff --git a/drivers/memory/samsung/exynos5422-dmc.c b/drivers/memory/samsung/exynos5422-dmc.c >> index 0fe5f2186139..86e1844b97ef 100644 >> --- a/drivers/memory/samsung/exynos5422-dmc.c >> +++ b/drivers/memory/samsung/exynos5422-dmc.c >> @@ -8,6 +8,7 @@ >> #include <linux/devfreq.h> >> #include <linux/devfreq-event.h> >> #include <linux/device.h> >> +#include <linux/interrupt.h> >> #include <linux/io.h> >> #include <linux/mfd/syscon.h> >> #include <linux/module.h> >> @@ -35,6 +36,29 @@ >> #define USE_BPLL_TIMINGS (0) >> #define EXYNOS5_AREF_NORMAL (0x2e) >> >> +#define DREX_PPCCLKCON (0x0130) >> +#define DREX_PEREV2CONFIG (0x013c) >> +#define DREX_PMNC_PPC (0xE000) >> +#define DREX_CNTENS_PPC (0xE010) >> +#define DREX_CNTENC_PPC (0xE020) >> +#define DREX_INTENS_PPC (0xE030) >> +#define DREX_INTENC_PPC (0xE040) >> +#define DREX_FLAG_PPC (0xE050) >> +#define DREX_PMCNT2_PPC (0xE130) >> + >> +#define CC_RESET BIT(2) >> +#define PPC_COUNTER_RESET BIT(1) >> +#define PPC_ENABLE BIT(0) >> +#define PEREV_CLK_EN BIT(0) >> +#define PERF_CNT2 BIT(2) >> +#define PERF_CCNT BIT(31) > > Describe to which registers these bitfields are applicable. OK, I will add comment above them. > >> + >> +#define READ_TRANSFER_CH0 (0x6d) >> +#define READ_TRANSFER_CH1 (0x6f) > > The same. Otherwise they all look like some generic constants which is > not true. Make sense, agree. I will add comment above them. > >> + >> +#define PERF_COUNTER_START_VALUE 0xff000000 >> +#define PERF_EVENT_UP_DOWN_THRESHOLD 900000000ULL >> + >> /** >> * struct dmc_opp_table - Operating level desciption >> * >> @@ -85,6 +109,9 @@ struct exynos5_dmc { >> struct clk *mout_mx_mspll_ccore_phy; >> struct devfreq_event_dev **counter; >> int num_counters; >> + u64 last_overflow_ts[2]; >> + unsigned long load, total; > > One member per line. This decreases readability. OK > >> + bool in_irq_mode; >> }; >> >> #define TIMING_FIELD(t_name, t_bit_beg, t_bit_end) \ >> @@ -653,6 +680,167 @@ static int exynos5_counters_get(struct exynos5_dmc *dmc, >> return 0; >> } >> >> +/** >> + * exynos5_dmc_start_perf_events() - Setup and start performance event counters >> + * @dmc: device for which the counters are going to be checked >> + * @beg_value: initial value for the counter >> + * >> + * Function which enables needed counters, interrupts and sets initial values >> + * then starts the counters. >> + */ >> +static void exynos5_dmc_start_perf_events(struct exynos5_dmc *dmc, >> + u32 beg_value) >> +{ >> + /* Enable interrupts for counter 2 */ >> + writel(PERF_CNT2, dmc->base_drexi0 + DREX_INTENS_PPC); >> + writel(PERF_CNT2, dmc->base_drexi1 + DREX_INTENS_PPC); > > Blank line. > >> + /* Enable counter 2 and CCNT */ >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi0 + DREX_CNTENS_PPC); >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi1 + DREX_CNTENS_PPC); > > Blank line. > >> + /* Clear overflow flag for all counters */ >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi0 + DREX_FLAG_PPC); >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi1 + DREX_FLAG_PPC); > > Blank line. > >> + /* Reset all counters */ >> + writel(CC_RESET | PPC_COUNTER_RESET, dmc->base_drexi0 + DREX_PMNC_PPC); >> + writel(CC_RESET | PPC_COUNTER_RESET, dmc->base_drexi1 + DREX_PMNC_PPC); > > Blank line. > >> + /* >> + * Set start value for the counters, the number of samples that >> + * will be gathered is calculated as: 0xffffffff - beg_value >> + */ >> + writel(beg_value, dmc->base_drexi0 + DREX_PMCNT2_PPC); >> + writel(beg_value, dmc->base_drexi1 + DREX_PMCNT2_PPC); > > Blank line. > >> + /* Start all counters */ >> + writel(PPC_ENABLE, dmc->base_drexi0 + DREX_PMNC_PPC); >> + writel(PPC_ENABLE, dmc->base_drexi1 + DREX_PMNC_PPC); >> +} >> + >> +/** >> + * exynos5_dmc_perf_events_calc() - Calculate utilization >> + * @dmc: device for which the counters are going to be checked >> + * @diff_ts: time between last interrupt and current one >> + * >> + * Function which calculates needed utilization for the devfreq governor. >> + * It prepares values for 'busy_time' and 'total_time' based on elapsed time >> + * between interrupts, which approximates utilization. >> + */ >> +static void exynos5_dmc_perf_events_calc(struct exynos5_dmc *dmc, u64 diff_ts) >> +{ >> + /* >> + * This is a simple algorithm for managing traffic on DMC. >> + * When there is almost no load the counters overflow every 4s, >> + * no mater the DMC frequency. >> + * The high load might be approximated using linear function. >> + * Knowing that, simple calculation can provide 'busy_time' and >> + * 'total_time' to the devfreq governor which picks up target >> + * frequency. >> + * We want a fast ramp up and slow decay in frequency change function. >> + */ >> + if (diff_ts < PERF_EVENT_UP_DOWN_THRESHOLD) { >> + /* >> + * Set higher utilization for the simple_ondemand governor. >> + * The governor should increase the frequency of the DMC. >> + */ >> + dmc->load = 70; >> + dmc->total = 100; >> + } else { >> + /* >> + * Set low utilization for the simple_ondemand governor. >> + * The governor should decrease the frequency of the DMC. >> + */ >> + dmc->load = 35; >> + dmc->total = 100; >> + } >> + >> + dev_dbg(dmc->dev, "diff_ts=%llu\n", diff_ts); >> +} >> + >> +/** >> + * exynos5_dmc_perf_events_check() - Checks the status of the counters >> + * @dmc: device for which the counters are going to be checked >> + * >> + * Function which is called from threaded IRQ to check the counters state >> + * and to call approximation for the needed utilization. >> + */ >> +static void exynos5_dmc_perf_events_check(struct exynos5_dmc *dmc) >> +{ >> + u32 val; >> + u64 diff_ts, ts; >> + >> + ts = ktime_get_ns(); >> + >> + /* Stop all counters */ >> + writel(0, dmc->base_drexi0 + DREX_PMNC_PPC); >> + writel(0, dmc->base_drexi1 + DREX_PMNC_PPC); >> + >> + /* Check the source in interrupt flag registers (which channel) */ >> + val = readl(dmc->base_drexi0 + DREX_FLAG_PPC); >> + if (val) { >> + diff_ts = ts - dmc->last_overflow_ts[0]; >> + dmc->last_overflow_ts[0] = ts; >> + dev_dbg(dmc->dev, "drex0 0xE050 val= 0x%08x\n", val); >> + } else { >> + val = readl(dmc->base_drexi1 + DREX_FLAG_PPC); >> + diff_ts = ts - dmc->last_overflow_ts[1]; >> + dmc->last_overflow_ts[1] = ts; >> + dev_dbg(dmc->dev, "drex1 0xE050 val= 0x%08x\n", val); >> + } >> + >> + exynos5_dmc_perf_events_calc(dmc, diff_ts); >> + >> + exynos5_dmc_start_perf_events(dmc, PERF_COUNTER_START_VALUE); >> +} >> + >> +/** >> + * exynos5_dmc_enable_perf_events() - Enable performance events >> + * @dmc: device for which the counters are going to be checked >> + * >> + * Function which is setup needed environment and enables counters. >> + */ >> +static void exynos5_dmc_enable_perf_events(struct exynos5_dmc *dmc) >> +{ >> + u64 ts; >> + >> + /* Enable Performance Event Clock */ >> + writel(PEREV_CLK_EN, dmc->base_drexi0 + DREX_PPCCLKCON); >> + writel(PEREV_CLK_EN, dmc->base_drexi1 + DREX_PPCCLKCON); >> + >> + /* Select read transfers as performance event2 */ >> + writel(READ_TRANSFER_CH0, dmc->base_drexi0 + DREX_PEREV2CONFIG); >> + writel(READ_TRANSFER_CH1, dmc->base_drexi1 + DREX_PEREV2CONFIG); >> + >> + dmc->in_irq_mode = 1; > > Move this outside, to the probe. Logically it belongs there. OK > >> + >> + ts = ktime_get_ns(); >> + dmc->last_overflow_ts[0] = ts; >> + dmc->last_overflow_ts[1] = ts; >> + >> + /* Devfreq shouldn't be faster than initialization, play safe though. */ >> + dmc->load = 99; >> + dmc->total = 100; >> +} >> + >> +/** >> + * exynos5_dmc_disable_perf_events() - Disable performance events >> + * @dmc: device for which the counters are going to be checked >> + * >> + * Function which stops, disables performance event counters and interrupts. >> + */ >> +static void exynos5_dmc_disable_perf_events(struct exynos5_dmc *dmc) >> +{ >> + /* Stop all counters */ >> + writel(0, dmc->base_drexi0 + DREX_PMNC_PPC); >> + writel(0, dmc->base_drexi1 + DREX_PMNC_PPC); > > Blank line here and later. OK - for all the 'blank line' hints. > >> + /* Disable interrupts for counter 2 */ >> + writel(PERF_CNT2, dmc->base_drexi0 + DREX_INTENC_PPC); >> + writel(PERF_CNT2, dmc->base_drexi1 + DREX_INTENC_PPC); >> + /* Disable counter 2 and CCNT */ >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi0 + DREX_CNTENC_PPC); >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi1 + DREX_CNTENC_PPC); >> + /* Clear overflow flag for all counters */ >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi0 + DREX_FLAG_PPC); >> + writel(PERF_CNT2 | PERF_CCNT, dmc->base_drexi1 + DREX_FLAG_PPC); >> +} >> + >> /** >> * exynos5_dmc_get_status() - Read current DMC performance statistics. >> * @dev: device for which the statistics are requested >> @@ -669,18 +857,24 @@ static int exynos5_dmc_get_status(struct device *dev, >> unsigned long load, total; >> int ret; >> >> - ret = exynos5_counters_get(dmc, &load, &total); >> - if (ret < 0) >> - return -EINVAL; >> + if (dmc->in_irq_mode) { >> + stat->current_frequency = dmc->curr_rate; >> + stat->busy_time = dmc->load; >> + stat->total_time = dmc->total; >> + } else { >> + ret = exynos5_counters_get(dmc, &load, &total); >> + if (ret < 0) >> + return -EINVAL; >> >> - /* To protect from overflow in calculation ratios, divide by 1024 */ >> - stat->busy_time = load >> 10; >> - stat->total_time = total >> 10; >> + /* To protect from overflow, divide by 1024 */ >> + stat->busy_time = load >> 10; >> + stat->total_time = total >> 10; >> >> - ret = exynos5_counters_set_event(dmc); >> - if (ret < 0) { >> - dev_err(dev, "could not set event counter\n"); >> - return ret; >> + ret = exynos5_counters_set_event(dmc); >> + if (ret < 0) { >> + dev_err(dev, "could not set event counter\n"); >> + return ret; >> + } >> } >> >> return 0; >> @@ -712,7 +906,6 @@ static int exynos5_dmc_get_cur_freq(struct device *dev, unsigned long *freq) >> * It provides to the devfreq framework needed functions and polling period. >> */ >> static struct devfreq_dev_profile exynos5_dmc_df_profile = { >> - .polling_ms = 500, >> .target = exynos5_dmc_target, >> .get_dev_status = exynos5_dmc_get_status, >> .get_cur_freq = exynos5_dmc_get_cur_freq, >> @@ -1108,6 +1301,26 @@ static inline int exynos5_dmc_set_pause_on_switching(struct exynos5_dmc *dmc) >> return 0; >> } >> >> +static irqreturn_t dmc_irq_thread(int irq, void *priv) >> +{ >> + int res; >> + struct exynos5_dmc *dmc = priv; >> + >> + dev_dbg(dmc->dev, "irq thread handler\n"); > > Skip a debug in thread handler for memory. It can pollute your log (I > guess depending on workload). OK, I will remove it. > >> + >> + mutex_lock(&dmc->df->lock); >> + >> + exynos5_dmc_perf_events_check(dmc); >> + >> + res = update_devfreq(dmc->df); >> + if (res) >> + dev_err(dmc->dev, "devfreq failed with %d\n", res); > > dev_warn() OK > >> + >> + mutex_unlock(&dmc->df->lock); >> + >> + return IRQ_HANDLED; >> +} >> + >> /** >> * exynos5_dmc_probe() - Probe function for the DMC driver >> * @pdev: platform device for which the driver is going to be initialized >> @@ -1125,6 +1338,7 @@ static int exynos5_dmc_probe(struct platform_device *pdev) >> struct device_node *np = dev->of_node; >> struct exynos5_dmc *dmc; >> struct resource *res; >> + int irq; >> >> dmc = devm_kzalloc(dev, sizeof(*dmc), GFP_KERNEL); >> if (!dmc) >> @@ -1172,24 +1386,48 @@ static int exynos5_dmc_probe(struct platform_device *pdev) >> goto remove_clocks; >> } >> >> - ret = exynos5_performance_counters_init(dmc); >> - if (ret) { >> - dev_warn(dev, "couldn't probe performance counters\n"); >> - goto remove_clocks; >> - } >> - >> ret = exynos5_dmc_set_pause_on_switching(dmc); >> if (ret) { >> dev_warn(dev, "couldn't get access to PAUSE register\n"); >> goto err_devfreq_add; > > This is wrong now, I think. Good point, should be 'goto remove_clocks'. > >> } >> >> - /* >> - * Setup default thresholds for the devfreq governor. >> - * The values are chosen based on experiments. >> - */ >> - dmc->gov_data.upthreshold = 30; >> - dmc->gov_data.downdifferential = 5; >> + /* There is two modes in which the driver works: polling or IRQ */ >> + irq = platform_get_irq(pdev, 0); > > You need to document it in bindings. OK > >> + if (irq < 0) { >> + ret = exynos5_performance_counters_init(dmc); >> + if (ret) { >> + dev_warn(dev, "couldn't probe performance counters\n"); >> + goto remove_clocks; > > Weird, previous error jump goes to err_devfreq_add. This goes to error > label which is narrower (less to cleanup). Right. Thank you for the review. I will address the issues in the next version. Regards, Lukasz