Re: [PATCH v5 5/8] drm/i915/skl+: change WM calc to fixed point 16.16

Paulo Zanoni <paulo.r.zanoni@xxxxxxxxx> · Tue, 22 Nov 2016 10:42:36 -0200

Em Sex, 2016-11-18 às 20:39 +0530, Mahesh Kumar escreveu:
> This patch changes Watermak calculation to fixed point calculation.
> Problem with current calculation is during plane_blocks_per_line
> calculation we divide intermediate blocks with min_scanlines and
> takes floor of the result because of integer operation.
> hence we end-up assigning less blocks than required. Which leads to
> flickers.
> 
> Changes since V1:
>  - Add fixed point data type as per Paulo's review
> 
> Signed-off-by: Mahesh Kumar <mahesh1.kumar@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_drv.h | 84
> +++++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_pm.c | 70 ++++++++++++++++++++-----------
> ---
>  2 files changed, 125 insertions(+), 29 deletions(-)

First of all, there's non-standard indentation all over the code. When
a statement is broken into multiple lines, sometimes there's way too
many tabs, sometimes tabs/spaces are missing.

> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h
> index 394d7ea..20691e9 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -119,6 +119,90 @@ bool __i915_inject_load_failure(const char
> *func, int line);
>  #define i915_inject_load_failure() \
>  	__i915_inject_load_failure(__func__, __LINE__)
>  
> +typedef struct {
> +	uint32_t val;
> +} uint_fixed_16_16_t;
> +
> +#define FP_16_16_MAX ({ \
> +		uint_fixed_16_16_t fp; \
> +		fp.val = UINT_MAX; \
> +		fp; \
> +})
> +
> +static inline uint_fixed_16_16_t u32_to_fp_16_16(uint32_t val)

In the type name the patch uses "fixed", but in some functions it uses
"fp". I know that fp is supposed to help function names shorter, but
IMHO we should stick with the same word all the time. My suggestion is
to keep "fixed" everywhere, because the definition of "fp" is not
obvious to everybody (fixed point? floating point? frame pointer?
functional programming?).

> +{
> +	uint_fixed_16_16_t fp;
> +
> +	WARN_ON(val >> 16);
> +
> +	fp.val = val << 16;
> +	return fp;
> +}
> +
> +static inline uint32_t fp_16_16_to_u32_round_up(uint_fixed_16_16_t
> fp)
> +{
> +	return DIV_ROUND_UP(fp.val, 1 << 16);
> +}
> +
> +static inline uint32_t fp_16_16_to_u32(uint_fixed_16_16_t fp)
> +{
> +	return (fp.val / 1 << 16);

This is dividing by 1 and then shifting. Doesn't look correct.

> +}
> +
> +static inline uint_fixed_16_16_t min_fp_16_16(uint_fixed_16_16_t
> min1,
> +						uint_fixed_16_16_t
> min2)
> +{
> +	uint_fixed_16_16_t min;
> +
> +	min.val = min(min1.val, min2.val);
> +	return min;
> +}
> +
> +static inline uint_fixed_16_16_t max_fp_16_16(uint_fixed_16_16_t
> max1,
> +						uint_fixed_16_16_t
> max2)
> +{
> +	uint_fixed_16_16_t max;
> +
> +	max.val = max(max1.val, max2.val);
> +	return max;
> +}
> +
> +static inline uint_fixed_16_16_t fp_16_16_div_round_up(uint32_t val,
> uint32_t d)
> +{
> +	uint_fixed_16_16_t fp, res;
> +
> +	fp = u32_to_fp_16_16(val);
> +	res.val = DIV_ROUND_UP(fp.val, d);
> +	return res;
> +}
> +
> +static inline uint_fixed_16_16_t fp_16_16_div_round_up_ull(uint64_t
> val,
> +								uint
> 32_t d)

I'd end the name with "u64" instead of "ull" since it's an uint64_t
type, not unsigned long long.

> +{
> +	uint_fixed_16_16_t res;
> +	uint64_t interm_val;
> +
> +	WARN_ON(val >> 48);
> +	val <<= 16;
> +	interm_val = DIV_ROUND_UP_ULL(val, d);
> +	WARN_ON(interm_val >> 32);
> +	res.val = (uint32_t) interm_val;
> +
> +	return res;
> +}
> +
> +static inline uint_fixed_16_16_t mul_fp_16_16_u32(uint32_t val,
> +						uint_fixed_16_16_t
> mul)
> +{
> +	uint64_t intermediate_val;
> +	uint_fixed_16_16_t fp;
> +
> +	intermediate_val = (uint64_t) val * mul.val;
> +	WARN_ON(intermediate_val >> 32);
> +	fp.val = (uint32_t) intermediate_val;
> +	return fp;
> +}
> +
>  static inline const char *yesno(bool v)
>  {
>  	return v ? "yes" : "no";
> diff --git a/drivers/gpu/drm/i915/intel_pm.c
> b/drivers/gpu/drm/i915/intel_pm.c
> index d8090aa..7c788ac 100644
> --- a/drivers/gpu/drm/i915/intel_pm.c
> +++ b/drivers/gpu/drm/i915/intel_pm.c
> @@ -3515,33 +3515,37 @@ skl_allocate_pipe_ddb(struct intel_crtc_state
> *cstate,
>   * for the read latency) and cpp should always be <= 8, so that
>   * should allow pixel_rate up to ~2 GHz which seems sufficient since
> max
>   * 2xcdclk is 1350 MHz and the pixel rate should never exceed that.
> + * Both Method1 & Method2 returns fixedpoint 16.16 output

This line is not needed anymore: function signatures are saying it now.

>  */
> -static uint32_t skl_wm_method1(uint32_t pixel_rate, uint8_t cpp,
> uint32_t latency)
> +static uint_fixed_16_16_t skl_wm_method1(uint32_t pixel_rate,
> uint8_t cpp,
> +					 uint32_t latency)
>  {
> -	uint32_t wm_intermediate_val, ret;
> +	uint64_t wm_intermediate_val;

Why are we using uint64_t now? If we weren't overflowing before, we
shouldn't be overflowing now, should we? Is this a separate bug fix?

Everything else looks good. Thanks for the improvements!

> +	uint_fixed_16_16_t ret;
>  
>  	if (latency == 0)
> -		return UINT_MAX;
> -
> -	wm_intermediate_val = latency * pixel_rate * cpp / 512;
> -	ret = DIV_ROUND_UP(wm_intermediate_val, 1000);
> +		return FP_16_16_MAX;
>  
> +	wm_intermediate_val = latency * pixel_rate * cpp;
> +	ret = fp_16_16_div_round_up_ull(wm_intermediate_val, 1000 *
> 512);
>  	return ret;
>  }
>  
> -static uint32_t skl_wm_method2(uint32_t pixel_rate, uint32_t
> pipe_htotal,
> -			       uint32_t latency, uint32_t
> plane_blocks_per_line)
> +static uint_fixed_16_16_t skl_wm_method2(uint32_t pixel_rate,
> +			uint32_t pipe_htotal,
> +			uint32_t latency,
> +			uint_fixed_16_16_t plane_blocks_per_line)
>  {
> -	uint32_t ret;
>  	uint32_t wm_intermediate_val;
> +	uint_fixed_16_16_t ret;
>  
>  	if (latency == 0)
> -		return UINT_MAX;
> +		return FP_16_16_MAX;
>  
>  	wm_intermediate_val = latency * pixel_rate;
> -	ret = DIV_ROUND_UP(wm_intermediate_val, pipe_htotal * 1000)
> *
> -				plane_blocks_per_line;
> -
> +	wm_intermediate_val = DIV_ROUND_UP(wm_intermediate_val,
> +							pipe_htotal
> * 1000);
> +	ret = mul_fp_16_16_u32(wm_intermediate_val,
> plane_blocks_per_line);
>  	return ret;
>  }
>  
> @@ -3581,14 +3585,17 @@ static int skl_compute_plane_wm(const struct
> drm_i915_private *dev_priv,
>  	struct drm_plane_state *pstate = &intel_pstate->base;
>  	struct drm_framebuffer *fb = pstate->fb;
>  	uint32_t latency = dev_priv->wm.skl_latency[level];
> -	uint32_t method1, method2;
> -	uint32_t plane_bytes_per_line, plane_blocks_per_line;
> +	uint_fixed_16_16_t method1, method2;
> +	uint_fixed_16_16_t plane_blocks_per_line;
> +	uint_fixed_16_16_t selected_result;
> +	uint32_t interm_pbpl;
> +	uint32_t plane_bytes_per_line;
>  	uint32_t res_blocks, res_lines;
> -	uint32_t selected_result;
>  	uint8_t cpp;
>  	uint32_t width = 0, height = 0;
>  	uint32_t plane_pixel_rate;
> -	uint32_t y_tile_minimum, y_min_scanlines;
> +	uint_fixed_16_16_t y_tile_minimum;
> +	uint32_t y_min_scanlines;
>  	struct intel_atomic_state *state =
>  		to_intel_atomic_state(cstate->base.state);
>  	bool apply_memory_bw_wa = skl_needs_memory_bw_wa(state);
> @@ -3647,14 +3654,16 @@ static int skl_compute_plane_wm(const struct
> drm_i915_private *dev_priv,
>  
>  	plane_bytes_per_line = width * cpp;
>  	if (y_tiled) {
> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line *
> +						y_min_scanlines,
> 512);
>  		plane_blocks_per_line =
> -		      DIV_ROUND_UP(plane_bytes_per_line *
> y_min_scanlines, 512);
> -		plane_blocks_per_line /= y_min_scanlines;
> +		      fp_16_16_div_round_up(interm_pbpl,
> y_min_scanlines);
>  	} else if (x_tiled) {
> -		plane_blocks_per_line =
> DIV_ROUND_UP(plane_bytes_per_line, 512);
> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
> 512);
> +		plane_blocks_per_line =
> u32_to_fp_16_16(interm_pbpl);
>  	} else {
> -		plane_blocks_per_line =
> DIV_ROUND_UP(plane_bytes_per_line, 512)
> -					+ 1;
> +		interm_pbpl = DIV_ROUND_UP(plane_bytes_per_line,
> 512) + 1;
> +		plane_blocks_per_line =
> u32_to_fp_16_16(interm_pbpl);
>  	}
>  
>  	method1 = skl_wm_method1(plane_pixel_rate, cpp, latency);
> @@ -3663,26 +3672,29 @@ static int skl_compute_plane_wm(const struct
> drm_i915_private *dev_priv,
>  				 latency,
>  				 plane_blocks_per_line);
>  
> -	y_tile_minimum = plane_blocks_per_line * y_min_scanlines;
> +	y_tile_minimum = mul_fp_16_16_u32(y_min_scanlines,
> +						plane_blocks_per_lin
> e);
>  
>  	if (y_tiled) {
> -		selected_result = max(method2, y_tile_minimum);
> +		selected_result = max_fp_16_16(method2,
> y_tile_minimum);
>  	} else {
>  		if ((cpp * cstate->base.adjusted_mode.crtc_htotal /
> 512 < 1) &&
>  		    (plane_bytes_per_line / 512 < 1))
>  			selected_result = method2;
> -		else if ((ddb_allocation / plane_blocks_per_line) >=
> 1)
> -			selected_result = min(method1, method2);
> +		else if ((ddb_allocation /
> +			fp_16_16_to_u32_round_up(plane_blocks_per_li
> ne)) >= 1)
> +			selected_result = min_fp_16_16(method1,
> method2);
>		else
>  			selected_result = method1;
>  	}
>  
> -	res_blocks = selected_result + 1;
> -	res_lines = DIV_ROUND_UP(selected_result,
> plane_blocks_per_line);
> +	res_blocks = fp_16_16_to_u32_round_up(selected_result) + 1;
> +	res_lines = DIV_ROUND_UP(selected_result.val,
> +					plane_blocks_per_line.val);
>  
>  	if (level >= 1 && level <= 7) {
>  		if (y_tiled) {
> -			res_blocks += y_tile_minimum;
> +			res_blocks +=
> fp_16_16_to_u32_round_up(y_tile_minimum);
>  			res_lines += y_min_scanlines;
>  		} else {
>  			res_blocks++;
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx