Re: [PATCH] drm/i915: Use hash tables for the command parser

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@xxxxxxxxx wrote:
> From: Brad Volkin <bradley.d.volkin@xxxxxxxxx>
> 
> For clients that submit large batch buffers the command parser has
> a substantial impact on performance. On my HSW ULT system performance
> drops as much as ~20% on some tests. Most of the time is spent in the
> command lookup code. Converting that from the current naive search to
> a hash table lookup reduces the performance impact by as much as ~10%.
> 
> The choice of value for I915_CMD_HASH_ORDER allows all commands
> currently used in the parser tables to hash to their own bucket (except
> for one collision on the render ring). The tradeoff is that it wastes
> memory. Because the opcodes for the commands in the tables are not
> particularly well distributed, reducing the order still leaves many
> buckets empty. The increased collisions don't seem to have a huge
> impact on the performance gain, but for now anyhow, the parser trades
> memory for performance.

For the collisions have you looked into pre-munging the key a bit so that
we use more bits? A few shifts and xors shouldn't affect perf much really.

Also since the tables are mostly empty we could just overflow to the next
hashtable entry, but unfortunately that would require a bit of custom
insert and lookup code.

Finally if we manage to get 0 collisions a WARN_ON would be good for
that, to make sure we don't accidentally regress.

Anyway just a few more ideas.
-Daniel

Finally if we manage to get 0 collisions a WARN_ON would be good for
that, to make sure we don't accidentally regress.

Anyway just a few more ideas.
-Daniel

> 
> Signed-off-by: Brad Volkin <bradley.d.volkin@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c  | 136 ++++++++++++++++++++++++--------
>  drivers/gpu/drm/i915/i915_drv.h         |   1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  11 ++-
>  4 files changed, 116 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index 9bac097..9dca899 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
>  	return 0;
>  }
>  
> -static bool validate_cmds_sorted(struct intel_ring_buffer *ring)
> +static bool validate_cmds_sorted(struct intel_ring_buffer *ring,
> +				 const struct drm_i915_cmd_table *cmd_tables,
> +				 int cmd_table_count)
>  {
>  	int i;
>  	bool ret = true;
>  
> -	if (!ring->cmd_tables || ring->cmd_table_count == 0)
> +	if (!cmd_tables || cmd_table_count == 0)
>  		return true;
>  
> -	for (i = 0; i < ring->cmd_table_count; i++) {
> -		const struct drm_i915_cmd_table *table = &ring->cmd_tables[i];
> +	for (i = 0; i < cmd_table_count; i++) {
> +		const struct drm_i915_cmd_table *table = &cmd_tables[i];
>  		u32 previous = 0;
>  		int j;
>  
> @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
>  			     ring->master_reg_count);
>  }
>  
> +struct cmd_node {
> +	const struct drm_i915_cmd_descriptor *desc;
> +	struct hlist_node node;
> +};
> +
> +/*
> + * Different command ranges have different numbers of bits for the opcode.
> + * In order to use the opcode bits, and only the opcode bits, for the hash key
> + * we should use the MI_* command opcode mask (since those commands use the
> + * fewest bits for the opcode.)
> + */
> +#define CMD_HASH_MASK STD_MI_OPCODE_MASK
> +
> +static int init_hash_table(struct intel_ring_buffer *ring,
> +			   const struct drm_i915_cmd_table *cmd_tables,
> +			   int cmd_table_count)
> +{
> +	int i, j;
> +
> +	hash_init(ring->cmd_hash);
> +
> +	for (i = 0; i < cmd_table_count; i++) {
> +		const struct drm_i915_cmd_table *table = &cmd_tables[i];
> +
> +		for (j = 0; j < table->count; j++) {
> +			const struct drm_i915_cmd_descriptor *desc =
> +				&table->table[j];
> +			struct cmd_node *desc_node =
> +				kmalloc(sizeof(*desc_node), GFP_KERNEL);
> +
> +			if (!desc_node)
> +				return -ENOMEM;
> +
> +			desc_node->desc = desc;
> +			hash_add(ring->cmd_hash, &desc_node->node,
> +				 desc->cmd.value & CMD_HASH_MASK);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void fini_hash_table(struct intel_ring_buffer *ring)
> +{
> +	struct hlist_node *tmp;
> +	struct cmd_node *desc_node;
> +	int i;
> +
> +	hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) {
> +		hash_del(&desc_node->node);
> +		kfree(desc_node);
> +	}
> +}
> +
>  /**
>   * i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer
>   * @ring: the ringbuffer to initialize
> @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
>   */
>  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>  {
> +	const struct drm_i915_cmd_table *cmd_tables;
> +	int cmd_table_count;
> +
>  	if (!IS_GEN7(ring->dev))
>  		return;
>  
>  	switch (ring->id) {
>  	case RCS:
>  		if (IS_HASWELL(ring->dev)) {
> -			ring->cmd_tables = hsw_render_ring_cmds;
> -			ring->cmd_table_count =
> +			cmd_tables = hsw_render_ring_cmds;
> +			cmd_table_count =
>  				ARRAY_SIZE(hsw_render_ring_cmds);
>  		} else {
> -			ring->cmd_tables = gen7_render_cmds;
> -			ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> +			cmd_tables = gen7_render_cmds;
> +			cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
>  		}
>  
>  		ring->reg_table = gen7_render_regs;
> @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>  		ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
>  		break;
>  	case VCS:
> -		ring->cmd_tables = gen7_video_cmds;
> -		ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> +		cmd_tables = gen7_video_cmds;
> +		cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
>  		ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
>  		break;
>  	case BCS:
>  		if (IS_HASWELL(ring->dev)) {
> -			ring->cmd_tables = hsw_blt_ring_cmds;
> -			ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> +			cmd_tables = hsw_blt_ring_cmds;
> +			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
>  		} else {
> -			ring->cmd_tables = gen7_blt_cmds;
> -			ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> +			cmd_tables = gen7_blt_cmds;
> +			cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
>  		}
>  
>  		ring->reg_table = gen7_blt_regs;
> @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>  		ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
>  		break;
>  	case VECS:
> -		ring->cmd_tables = hsw_vebox_cmds;
> -		ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> +		cmd_tables = hsw_vebox_cmds;
> +		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
>  		/* VECS can use the same length_mask function as VCS */
>  		ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
>  		break;
> @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
>  		BUG();
>  	}
>  
> -	BUG_ON(!validate_cmds_sorted(ring));
> +	BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count));
>  	BUG_ON(!validate_regs_sorted(ring));
> +
> +	BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count));
> +
> +	ring->needs_cmd_parser = true;
> +}
> +
> +/**
> + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields
> + * @ring: the ringbuffer to clean up
> + *
> + * Releases any resources related to command parsing that may have been
> + * initialized for the specified ring.
> + */
> +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring)
> +{
> +	if (!ring->needs_cmd_parser)
> +		return;
> +
> +	fini_hash_table(ring);
>  }
>  
>  static const struct drm_i915_cmd_descriptor*
> -find_cmd_in_table(const struct drm_i915_cmd_table *table,
> +find_cmd_in_table(struct intel_ring_buffer *ring,
>  		  u32 cmd_header)
>  {
> -	int i;
> +	struct cmd_node *desc_node;
>  
> -	for (i = 0; i < table->count; i++) {
> -		const struct drm_i915_cmd_descriptor *desc = &table->table[i];
> +	hash_for_each_possible(ring->cmd_hash, desc_node, node,
> +			       cmd_header & CMD_HASH_MASK) {
> +		const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
>  		u32 masked_cmd = desc->cmd.mask & cmd_header;
>  		u32 masked_value = desc->cmd.value & desc->cmd.mask;
>  
> @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring,
>  	 u32 cmd_header,
>  	 struct drm_i915_cmd_descriptor *default_desc)
>  {
> +	const struct drm_i915_cmd_descriptor *desc;
>  	u32 mask;
> -	int i;
> -
> -	for (i = 0; i < ring->cmd_table_count; i++) {
> -		const struct drm_i915_cmd_descriptor *desc;
>  
> -		desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header);
> -		if (desc)
> -			return desc;
> -	}
> +	desc = find_cmd_in_table(ring, cmd_header);
> +	if (desc)
> +		return desc;
>  
>  	mask = ring->get_cmd_length_mask(cmd_header);
>  	if (!mask)
> @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer *ring)
>  {
>  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
>  
> -	/* No command tables indicates a platform without parsing */
> -	if (!ring->cmd_tables)
> +	if (!ring->needs_cmd_parser)
>  		return false;
>  
>  	/*
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7d6acb4..8c8388f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type);
>  /* i915_cmd_parser.c */
>  int i915_cmd_parser_get_version(void);
>  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring);
> +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring);
>  bool i915_needs_cmd_parser(struct intel_ring_buffer *ring);
>  int i915_parse_cmds(struct intel_ring_buffer *ring,
>  		    struct drm_i915_gem_object *batch_obj,
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index eb3dd26..06fa9a3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
>  		ring->cleanup(ring);
>  
>  	cleanup_status_page(ring);
> +
> +	i915_cmd_parser_fini_ring(ring);
>  }
>  
>  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 413cdc7..48daa91 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -1,6 +1,10 @@
>  #ifndef _INTEL_RINGBUFFER_H_
>  #define _INTEL_RINGBUFFER_H_
>  
> +#include <linux/hashtable.h>
> +
> +#define I915_CMD_HASH_ORDER 9
> +
>  /*
>   * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
>   * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use"
> @@ -164,12 +168,13 @@ struct  intel_ring_buffer {
>  		volatile u32 *cpu_page;
>  	} scratch;
>  
> +	bool needs_cmd_parser;
> +
>  	/*
> -	 * Tables of commands the command parser needs to know about
> +	 * Table of commands the command parser needs to know about
>  	 * for this ring.
>  	 */
> -	const struct drm_i915_cmd_table *cmd_tables;
> -	int cmd_table_count;
> +	DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);
>  
>  	/*
>  	 * Table of registers allowed in commands that read/write registers.
> -- 
> 1.8.3.2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux