On Mon, Apr 28, 2014 at 08:53:30AM -0700, Daniel Vetter wrote: > On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.volkin@xxxxxxxxx wrote: > > From: Brad Volkin <bradley.d.volkin@xxxxxxxxx> > > > > For clients that submit large batch buffers the command parser has > > a substantial impact on performance. On my HSW ULT system performance > > drops as much as ~20% on some tests. Most of the time is spent in the > > command lookup code. Converting that from the current naive search to > > a hash table lookup reduces the performance impact by as much as ~10%. > > > > The choice of value for I915_CMD_HASH_ORDER allows all commands > > currently used in the parser tables to hash to their own bucket (except > > for one collision on the render ring). The tradeoff is that it wastes > > memory. Because the opcodes for the commands in the tables are not > > particularly well distributed, reducing the order still leaves many > > buckets empty. The increased collisions don't seem to have a huge > > impact on the performance gain, but for now anyhow, the parser trades > > memory for performance. > > For the collisions have you looked into pre-munging the key a bit so that > we use more bits? A few shifts and xors shouldn't affect perf much really. I looked at this briefly but didn't find a substantial improvement. The basic patch improved things enough that I wanted to just get it out. I can look into this more, but I'd like to think about implementing the batch buffer copy portion next. I don't want to optimize this, make people happy, and then introduce another perf drop from the copy. Better to just take the full hit now and then continue the improvements. Sound reasonable? Brad > > Also since the tables are mostly empty we could just overflow to the next > hashtable entry, but unfortunately that would require a bit of custom > insert and lookup code. > > Finally if we manage to get 0 collisions a WARN_ON would be good for > that, to make sure we don't accidentally regress. > > Anyway just a few more ideas. > -Daniel > > Finally if we manage to get 0 collisions a WARN_ON would be good for > that, to make sure we don't accidentally regress. > > Anyway just a few more ideas. > -Daniel > > > > > Signed-off-by: Brad Volkin <bradley.d.volkin@xxxxxxxxx> > > --- > > drivers/gpu/drm/i915/i915_cmd_parser.c | 136 ++++++++++++++++++++++++-------- > > drivers/gpu/drm/i915/i915_drv.h | 1 + > > drivers/gpu/drm/i915/intel_ringbuffer.c | 2 + > > drivers/gpu/drm/i915/intel_ringbuffer.h | 11 ++- > > 4 files changed, 116 insertions(+), 34 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c > > index 9bac097..9dca899 100644 > > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c > > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c > > @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header) > > return 0; > > } > > > > -static bool validate_cmds_sorted(struct intel_ring_buffer *ring) > > +static bool validate_cmds_sorted(struct intel_ring_buffer *ring, > > + const struct drm_i915_cmd_table *cmd_tables, > > + int cmd_table_count) > > { > > int i; > > bool ret = true; > > > > - if (!ring->cmd_tables || ring->cmd_table_count == 0) > > + if (!cmd_tables || cmd_table_count == 0) > > return true; > > > > - for (i = 0; i < ring->cmd_table_count; i++) { > > - const struct drm_i915_cmd_table *table = &ring->cmd_tables[i]; > > + for (i = 0; i < cmd_table_count; i++) { > > + const struct drm_i915_cmd_table *table = &cmd_tables[i]; > > u32 previous = 0; > > int j; > > > > @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring) > > ring->master_reg_count); > > } > > > > +struct cmd_node { > > + const struct drm_i915_cmd_descriptor *desc; > > + struct hlist_node node; > > +}; > > + > > +/* > > + * Different command ranges have different numbers of bits for the opcode. > > + * In order to use the opcode bits, and only the opcode bits, for the hash key > > + * we should use the MI_* command opcode mask (since those commands use the > > + * fewest bits for the opcode.) > > + */ > > +#define CMD_HASH_MASK STD_MI_OPCODE_MASK > > + > > +static int init_hash_table(struct intel_ring_buffer *ring, > > + const struct drm_i915_cmd_table *cmd_tables, > > + int cmd_table_count) > > +{ > > + int i, j; > > + > > + hash_init(ring->cmd_hash); > > + > > + for (i = 0; i < cmd_table_count; i++) { > > + const struct drm_i915_cmd_table *table = &cmd_tables[i]; > > + > > + for (j = 0; j < table->count; j++) { > > + const struct drm_i915_cmd_descriptor *desc = > > + &table->table[j]; > > + struct cmd_node *desc_node = > > + kmalloc(sizeof(*desc_node), GFP_KERNEL); > > + > > + if (!desc_node) > > + return -ENOMEM; > > + > > + desc_node->desc = desc; > > + hash_add(ring->cmd_hash, &desc_node->node, > > + desc->cmd.value & CMD_HASH_MASK); > > + } > > + } > > + > > + return 0; > > +} > > + > > +static void fini_hash_table(struct intel_ring_buffer *ring) > > +{ > > + struct hlist_node *tmp; > > + struct cmd_node *desc_node; > > + int i; > > + > > + hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) { > > + hash_del(&desc_node->node); > > + kfree(desc_node); > > + } > > +} > > + > > /** > > * i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer > > * @ring: the ringbuffer to initialize > > @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring) > > */ > > void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) > > { > > + const struct drm_i915_cmd_table *cmd_tables; > > + int cmd_table_count; > > + > > if (!IS_GEN7(ring->dev)) > > return; > > > > switch (ring->id) { > > case RCS: > > if (IS_HASWELL(ring->dev)) { > > - ring->cmd_tables = hsw_render_ring_cmds; > > - ring->cmd_table_count = > > + cmd_tables = hsw_render_ring_cmds; > > + cmd_table_count = > > ARRAY_SIZE(hsw_render_ring_cmds); > > } else { > > - ring->cmd_tables = gen7_render_cmds; > > - ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds); > > + cmd_tables = gen7_render_cmds; > > + cmd_table_count = ARRAY_SIZE(gen7_render_cmds); > > } > > > > ring->reg_table = gen7_render_regs; > > @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) > > ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask; > > break; > > case VCS: > > - ring->cmd_tables = gen7_video_cmds; > > - ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds); > > + cmd_tables = gen7_video_cmds; > > + cmd_table_count = ARRAY_SIZE(gen7_video_cmds); > > ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; > > break; > > case BCS: > > if (IS_HASWELL(ring->dev)) { > > - ring->cmd_tables = hsw_blt_ring_cmds; > > - ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); > > + cmd_tables = hsw_blt_ring_cmds; > > + cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds); > > } else { > > - ring->cmd_tables = gen7_blt_cmds; > > - ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); > > + cmd_tables = gen7_blt_cmds; > > + cmd_table_count = ARRAY_SIZE(gen7_blt_cmds); > > } > > > > ring->reg_table = gen7_blt_regs; > > @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) > > ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask; > > break; > > case VECS: > > - ring->cmd_tables = hsw_vebox_cmds; > > - ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); > > + cmd_tables = hsw_vebox_cmds; > > + cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds); > > /* VECS can use the same length_mask function as VCS */ > > ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask; > > break; > > @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring) > > BUG(); > > } > > > > - BUG_ON(!validate_cmds_sorted(ring)); > > + BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count)); > > BUG_ON(!validate_regs_sorted(ring)); > > + > > + BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count)); > > + > > + ring->needs_cmd_parser = true; > > +} > > + > > +/** > > + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields > > + * @ring: the ringbuffer to clean up > > + * > > + * Releases any resources related to command parsing that may have been > > + * initialized for the specified ring. > > + */ > > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring) > > +{ > > + if (!ring->needs_cmd_parser) > > + return; > > + > > + fini_hash_table(ring); > > } > > > > static const struct drm_i915_cmd_descriptor* > > -find_cmd_in_table(const struct drm_i915_cmd_table *table, > > +find_cmd_in_table(struct intel_ring_buffer *ring, > > u32 cmd_header) > > { > > - int i; > > + struct cmd_node *desc_node; > > > > - for (i = 0; i < table->count; i++) { > > - const struct drm_i915_cmd_descriptor *desc = &table->table[i]; > > + hash_for_each_possible(ring->cmd_hash, desc_node, node, > > + cmd_header & CMD_HASH_MASK) { > > + const struct drm_i915_cmd_descriptor *desc = desc_node->desc; > > u32 masked_cmd = desc->cmd.mask & cmd_header; > > u32 masked_value = desc->cmd.value & desc->cmd.mask; > > > > @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring, > > u32 cmd_header, > > struct drm_i915_cmd_descriptor *default_desc) > > { > > + const struct drm_i915_cmd_descriptor *desc; > > u32 mask; > > - int i; > > - > > - for (i = 0; i < ring->cmd_table_count; i++) { > > - const struct drm_i915_cmd_descriptor *desc; > > > > - desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header); > > - if (desc) > > - return desc; > > - } > > + desc = find_cmd_in_table(ring, cmd_header); > > + if (desc) > > + return desc; > > > > mask = ring->get_cmd_length_mask(cmd_header); > > if (!mask) > > @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer *ring) > > { > > struct drm_i915_private *dev_priv = ring->dev->dev_private; > > > > - /* No command tables indicates a platform without parsing */ > > - if (!ring->cmd_tables) > > + if (!ring->needs_cmd_parser) > > return false; > > > > /* > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > > index 7d6acb4..8c8388f 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.h > > +++ b/drivers/gpu/drm/i915/i915_drv.h > > @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type); > > /* i915_cmd_parser.c */ > > int i915_cmd_parser_get_version(void); > > void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring); > > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring); > > bool i915_needs_cmd_parser(struct intel_ring_buffer *ring); > > int i915_parse_cmds(struct intel_ring_buffer *ring, > > struct drm_i915_gem_object *batch_obj, > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c > > index eb3dd26..06fa9a3 100644 > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c > > @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring) > > ring->cleanup(ring); > > > > cleanup_status_page(ring); > > + > > + i915_cmd_parser_fini_ring(ring); > > } > > > > static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n) > > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > > index 413cdc7..48daa91 100644 > > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > > @@ -1,6 +1,10 @@ > > #ifndef _INTEL_RINGBUFFER_H_ > > #define _INTEL_RINGBUFFER_H_ > > > > +#include <linux/hashtable.h> > > + > > +#define I915_CMD_HASH_ORDER 9 > > + > > /* > > * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use" > > * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use" > > @@ -164,12 +168,13 @@ struct intel_ring_buffer { > > volatile u32 *cpu_page; > > } scratch; > > > > + bool needs_cmd_parser; > > + > > /* > > - * Tables of commands the command parser needs to know about > > + * Table of commands the command parser needs to know about > > * for this ring. > > */ > > - const struct drm_i915_cmd_table *cmd_tables; > > - int cmd_table_count; > > + DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER); > > > > /* > > * Table of registers allowed in commands that read/write registers. > > -- > > 1.8.3.2 > > > > _______________________________________________ > > Intel-gfx mailing list > > Intel-gfx@xxxxxxxxxxxxxxxxxxxxx > > http://lists.freedesktop.org/mailman/listinfo/intel-gfx > > -- > Daniel Vetter > Software Engineer, Intel Corporation > +41 (0) 79 365 57 48 - http://blog.ffwll.ch _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx