From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> GCC cannot optimize calculations well when they are hidden in macros and assigned to temporary structures. We can cache the register in the ELSP write, and refactor reading of the CSB a bit to enable the compiler to do a better job. The code is still equally readable, but the generated body of the CSB read loop is 30% smaller, and since that loop runs at least once per interrupt, which in turn can fire tens or hundreds of thousands of times per second, this must be of some value. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> --- drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++------------- drivers/gpu/drm/i915/intel_lrc.h | 11 ++++++++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 3a23b9549f7b..67592f8354d6 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -361,8 +361,8 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, { struct intel_engine_cs *engine = rq0->engine; - struct drm_device *dev = engine->dev; - struct drm_i915_private *dev_priv = dev->dev_private; + struct drm_i915_private *dev_priv = rq0->i915; + i915_reg_t elsp_reg = RING_ELSP(engine); uint64_t desc[2]; if (rq1) { @@ -376,12 +376,12 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0, rq0->elsp_submitted++; /* You must always write both descriptors in the order below. 
*/ - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1])); - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1])); + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[1])); + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[1])); - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0])); + I915_WRITE_FW(elsp_reg, upper_32_bits(desc[0])); /* The context is automatically loaded after the following */ - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0])); + I915_WRITE_FW(elsp_reg, lower_32_bits(desc[0])); /* ELSP is a wo register, use another nearby reg for posting */ POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine)); @@ -517,21 +517,19 @@ execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id) } static u32 -get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer, - u32 *context_id) +get_context_status(struct drm_i915_private *dev_priv, u32 csb_base, + unsigned int read_pointer, u32 *context_id) { - struct drm_i915_private *dev_priv = engine->dev->dev_private; u32 status; read_pointer %= GEN8_CSB_ENTRIES; - status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer)); + status = I915_READ_FW(RING_CSB_LO(csb_base, read_pointer)); if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) return 0; - *context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine, - read_pointer)); + *context_id = I915_READ_FW(RING_CSB_HI(csb_base, read_pointer)); return status; } @@ -548,6 +546,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->dev->dev_private; u32 status_pointer; unsigned int read_pointer, write_pointer; + u32 csb_base = RING_CSB_BASE(engine); u32 csb[GEN8_CSB_ENTRIES][2]; unsigned int csb_read = 0, i; unsigned int submit_contexts = 0; @@ -565,7 +564,8 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine) while (read_pointer < write_pointer) { if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES)) break; - csb[csb_read][0] = get_context_status(engine, ++read_pointer, + 
csb[csb_read][0] = get_context_status(dev_priv, csb_base, + ++read_pointer, &csb[csb_read][1]); csb_read++; } diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index a17cb12221ba..6690d93d603f 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -34,9 +34,14 @@ #define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH (1 << 3) #define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT (1 << 0) #define CTX_CTRL_RS_CTX_ENABLE (1 << 1) -#define RING_CONTEXT_STATUS_BUF_LO(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8) -#define RING_CONTEXT_STATUS_BUF_HI(ring, i) _MMIO((ring)->mmio_base + 0x370 + (i) * 8 + 4) -#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0) + +#define RING_CSB_BASE(ring) ((ring)->mmio_base + 0x370) +#define RING_CSB_LO(csb_base, i) _MMIO((csb_base) + (i) * 8) +#define RING_CSB_HI(csb_base, i) _MMIO((csb_base) + (i) * 8 + 4) + +#define RING_CONTEXT_STATUS_BUF_LO(ring, i) RING_CSB_LO(RING_CSB_BASE(ring), i) +#define RING_CONTEXT_STATUS_BUF_HI(ring, i) RING_CSB_HI(RING_CSB_BASE(ring), i) +#define RING_CONTEXT_STATUS_PTR(ring) _MMIO((ring)->mmio_base + 0x3a0) /* The docs specify that the write pointer wraps around after 5h, "After status * is written out to the last available status QW at offset 5h, this pointer -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx