On 03/05/2017 12:37, Chris Wilson wrote:
add/remove: 1/1 grow/shrink: 5/4 up/down: 391/-578 (-187)
function old new delta
execlists_submit_ports 262 471 +209
port_assign.isra - 136 +136
capture 6344 6359 +15
reset_common_ring 438 452 +14
execlists_submit_request 228 238 +10
gen8_init_common_ring 334 341 +7
intel_engine_is_idle 106 105 -1
i915_engine_info 2314 2290 -24
__i915_gem_set_wedged_BKL 485 411 -74
intel_lrc_irq_handler 1789 1604 -185
execlists_update_context 294 - -294
The most important change there is the improve to the
intel_lrc_irq_handler and excclist_submit_ports (net improvement since
execlists_update_context is now inlined).
v2: Use the port_api() for guc as well (even though currently we do not
pack any counters in there, yet) and hide all port->request_count inside
the helpers.
Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
drivers/gpu/drm/i915/i915_debugfs.c | 32 ++++----
drivers/gpu/drm/i915/i915_gem.c | 6 +-
drivers/gpu/drm/i915/i915_gpu_error.c | 13 +++-
drivers/gpu/drm/i915/i915_guc_submission.c | 32 +++++---
drivers/gpu/drm/i915/intel_engine_cs.c | 2 +-
drivers/gpu/drm/i915/intel_lrc.c | 117 ++++++++++++++++-------------
drivers/gpu/drm/i915/intel_ringbuffer.h | 10 ++-
7 files changed, 122 insertions(+), 90 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..0b5d7142d8d9 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
if (i915.enable_execlists) {
u32 ptr, read, write;
struct rb_node *rb;
+ unsigned int idx;
seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3332,8 +3333,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
if (read > write)
write += GEN8_CSB_ENTRIES;
while (read < write) {
- unsigned int idx = ++read % GEN8_CSB_ENTRIES;
-
+ idx = ++read % GEN8_CSB_ENTRIES;
seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n",
idx,
I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)),
@@ -3341,21 +3341,19 @@ static int i915_engine_info(struct seq_file *m, void *unused)
}
rcu_read_lock();
- rq = READ_ONCE(engine->execlist_port[0].request);
- if (rq) {
- seq_printf(m, "\t\tELSP[0] count=%d, ",
- engine->execlist_port[0].count);
- print_request(m, rq, "rq: ");
- } else {
- seq_printf(m, "\t\tELSP[0] idle\n");
- }
- rq = READ_ONCE(engine->execlist_port[1].request);
- if (rq) {
- seq_printf(m, "\t\tELSP[1] count=%d, ",
- engine->execlist_port[1].count);
- print_request(m, rq, "rq: ");
- } else {
- seq_printf(m, "\t\tELSP[1] idle\n");
+ for (idx = 0; idx < ARRAY_SIZE(engine->execlist_port); idx++) {
+ unsigned int count;
+
+ rq = port_unpack(&engine->execlist_port[idx],
+ &count);
+ if (rq) {
+ seq_printf(m, "\t\tELSP[%d] count=%d, ",
+ idx, count);
+ print_request(m, rq, "rq: ");
+ } else {
+ seq_printf(m, "\t\tELSP[%d] idle\n",
+ idx);
+ }
}
rcu_read_unlock();
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f79079f2e265..c1df4b9d2661 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3038,12 +3038,14 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
*/
if (i915.enable_execlists) {
+ struct execlist_port *port = engine->execlist_port;
unsigned long flags;
+ unsigned int n;
spin_lock_irqsave(&engine->timeline->lock, flags);
- i915_gem_request_put(engine->execlist_port[0].request);
- i915_gem_request_put(engine->execlist_port[1].request);
+ for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
+ i915_gem_request_put(port_request(&port[n]));
memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
engine->execlist_queue = RB_ROOT;
engine->execlist_first = NULL;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ec526d92f908..e18f350bc364 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1324,12 +1324,17 @@ static void engine_record_requests(struct intel_engine_cs *engine,
static void error_record_engine_execlists(struct intel_engine_cs *engine,
struct drm_i915_error_engine *ee)
{
+ const struct execlist_port *port = engine->execlist_port;
unsigned int n;
- for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
- if (engine->execlist_port[n].request)
- record_request(engine->execlist_port[n].request,
- &ee->execlist[n]);
+ for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
+ struct drm_i915_gem_request *rq = port_request(&port[n]);
+
+ if (!rq)
+ break;
+
+ record_request(rq, &ee->execlist[n]);
+ }
}
static void record_context(struct drm_i915_error_context *e,
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 7e85b5ab8ae2..62d3831a8a0d 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -653,10 +653,22 @@ static void nested_enable_signaling(struct drm_i915_gem_request *rq)
spin_unlock(&rq->lock);
}
+static void port_assign(struct execlist_port *port,
+ struct drm_i915_gem_request *rq)
+{
+ GEM_BUG_ON(rq == port_request(port));
+
+ if (port_isset(port))
+ i915_gem_request_put(port_request(port));
+
+ port_set(port, i915_gem_request_get(rq));
+ nested_enable_signaling(rq);
+}
+
static bool i915_guc_dequeue(struct intel_engine_cs *engine)
{
struct execlist_port *port = engine->execlist_port;
- struct drm_i915_gem_request *last = port[0].request;
+ struct drm_i915_gem_request *last = port_request(&port[0]);
struct rb_node *rb;
bool submit = false;
@@ -670,8 +682,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
if (port != engine->execlist_port)
break;
- i915_gem_request_assign(&port->request, last);
- nested_enable_signaling(last);
+ port_assign(port, last);
port++;
}
@@ -686,8 +697,7 @@ static bool i915_guc_dequeue(struct intel_engine_cs *engine)
submit = true;
}
if (submit) {
- i915_gem_request_assign(&port->request, last);
- nested_enable_signaling(last);
+ port_assign(port, last);
engine->execlist_first = rb;
}
spin_unlock_irq(&engine->timeline->lock);
@@ -703,17 +713,19 @@ static void i915_guc_irq_handler(unsigned long data)
bool submit;
do {
- rq = port[0].request;
+ rq = port_request(&port[0]);
while (rq && i915_gem_request_completed(rq)) {
trace_i915_gem_request_out(rq);
i915_gem_request_put(rq);
- port[0].request = port[1].request;
- port[1].request = NULL;
- rq = port[0].request;
+
+ port[0] = port[1];
+ memset(&port[1], 0, sizeof(port[1]));
+
+ rq = port_request(&port[0]);
}
submit = false;
- if (!port[1].request)
+ if (!port_count(&port[1]))
submit = i915_guc_dequeue(engine);
} while (submit);
}
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 6d3d83876da9..24659788e7a3 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1230,7 +1230,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
return false;
/* Both ports drained, no more ELSP submission? */
- if (engine->execlist_port[0].request)
+ if (port_request(&engine->execlist_port[0]))
return false;
/* Ring stopped? */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0909549ad320..9f7b6112c53a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -337,39 +337,32 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
static void execlists_submit_ports(struct intel_engine_cs *engine)
{
- struct drm_i915_private *dev_priv = engine->i915;
struct execlist_port *port = engine->execlist_port;
u32 __iomem *elsp =
- dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
- u64 desc[2];
-
- GEM_BUG_ON(port[0].count > 1);
- if (!port[0].count)
- execlists_context_status_change(port[0].request,
- INTEL_CONTEXT_SCHEDULE_IN);
- desc[0] = execlists_update_context(port[0].request);
- GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0]));
- port[0].count++;
-
- if (port[1].request) {
- GEM_BUG_ON(port[1].count);
- execlists_context_status_change(port[1].request,
- INTEL_CONTEXT_SCHEDULE_IN);
- desc[1] = execlists_update_context(port[1].request);
- GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1]));
- port[1].count = 1;
- } else {
- desc[1] = 0;
- }
- GEM_BUG_ON(desc[0] == desc[1]);
+ engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+ unsigned int n;
- /* You must always write both descriptors in the order below. */
- writel(upper_32_bits(desc[1]), elsp);
- writel(lower_32_bits(desc[1]), elsp);
+ for (n = ARRAY_SIZE(engine->execlist_port); n--; ) {
+ struct drm_i915_gem_request *rq;
+ unsigned int count;
+ u64 desc;
+
+ rq = port_unpack(&port[n], &count);
+ if (rq) {
+ GEM_BUG_ON(count > !n);
+ if (!count++)
+ execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
+ port_set(&port[n], port_pack(rq, count));
+ desc = execlists_update_context(rq);
+ GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
+ } else {
+ GEM_BUG_ON(!n);
+ desc = 0;
+ }
- writel(upper_32_bits(desc[0]), elsp);
- /* The context is automatically loaded after the following */
- writel(lower_32_bits(desc[0]), elsp);
+ writel(upper_32_bits(desc), elsp);
+ writel(lower_32_bits(desc), elsp);
+ }
}
static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
@@ -390,6 +383,17 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
return true;
}
+static void port_assign(struct execlist_port *port,
+ struct drm_i915_gem_request *rq)
+{
+ GEM_BUG_ON(rq == port_request(port));
+
+ if (port_isset(port))
+ i915_gem_request_put(port_request(port));
+
+ port_set(port, port_pack(i915_gem_request_get(rq), port_count(port)));
+}
Reason for not having one implementation of port_assign with
enable_nested_signalling outside it in the guc case?
+
static void execlists_dequeue(struct intel_engine_cs *engine)
{
struct drm_i915_gem_request *last;
@@ -397,7 +401,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
struct rb_node *rb;
bool submit = false;
- last = port->request;
+ last = port_request(port);
if (last)
/* WaIdleLiteRestore:bdw,skl
* Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
@@ -407,7 +411,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
*/
last->tail = last->wa_tail;
- GEM_BUG_ON(port[1].request);
+ GEM_BUG_ON(port_isset(&port[1]));
/* Hardware submission is through 2 ports. Conceptually each port
* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
@@ -464,7 +468,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
GEM_BUG_ON(last->ctx == cursor->ctx);
- i915_gem_request_assign(&port->request, last);
+ if (submit)
+ port_assign(port, last);
Optimisation? GEM_BUG_ON(last != port_request(port))?
port++;
}
@@ -479,7 +484,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
submit = true;
}
if (submit) {
- i915_gem_request_assign(&port->request, last);
+ port_assign(port, last);
engine->execlist_first = rb;
}
spin_unlock_irq(&engine->timeline->lock);
@@ -488,16 +493,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
execlists_submit_ports(engine);
}
-static bool execlists_elsp_idle(struct intel_engine_cs *engine)
-{
- return !engine->execlist_port[0].request;
-}
-
static bool execlists_elsp_ready(const struct intel_engine_cs *engine)
{
const struct execlist_port *port = engine->execlist_port;
- return port[0].count + port[1].count < 2;
+ return port_count(&port[0]) + port_count(&port[1]) < 2;
}
/*
@@ -547,7 +547,9 @@ static void intel_lrc_irq_handler(unsigned long data)
tail = GEN8_CSB_WRITE_PTR(head);
head = GEN8_CSB_READ_PTR(head);
while (head != tail) {
+ struct drm_i915_gem_request *rq;
unsigned int status;
+ unsigned int count;
if (++head == GEN8_CSB_ENTRIES)
head = 0;
@@ -577,20 +579,24 @@ static void intel_lrc_irq_handler(unsigned long data)
GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) !=
port[0].context_id);
- GEM_BUG_ON(port[0].count == 0);
- if (--port[0].count == 0) {
+ rq = port_unpack(&port[0], &count);
+ GEM_BUG_ON(count == 0);
+ if (--count == 0) {
If you changed this to be:
count--;
port_set(...);
if (count > 0)
continue;
It would be perhaps a bit more readable, but a potentially useless
port_set on the other hand. Not sure.
GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
- GEM_BUG_ON(!i915_gem_request_completed(port[0].request));
- execlists_context_status_change(port[0].request,
- INTEL_CONTEXT_SCHEDULE_OUT);
+ GEM_BUG_ON(!i915_gem_request_completed(rq));
+ execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
+
+ trace_i915_gem_request_out(rq);
+ i915_gem_request_put(rq);
- trace_i915_gem_request_out(port[0].request);
- i915_gem_request_put(port[0].request);
port[0] = port[1];
memset(&port[1], 0, sizeof(port[1]));
+ } else {
+ port_set(&port[0], port_pack(rq, count));
}
- GEM_BUG_ON(port[0].count == 0 &&
+ /* After the final element, the hw should be idle */
+ GEM_BUG_ON(port_count(&port[0]) == 0 &&
!(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
}
@@ -1148,6 +1154,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
struct drm_i915_private *dev_priv = engine->i915;
struct execlist_port *port = engine->execlist_port;
unsigned int n;
+ bool submit;
int ret;
ret = intel_mocs_init_engine(engine);
@@ -1169,19 +1176,21 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
/* After a GPU reset, we may have requests to replay */
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+ submit = false;
for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
- if (!port[n].request)
+ if (!port_isset(&port[n]))
break;
DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
engine->name, n,
- port[n].request->global_seqno);
+ port_request(&port[n])->global_seqno);
/* Discard the current inflight count */
- port[n].count = 0;
+ port_set(&port[n], port_request(&port[n]));
+ submit = true;
}
- if (!i915.enable_guc_submission && !execlists_elsp_idle(engine))
+ if (submit && !i915.enable_guc_submission)
execlists_submit_ports(engine);
return 0;
@@ -1259,13 +1268,13 @@ static void reset_common_ring(struct intel_engine_cs *engine,
intel_ring_update_space(request->ring);
/* Catch up with any missed context-switch interrupts */
- if (request->ctx != port[0].request->ctx) {
- i915_gem_request_put(port[0].request);
+ if (request->ctx != port_request(&port[0])->ctx) {
+ i915_gem_request_put(port_request(&port[0]));
port[0] = port[1];
memset(&port[1], 0, sizeof(port[1]));
}
- GEM_BUG_ON(request->ctx != port[0].request->ctx);
+ GEM_BUG_ON(request->ctx != port_request(&port[0])->ctx);
/* Reset WaIdleLiteRestore:bdw,skl as well */
request->tail =
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4a599e3480a9..68765d45116b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -368,8 +368,14 @@ struct intel_engine_cs {
/* Execlists */
struct tasklet_struct irq_tasklet;
struct execlist_port {
- struct drm_i915_gem_request *request;
- unsigned int count;
+ struct drm_i915_gem_request *request_count;
+#define EXECLIST_COUNT_BITS 2
+#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS)
+#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS)
+#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS)
+#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS)
+#define port_set(p, packed) ((p)->request_count = (packed))
+#define port_isset(p) ((p)->request_count)
GEM_DEBUG_DECL(u32 context_id);
} execlist_port[2];
struct rb_root execlist_queue;
Looks correct but I am still having trouble accepting the structure
shrink and code savings are worth having less readable code.
Regards,
Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx