Currently an MMIO switch between two vGPUs has to switch to the host first
and then to the target vGPU, which costs one extra MMIO save/restore pass.
MMIO reads/writes are usually time-consuming, and there are many MOCS
registers that need to be saved/restored during a vGPU switch. Combining
switch_mmio_to_host() and switch_mmio_to_vgpu() into a single switch_mmio()
removes one MMIO save/restore pass, which reduces CPU utilization and
improves performance when multiple VMs run heavy workloads.

Signed-off-by: Weinan Li <weinan.z.li@xxxxxxxxx>
---
 drivers/gpu/drm/i915/gvt/render.c | 212 ++++++++++++++++----------------------
 drivers/gpu/drm/i915/gvt/trace.h  |  15 +--
 2 files changed, 95 insertions(+), 132 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/render.c b/drivers/gpu/drm/i915/gvt/render.c
index dac12c2..ec1e60d 100644
--- a/drivers/gpu/drm/i915/gvt/render.c
+++ b/drivers/gpu/drm/i915/gvt/render.c
@@ -190,9 +190,10 @@ static void handle_tlb_pending_event(struct intel_vgpu *vgpu, int ring_id)
 	gvt_dbg_core("invalidate TLB for ring %d\n", ring_id);
 }
 
-static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
+static void switch_mocs(struct intel_vgpu *pre, struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	struct drm_i915_private *dev_priv;
 	i915_reg_t offset, l3_offset;
 	u32 regs[] = {
 		[RCS] = 0xc800,
@@ -203,54 +204,44 @@ static void load_mocs(struct intel_vgpu *vgpu, int ring_id)
 	};
 	int i;
 
+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
 		return;
 
 	offset.reg = regs[ring_id];
-	for (i = 0; i < 64; i++) {
-		gen9_render_mocs[ring_id][i] = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, vgpu_vreg(vgpu, offset));
-		offset.reg += 4;
-	}
-
-	if (ring_id == RCS) {
-		l3_offset.reg = 0xb020;
-		for (i = 0; i < 32; i++) {
-			gen9_render_mocs_L3[i] = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, vgpu_vreg(vgpu, l3_offset));
-			l3_offset.reg += 4;
-		}
-	}
-}
 
-static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	i915_reg_t offset, l3_offset;
-	u32 regs[] = {
-		[RCS] = 0xc800,
-		[VCS] = 0xc900,
-		[VCS2] = 0xca00,
-		[BCS] = 0xcc00,
-		[VECS] = 0xcb00,
-	};
-	int i;
+	for (i = 0; i < 64; i++) {
+		if (pre)
+			vgpu_vreg(pre, offset) =
+				I915_READ_FW(offset);
+		else
+			gen9_render_mocs[ring_id][i] =
+				I915_READ_FW(offset);
 
-	if (WARN_ON(ring_id >= ARRAY_SIZE(regs)))
-		return;
+		if (next)
+			I915_WRITE_FW(offset, vgpu_vreg(next, offset));
+		else
+			I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
 
-	offset.reg = regs[ring_id];
-	for (i = 0; i < 64; i++) {
-		vgpu_vreg(vgpu, offset) = I915_READ_FW(offset);
-		I915_WRITE_FW(offset, gen9_render_mocs[ring_id][i]);
 		offset.reg += 4;
 	}
 
 	if (ring_id == RCS) {
 		l3_offset.reg = 0xb020;
 		for (i = 0; i < 32; i++) {
-			vgpu_vreg(vgpu, l3_offset) = I915_READ_FW(l3_offset);
-			I915_WRITE_FW(l3_offset, gen9_render_mocs_L3[i]);
+			if (pre)
+				vgpu_vreg(pre, l3_offset) =
+					I915_READ_FW(l3_offset);
+			else
+				gen9_render_mocs_L3[i] =
+					I915_READ_FW(l3_offset);
+			if (next)
+				I915_WRITE_FW(l3_offset,
+					      vgpu_vreg(next, l3_offset));
+			else
+				I915_WRITE_FW(l3_offset,
+					      gen9_render_mocs_L3[i]);
+
 			l3_offset.reg += 4;
 		}
 	}
@@ -258,78 +249,25 @@ static void restore_mocs(struct intel_vgpu *vgpu, int ring_id)
 
 #define CTX_CONTEXT_CONTROL_VAL	0x03
 
-/* Switch ring mmio values (context) from host to a vgpu. */
-static void switch_mmio_to_vgpu(struct intel_vgpu *vgpu, int ring_id)
-{
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
-	struct intel_vgpu_submission *s = &vgpu->submission;
-	u32 *reg_state = s->shadow_ctx->engine[ring_id].lrc_reg_state;
-	u32 ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
-	u32 inhibit_mask =
-		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
-	i915_reg_t last_reg = _MMIO(0);
-	struct render_mmio *mmio;
-	u32 v;
-	int i, array_size;
-
-	if (IS_SKYLAKE(vgpu->gvt->dev_priv)
-		|| IS_KABYLAKE(vgpu->gvt->dev_priv)) {
-		mmio = gen9_render_mmio_list;
-		array_size = ARRAY_SIZE(gen9_render_mmio_list);
-		load_mocs(vgpu, ring_id);
-	} else {
-		mmio = gen8_render_mmio_list;
-		array_size = ARRAY_SIZE(gen8_render_mmio_list);
-	}
-
-	for (i = 0; i < array_size; i++, mmio++) {
-		if (mmio->ring_id != ring_id)
-			continue;
-
-		mmio->value = I915_READ_FW(mmio->reg);
-
-		/*
-		 * if it is an inhibit context, load in_context mmio
-		 * into HW by mmio write. If it is not, skip this mmio
-		 * write.
-		 */
-		if (mmio->in_context &&
-		    (ctx_ctrl & inhibit_mask) != inhibit_mask)
-			continue;
-
-		if (mmio->mask)
-			v = vgpu_vreg(vgpu, mmio->reg) | (mmio->mask << 16);
-		else
-			v = vgpu_vreg(vgpu, mmio->reg);
-
-		I915_WRITE_FW(mmio->reg, v);
-		last_reg = mmio->reg;
-
-		trace_render_mmio(vgpu->id, "load",
-				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
-	}
-
-	/* Make sure the swiched MMIOs has taken effect. */
-	if (likely(INTEL_GVT_MMIO_OFFSET(last_reg)))
-		I915_READ_FW(last_reg);
-
-	handle_tlb_pending_event(vgpu, ring_id);
-}
-
-/* Switch ring mmio values (context) from vgpu to host. */
-static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
+/* Switch ring mmio values (context). */
+static void switch_mmio(struct intel_vgpu *pre, struct intel_vgpu *next,
+			int ring_id)
 {
-	struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
+	struct drm_i915_private *dev_priv;
 	struct render_mmio *mmio;
 	i915_reg_t last_reg = _MMIO(0);
-	u32 v;
+	u32 old_v, new_v;
 	int i, array_size;
+	struct intel_vgpu_submission *s;
+	u32 *reg_state, ctx_ctrl;
+	u32 inhibit_mask =
+		_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
 
+	dev_priv = pre ? pre->gvt->dev_priv : next->gvt->dev_priv;
 	if (IS_SKYLAKE(dev_priv) || IS_KABYLAKE(dev_priv)) {
 		mmio = gen9_render_mmio_list;
 		array_size = ARRAY_SIZE(gen9_render_mmio_list);
-		restore_mocs(vgpu, ring_id);
+		switch_mocs(pre, next, ring_id);
 	} else {
 		mmio = gen8_render_mmio_list;
 		array_size = ARRAY_SIZE(gen8_render_mmio_list);
@@ -338,29 +276,61 @@ static void switch_mmio_to_host(struct intel_vgpu *vgpu, int ring_id)
 	for (i = 0; i < array_size; i++, mmio++) {
 		if (mmio->ring_id != ring_id)
 			continue;
+		// save
+		if (pre) {
+			vgpu_vreg(pre, mmio->reg) = I915_READ_FW(mmio->reg);
+			if (mmio->mask)
+				vgpu_vreg(pre, mmio->reg) &=
+					~(mmio->mask << 16);
+			old_v = vgpu_vreg(pre, mmio->reg);
+		} else {
+			mmio->value = I915_READ_FW(mmio->reg);
+			old_v = mmio->value;
+		}
 
-		vgpu_vreg(vgpu, mmio->reg) = I915_READ_FW(mmio->reg);
-
-		if (mmio->mask) {
-			vgpu_vreg(vgpu, mmio->reg) &= ~(mmio->mask << 16);
-			v = mmio->value | (mmio->mask << 16);
-		} else
-			v = mmio->value;
-
-		if (mmio->in_context)
-			continue;
+		// restore
+		if (next) {
+			s = &next->submission;
+			reg_state =
+				s->shadow_ctx->engine[ring_id].lrc_reg_state;
+			ctx_ctrl = reg_state[CTX_CONTEXT_CONTROL_VAL];
+			/*
+			 * if it is an inhibit context, load in_context mmio
+			 * into HW by mmio write. If it is not, skip this mmio
+			 * write.
+			 */
+			if (mmio->in_context &&
+			    (ctx_ctrl & inhibit_mask) != inhibit_mask)
+				continue;
+
+			if (mmio->mask)
+				new_v = vgpu_vreg(next, mmio->reg) |
+					(mmio->mask << 16);
+			else
+				new_v = vgpu_vreg(next, mmio->reg);
+		} else {
+			if (mmio->in_context)
+				continue;
+			if (mmio->mask)
+				new_v = mmio->value | (mmio->mask << 16);
+			else
+				new_v = mmio->value;
+		}
 
-		I915_WRITE_FW(mmio->reg, v);
+		I915_WRITE_FW(mmio->reg, new_v);
 		last_reg = mmio->reg;
-
-		trace_render_mmio(vgpu->id, "restore",
+		trace_render_mmio(pre ? pre->id : 0,
+				  next ? next->id : 0, "switch",
 				  i915_mmio_reg_offset(mmio->reg),
-				  mmio->value, v);
+				  old_v, new_v);
 	}
 
 	/* Make sure the swiched MMIOs has taken effect. */
 	if (likely(INTEL_GVT_MMIO_OFFSET(last_reg)))
 		I915_READ_FW(last_reg);
+
+	if (next)
+		handle_tlb_pending_event(next, ring_id);
 }
 
 /**
@@ -391,16 +361,6 @@ void intel_gvt_switch_mmio(struct intel_vgpu *pre,
 	 * handle forcewake mannually.
 	 */
 	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-
-	/**
-	 * TODO: Optimize for vGPU to vGPU switch by merging
-	 * switch_mmio_to_host() and switch_mmio_to_vgpu().
-	 */
-	if (pre)
-		switch_mmio_to_host(pre, ring_id);
-
-	if (next)
-		switch_mmio_to_vgpu(next, ring_id);
-
+	switch_mmio(pre, next, ring_id);
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 }
diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h
index 8c15038..7a25115 100644
--- a/drivers/gpu/drm/i915/gvt/trace.h
+++ b/drivers/gpu/drm/i915/gvt/trace.h
@@ -330,13 +330,14 @@
 );
 
 TRACE_EVENT(render_mmio,
-	TP_PROTO(int id, char *action, unsigned int reg,
+	TP_PROTO(int old_id, int new_id, char *action, unsigned int reg,
 		unsigned int old_val, unsigned int new_val),
 
-	TP_ARGS(id, action, reg, new_val, old_val),
+	TP_ARGS(old_id, new_id, action, reg, new_val, old_val),
 
 	TP_STRUCT__entry(
-		__field(int, id)
+		__field(int, old_id)
+		__field(int, new_id)
 		__array(char, buf, GVT_TEMP_STR_LEN)
 		__field(unsigned int, reg)
 		__field(unsigned int, old_val)
@@ -344,15 +345,17 @@
 	),
 
 	TP_fast_assign(
-		__entry->id = id;
+		__entry->old_id = old_id;
+		__entry->new_id = new_id;
 		snprintf(__entry->buf, GVT_TEMP_STR_LEN, "%s", action);
 		__entry->reg = reg;
 		__entry->old_val = old_val;
 		__entry->new_val = new_val;
 	),
 
-	TP_printk("VM%u %s reg %x, old %08x new %08x\n",
-		__entry->id, __entry->buf, __entry->reg,
+	TP_printk("VM%u -> VM%u %s reg %x, old %08x new %08x\n",
+		__entry->old_id, __entry->new_id,
+		__entry->buf, __entry->reg,
 		__entry->old_val, __entry->new_val)
 );
-- 
1.9.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
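
As a rough illustration of the saving (not driver code; all names below are
hypothetical), the old path cost two MMIO reads and two writes per register,
one save/restore into the host-side copy and one out of it, while the
combined switch_mmio(pre, next) costs one read to save the previous vGPU's
value and one write to restore the next one's:

/*
 * Illustrative model only, not part of the patch: count MMIO accesses per
 * register for the old two-step switch (vGPU -> host -> vGPU) versus the
 * combined switch. All names here are hypothetical.
 */
#include <stdio.h>

struct mmio_cost {
	unsigned int reads;
	unsigned int writes;
};

/* old flow: switch_mmio_to_host(pre) followed by switch_mmio_to_vgpu(next) */
static void two_step_switch(struct mmio_cost *c)
{
	c->reads++;  c->writes++;   /* save pre's value, restore host value  */
	c->reads++;  c->writes++;   /* save host value, restore next's value */
}

/* new flow: one pass that saves pre and restores next directly */
static void combined_switch(struct mmio_cost *c)
{
	c->reads++;                 /* save pre's value     */
	c->writes++;                /* restore next's value */
}

int main(void)
{
	struct mmio_cost old_flow = {0, 0}, new_flow = {0, 0};
	int i, nregs = 64 + 32;     /* e.g. per-ring MOCS + RCS L3 MOCS */

	for (i = 0; i < nregs; i++) {
		two_step_switch(&old_flow);
		combined_switch(&new_flow);
	}
	printf("two-step: %u reads, %u writes\n", old_flow.reads, old_flow.writes);
	printf("combined: %u reads, %u writes\n", new_flow.reads, new_flow.writes);
	return 0;
}

For the ~96 MOCS/L3 registers touched on an RCS switch, that is roughly half
the register traffic, which is where the CPU-utilization win described in the
commit message comes from.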