Quoting Lionel Landwerlin (2019-07-09 10:32:02)
> +static u32 *save_register(struct drm_i915_private *i915, u32 *cs,
> +                          i915_reg_t reg, u32 offset, u32 dword_count)
> +{
> +        uint32_t d;
> +
> +        for (d = 0; d < dword_count; d++) {
> +                *cs++ = INTEL_GEN(i915) >= 8 ?
> +                        MI_STORE_REGISTER_MEM_GEN8 : MI_STORE_REGISTER_MEM;
> +                *cs++ = i915_mmio_reg_offset(reg) + 4 * d;
> +                *cs++ = intel_gt_scratch_offset(&i915->gt, offset) + 4 * d;
> +                if (INTEL_GEN(i915) >= 8)
> +                        *cs++ = 0;

restore_register doesn't care about the extra MI_NOOP :)

> +        }
> +
> +        return cs;
> +}
> +
> +static u32 *restore_register(struct drm_i915_private *i915, u32 *cs,
> +                             i915_reg_t reg, u32 offset, u32 dword_count)
> +{
> +        uint32_t d;
> +
> +        for (d = 0; d < dword_count; d++) {
> +                *cs++ = INTEL_GEN(i915) >= 8 ?
> +                        MI_LOAD_REGISTER_MEM_GEN8 : MI_LOAD_REGISTER_MEM;
> +                *cs++ = i915_mmio_reg_offset(reg);
> +                *cs++ = intel_gt_scratch_offset(&i915->gt, offset);
> +                *cs++ = 0;
> +        }
> +
> +        return cs;
> +}
> +
> +static int alloc_noa_wait(struct drm_i915_private *i915)
> +{
> +        struct drm_i915_gem_object *bo;
> +        struct i915_vma *vma;
> +        u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay), delay_ticks;

I would

        const u64 delay_ticks = foo(i915);

That would save the distraction later in the middle of CS.
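A minimal sketch of that hoist, for illustration (the helper name is
hypothetical; the conversion itself is the one currently open-coded
mid-batch further down):

static u64 noa_wait_delay_ticks(struct drm_i915_private *i915)
{
        u64 delay_ns = atomic64_read(&i915->perf.oa.noa_programming_delay);

        /*
         * (2^64 - 1) minus the delay converted from nanoseconds to CS
         * timestamp ticks: adding the measured tick delta to this
         * constant sets the ALU carry flag exactly when the delay has
         * elapsed.
         */
        return 0xffffffffffffffff -
                DIV64_U64_ROUND_UP(delay_ns *
                                   RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
                                   1000000ull);
}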
> +        u32 *batch, *ts0, *cs, *jump;
> +        int ret, i;
> +        enum { START_TS, NOW_TS, DELTA_TS, JUMP_PREDICATE, DELTA_TARGET, N_CS_GPR };
> +
> +        bo = i915_gem_object_create_internal(i915, 4096);
> +        if (IS_ERR(bo)) {
> +                DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
> +                return PTR_ERR(bo);
> +        }
> +
> +        /*
> +         * We pin in GGTT because we jump into this buffer: multiple OA
> +         * config BOs will have a jump to this address and it needs to be
> +         * fixed during the lifetime of the i915/perf stream.
> +         */
> +        vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 4096, 0);
> +        if (IS_ERR(vma)) {
> +                ret = PTR_ERR(vma);
> +                goto err_unref;
> +        }
> +
> +        batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
> +        if (IS_ERR(batch)) {
> +                ret = PTR_ERR(batch);
> +                goto err_unpin;
> +        }
> +
> +        /* Save registers. */
> +        for (i = 0; i < N_CS_GPR; i++) {
> +                cs = save_register(i915, cs, HSW_CS_GPR(i),
> +                                   INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
> +        }
> +        cs = save_register(i915, cs, MI_PREDICATE_RESULT_1,
> +                           INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
> +
> +        /* First timestamp snapshot location. */
> +        ts0 = cs;
> +
> +        /*
> +         * Initial snapshot of the timestamp register to implement the wait.
> +         * We work with 32b values, so clear out the top 32 bits of the
> +         * register because the ALU works on 64 bits.
> +         */
> +        *cs++ = MI_LOAD_REGISTER_IMM(1);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS)) + 4;
> +        *cs++ = 0;
> +        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +        *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(START_TS));
> +
> +        /*
> +         * This is the location we're going to jump back into until the
> +         * required amount of time has passed.
> +         */
> +        jump = cs;
> +
> +        /*
> +         * Take another snapshot of the timestamp register. Take care to
> +         * clear up the top 32 bits of CS_GPR(1) as we're using it for
> +         * other operations below.
> +         */
> +        *cs++ = MI_LOAD_REGISTER_IMM(1);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS)) + 4;
> +        *cs++ = 0;
> +        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +        *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(RENDER_RING_BASE));
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(NOW_TS));
> +
> +        /*
> +         * Do a diff between the 2 timestamps and store the result back
> +         * into CS_GPR(DELTA_TS).
> +         */
> +        *cs++ = MI_MATH(5);
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(NOW_TS));
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(START_TS));
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_SUB, 0, 0);
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(DELTA_TS), MI_ALU_SRC_ACCU);
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_STORE, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
> +
> +        /*
> +         * Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
> +         * timestamp has rolled over the 32 bits) into the predicate
> +         * register to be used for the predicated jump.
> +         */
> +        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
> +        *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
> +
> +        /* Restart from the beginning if we had timestamps roll over. */
> +        *cs++ = (INTEL_GEN(i915) < 8 ?
> +                 MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
> +                MI_BATCH_PREDICATE;
> +        *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
> +        *cs++ = 0;
> +
> +        /*
> +         * Now add the diff between the two previous timestamps to:
> +         *   ((1 << 64) - 1) - delay_ns
> +         *
> +         * When the Carry Flag contains 1 this means the elapsed time is
> +         * longer than the expected delay, and we can exit the wait loop.
> +         */
> +        delay_ticks = 0xffffffffffffffff -
> +                DIV64_U64_ROUND_UP(delay_ns *
> +                                   RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
> +                                   1000000ull);
> +        *cs++ = MI_LOAD_REGISTER_IMM(2);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET));
> +        *cs++ = lower_32_bits(delay_ticks);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(DELTA_TARGET)) + 4;
> +        *cs++ = upper_32_bits(delay_ticks);
> +
> +        *cs++ = MI_MATH(4);
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCA, MI_ALU_SRC_REG(DELTA_TS));
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_LOAD, MI_ALU_SRC_SRCB, MI_ALU_SRC_REG(DELTA_TARGET));
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_ADD, 0, 0);
> +        *cs++ = MI_ALU_OP(MI_ALU_OP_STOREINV, MI_ALU_SRC_REG(JUMP_PREDICATE), MI_ALU_SRC_CF);
> +
> +        /*
> +         * Transfer the result into the predicate register to be used for
> +         * the predicated jump.
> +         */
> +        *cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
> +        *cs++ = i915_mmio_reg_offset(HSW_CS_GPR(JUMP_PREDICATE));
> +        *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
> +
> +        /* Predicate the jump. */
> +        *cs++ = (INTEL_GEN(i915) < 8 ?
> +                 MI_BATCH_BUFFER_START : MI_BATCH_BUFFER_START_GEN8) |
> +                MI_BATCH_PREDICATE;
> +        *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
> +        *cs++ = 0;
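To make the loop-exit arithmetic concrete: DELTA_TARGET holds
(2^64 - 1) minus the programmed delay in ticks, so the MI_MATH ADD
carries out of 64 bits exactly when DELTA_TS exceeds the delay. For
example, with a 100 tick delay: while DELTA_TS <= 100, the sum is at
most 2^64 - 1, there is no carry, STOREINV writes 1 into
JUMP_PREDICATE and the predicated MI_BATCH_BUFFER_START loops back to
the second timestamp snapshot; once DELTA_TS reaches 101 the sum
wraps, the carry flag is set, STOREINV clears the predicate and
execution falls through to the register restore below.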
> +
> +        /* Restore registers. */
> +        for (i = 0; i < N_CS_GPR; i++) {
> +                cs = restore_register(i915, cs, HSW_CS_GPR(i),
> +                                      INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
> +        }
> +        cs = restore_register(i915, cs, MI_PREDICATE_RESULT_1,
> +                              INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
> +
> +        /* And return to the ring. */
> +        *cs++ = MI_BATCH_BUFFER_END;
> +
> +        GEM_BUG_ON((cs - batch) > (PAGE_SIZE / sizeof(*batch)));
> +
> +        i915_gem_object_flush_map(bo);
> +        i915_gem_object_unpin_map(bo);
> +
> +        i915->perf.oa.noa_wait = vma;
> +
> +        return 0;
> +
> +err_unpin:
> +        __i915_vma_unpin(vma);
> +
> +err_unref:
> +        i915_gem_object_put(bo);
> +
> +        return ret;
> +}

Preferably with the nit above,
Reviewed-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
-Chris