Re: [PATCH] drm/i915/bxt: Broxton decoupled MMIO

Praveen Paneri <praveen.paneri@xxxxxxxxx> · Mon, 19 Sep 2016 22:35:45 +0530






On Tuesday 06 September 2016 12:06 PM, Chris Wilson wrote:
On Tue, Sep 06, 2016 at 10:54:14AM +0530, Praveen Paneri wrote:
Decoupled MMIO is an alternative way to access forcewake domain
registers, which requires less cycles and avoids frequent software
forcewake.

How about when forcewake is already held? You'll note that we still
I will try to add the same check (for domain->wake_count) in the decoupled 
MMIO path as well, and do a direct register access if forcewake is 
already held.
require irq-spinlocks so the mmio access is still not great. And we
still will have to frequently take forcewake manually, apparently.

Do you have any statistics to say that we do reduce grabbing the fw
wakelock and that the busywait you add instead is negligible? You are
still using a 50ms timeout, so there is some doubt about "less cycles".
Sorry, I didn't find any such statistics with the Windows folks.
But I can do an exercise myself to measure the actual benefit of Decoupled 
MMIO. Could you please suggest a method to do that?

The feature definitely helps the HW with synchronization, as the cycles
are internally serialized in GT, and it eliminates the risk of hitting
certain hangs which exist in theory.

+/*
+ * Decoupled MMIO access for only 1 DWORD
+ */
+static void __gen9_decoupled_mmio_access(struct drm_i915_private *dev_priv,
+					 uint32_t reg, u32 *ptr_data,
+					 enum power_domains pd, int operation)
+{
+	u32 ctrl_reg_data = 0;
+
+	if (operation == GEN9_DECOUPLED_OP_WRITE)
+		__raw_i915_write32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0,
+				*ptr_data);
+
+	ctrl_reg_data |= reg;
+	ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
+	ctrl_reg_data |= (pd << GEN9_DECOUPLED_PD_SHIFT);
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
+	__raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1, ctrl_reg_data);
+
+	if (wait_for_atomic((__raw_i915_read32(dev_priv,
+			GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
+			FORCEWAKE_ACK_TIMEOUT_MS))
+		DRM_ERROR("Decoupled MMIO wait timed out\n");
+
+	if (operation == GEN9_DECOUPLED_OP_READ)
+		*ptr_data = __raw_i915_read32(dev_priv,
+				GEN9_DECOUPLED_REG0_DW0);
+}
+
  #define GEN2_READ_HEADER(x) \
  	u##x val = 0; \
  	assert_rpm_wakelock_held(dev_priv);
@@ -932,12 +997,27 @@ chv_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
  static u##x \
  gen9_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool trace) { \
  	enum forcewake_domains fw_engine; \
+	enum power_domains pd_engine; \
  	GEN6_READ_HEADER(x); \
-	fw_engine = __gen9_reg_read_fw_domains(offset); \
-	if (fw_engine) \
-		__force_wake_auto(dev_priv, fw_engine); \
-	val = __raw_i915_read##x(dev_priv, reg); \
-	GEN6_READ_FOOTER; \
+	pd_engine = __gen9_reg_read_power_domains(offset); \
+	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \

Move the platform test out of here (since it is already a per-platform
vfunc) and then skip the duplicated gen9 functions.

+		u32 *ptr_data = (u32 *) &val; \
+		unsigned i = 0; \
+		for (i = 0; i < x/32; i++) { \

And tidy up the reassignments.

+			__gen9_decoupled_mmio_access(dev_priv, \
+					(offset + i*4), \
+					ptr_data + i, \
+					pd_engine, \
+					GEN9_DECOUPLED_OP_READ); \
+			ptr_data++; \
+		} \
+	} else { \
+		fw_engine = __gen9_reg_read_fw_domains(offset); \
+		if (fw_engine) \
+			__force_wake_auto(dev_priv, fw_engine); \
+		val = __raw_i915_read##x(dev_priv, reg); \
+	} \
+		GEN6_READ_FOOTER; \

Misleading indentation.

  }

  __gen9_read(8)
@@ -1101,11 +1181,26 @@ static void \
  gen9_write##x(struct drm_i915_private *dev_priv, i915_reg_t reg, u##x val, \
  		bool trace) { \
  	enum forcewake_domains fw_engine; \
+	enum power_domains pd_engine; \
  	GEN6_WRITE_HEADER; \
-	fw_engine = __gen9_reg_write_fw_domains(offset); \


-	if (fw_engine) \
-		__force_wake_auto(dev_priv, fw_engine); \
-	__raw_i915_write##x(dev_priv, reg, val); \
+	pd_engine = __gen9_reg_write_power_domains(offset); \
+	if (HAS_DECOUPLED_MMIO(dev_priv) && pd_engine && x%32 == 0) { \
+		u32 *ptr_data = (u32 *) &val; \
+		unsigned i = 0; \
+		for (i = 0; i < x/32; i++) { \
+			__gen9_decoupled_mmio_access(dev_priv, \
+					(offset + i*4), \
+					ptr_data + i, \
+					pd_engine, \
+					GEN9_DECOUPLED_OP_WRITE); \
+			ptr_data++; \
+		} \

This is scary for a 64bit write. They are assumed to be an atomic
transaction with hw - when they are not we encounter fun races where the
hardware operates on the intermediate state. Hence we avoid them.
Decoupled MMIO currently doesn't support a single 64-bit write. We can 
continue to use the existing method for 64-bit writes.
Thanks,
Praveen
-Chris

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx