On Tue, Nov 22, 2022 at 02:50:17PM -0800, Ceraolo Spurio, Daniele wrote: > > > On 11/22/2022 12:46 PM, Rodrigo Vivi wrote: > > On Mon, Nov 21, 2022 at 03:16:15PM -0800, Daniele Ceraolo Spurio wrote: > > > If the GSC was loaded, the only way to stop it during the driver unload > > > flow is to do a driver-FLR. > > > The driver-FLR is not the same as PCI config space FLR in that > > > it doesn't reset the SGUnit and doesn't modify the PCI config > > > space. Thus, it doesn't require a re-enumeration of the PCI BARs. > > > However, the driver-FLR does cause a memory wipe of graphics memory > > > on all discrete GPU platforms or a wipe limited to stolen memory > > > on the integrated GPU platforms. > > Nothing major or blocking, but a few thoughts: > > > > 1. Should we document this in the code, at least in a comment in the > > flr function? > > Sure, I'll add it in > > > 2. Should we call this driver_initiated_flr, aiming to reduce even more > > the ambiguity of it? > > ok > > > > > > We perform the FLR as the last action before releasing the MMIO bar, so > > > that we don't have to care about the consequences of the reset on the > > > unload flow. > > 3. should we try to implement this already in the gt_reset case as the > > last resort before wedging the gt? So we can already test this flow > > in the current platforms? > > This would be nice to have, but very complicated to implement. The fact that > FLR kills everything on the system, including resetting display and wiping > LMEM, means that we would need a new recovery path to re-initialize all > components. There are also potential questions on how to handle LMEM: do we > try to migrate it to SMEM before triggering the FLR (potentially via CPU > memcpy if the GT is dead), or do we just let it get wiped? > > The reason why I wanted the FLR to be the very last thing before releasing > MMIO access was exactly to not have to care about the recovery path ;) it makes sense indeed. 
> > Daniele > > > > > > Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx> > > > Signed-off-by: Alan Previn <alan.previn.teres.alexis@xxxxxxxxx> > > > --- > > > drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c | 9 +++++ > > > drivers/gpu/drm/i915/i915_reg.h | 3 ++ > > > drivers/gpu/drm/i915/intel_uncore.c | 45 +++++++++++++++++++++++ > > > drivers/gpu/drm/i915/intel_uncore.h | 13 +++++++ > > > 4 files changed, 70 insertions(+) > > > > > > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c > > > index 510fb47193ec..5dad3c19c445 100644 > > > --- a/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c > > > +++ b/drivers/gpu/drm/i915/gt/uc/intel_gsc_fw.c > > > @@ -173,6 +173,15 @@ int intel_gsc_fw_upload(struct intel_gsc_uc *gsc) > > > if (err) > > > goto fail; > > > + /* > > > + * Once the GSC FW is loaded, the only way to kill it on driver unload > > > + * is to do a driver FLR. Given this is a very disruptive action, we > > > + * want to do it as the last action before releasing the access to the > > > + * MMIO bar, which means we need to do it as part of the primary uncore > > > + * cleanup. 
> > > + */ > > > + intel_uncore_set_flr_on_fini(&gt->i915->uncore); > > > + > > > /* FW is not fully operational until we enable SW proxy */ > > > intel_uc_fw_change_status(gsc_fw, INTEL_UC_FIRMWARE_TRANSFERRED); > > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > > > index 8e1892d14774..60e55245200b 100644 > > > --- a/drivers/gpu/drm/i915/i915_reg.h > > > +++ b/drivers/gpu/drm/i915/i915_reg.h > > > @@ -118,6 +118,9 @@ > > > #define GU_CNTL _MMIO(0x101010) > > > #define LMEM_INIT REG_BIT(7) > > > +#define DRIVERFLR REG_BIT(31) > > > +#define GU_DEBUG _MMIO(0x101018) > > > +#define DRIVERFLR_STATUS REG_BIT(31) > > > #define GEN6_STOLEN_RESERVED _MMIO(0x1082C0) > > > #define GEN6_STOLEN_RESERVED_ADDR_MASK (0xFFF << 20) > > > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > > > index 8006a6c61466..c1befa33ff59 100644 > > > --- a/drivers/gpu/drm/i915/intel_uncore.c > > > +++ b/drivers/gpu/drm/i915/intel_uncore.c > > > @@ -2703,6 +2703,48 @@ void intel_uncore_prune_engine_fw_domains(struct intel_uncore *uncore, > > > } > > > } > > > +static void driver_flr(struct intel_uncore *uncore) > > > +{ > > > + struct drm_i915_private *i915 = uncore->i915; > > > + const unsigned int flr_timeout_ms = 3000; /* specs recommend a 3s wait */ > > > + int ret; > > > + > > > + drm_dbg(&i915->drm, "Triggering Driver-FLR\n"); > > > + > > > + /* > > > + * Make sure any pending FLR requests have cleared by waiting for the > > > + * FLR trigger bit to go to zero. Also clear GU_DEBUG's DRIVERFLR_STATUS > > > + * to make sure it's not still set from a prior attempt (it's a write to > > > + * clear bit). 
> > > + * Note that we should never be in a situation where a previous attempt > > > + * is still pending (unless the HW is totally dead), but better to be > > > + * safe in case something unexpected happens > > > + */ > > > + ret = intel_wait_for_register_fw(uncore, GU_CNTL, DRIVERFLR, 0, flr_timeout_ms); > > > + if (ret) { > > > + drm_err(&i915->drm, > > > + "Failed to wait for Driver-FLR bit to clear! %d\n", > > > + ret); > > > + return; > > > + } > > > + intel_uncore_write_fw(uncore, GU_DEBUG, DRIVERFLR_STATUS); > > > + > > > + /* Trigger the actual Driver-FLR */ > > > + intel_uncore_rmw_fw(uncore, GU_CNTL, 0, DRIVERFLR); > > > + > > > + ret = intel_wait_for_register_fw(uncore, GU_DEBUG, > > > + DRIVERFLR_STATUS, DRIVERFLR_STATUS, > > > + flr_timeout_ms); > > > + if (ret) { > > > + drm_err(&i915->drm, "wait for Driver-FLR completion failed! %d\n", ret); > > > + return; > > > + } > > > + > > > + intel_uncore_write_fw(uncore, GU_DEBUG, DRIVERFLR_STATUS); > > > + > > > + return; > > > +} > > > + > > > /* Called via drm-managed action */ > > > void intel_uncore_fini_mmio(struct drm_device *dev, void *data) > > > { > > > @@ -2716,6 +2758,9 @@ void intel_uncore_fini_mmio(struct drm_device *dev, void *data) > > > intel_uncore_fw_domains_fini(uncore); > > > iosf_mbi_punit_release(); > > > } > > > + > > > + if (intel_uncore_needs_flr_on_fini(uncore)) > > > + driver_flr(uncore); > > > } > > > /** > > > diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h > > > index 5449146a0624..a9fa0b11e7e4 100644 > > > --- a/drivers/gpu/drm/i915/intel_uncore.h > > > +++ b/drivers/gpu/drm/i915/intel_uncore.h > > > @@ -153,6 +153,7 @@ struct intel_uncore { > > > #define UNCORE_HAS_FPGA_DBG_UNCLAIMED BIT(1) > > > #define UNCORE_HAS_DBG_UNCLAIMED BIT(2) > > > #define UNCORE_HAS_FIFO BIT(3) > > > +#define UNCORE_NEEDS_FLR_ON_FINI BIT(4) > > > const struct intel_forcewake_range *fw_domains_table; > > > unsigned int fw_domains_table_entries; > > > @@ -223,6 
+224,18 @@ intel_uncore_has_fifo(const struct intel_uncore *uncore) > > > return uncore->flags & UNCORE_HAS_FIFO; > > > } > > > +static inline bool > > > +intel_uncore_needs_flr_on_fini(const struct intel_uncore *uncore) > > > +{ > > > + return uncore->flags & UNCORE_NEEDS_FLR_ON_FINI; > > > +} > > > + > > > +static inline bool > > > +intel_uncore_set_flr_on_fini(struct intel_uncore *uncore) > > > +{ > > > + return uncore->flags |= UNCORE_NEEDS_FLR_ON_FINI; > > > +} > > > + > > > void intel_uncore_mmio_debug_init_early(struct drm_i915_private *i915); > > > void intel_uncore_init_early(struct intel_uncore *uncore, > > > struct intel_gt *gt); > > > -- > > > 2.37.3 > > > >