On Tue, 2020-10-13 at 13:02 +0200, Janusz Krzysztofik wrote: > Removal of igt_fork_hang_detector() from local_i915_healthcheck() by > commit 1fbd127bd4e1 ("core_hotplug: Teach the healthcheck how to > check > execution status") resulted in unintentional removal of an important > though implicit test feature of detecting, reporting as failures and > recovering from potential misses of debugfs subdirs of hot rebound > i915 > devices. As a consequence, unexpected failures or skips of other > unrelated but subsequently run tests have been observed on CI. > > On the other hand, removal of the debugfs issue detection and subtest > failures from right after hot rebinding the driver enabled the better > version of the i915 GPU health check fixed by the same commit to > detect > and report other issues potentially triggered by device late close. > > Restore the missing test feature by introducing an explicit sysfs > health check, not limited to i915, that verifies existence of device > sysfs and debugfs areas. Also, split hotrebind/hotreplug scenarios > into a pair of each, one that performs the health check right after > hot > rebind/replug and delegates the device late close step to a follow up > recovery phase, while the other one checks device health only after > late closing it. > > v2: Give GPU health check a better chance to detect issues - run it > before sysfs health checks. > v3: Run sysfs health check on any hardware, not only i915. > > Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@xxxxxxxxxxxxxxx > > > Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > Even if the root cause has occurred to be sitting on the IGT lib side > and has been already fixed by commit 937526629344 ("lib: Don't fail > debugfs lookup on an expected absent drm device"), I think we should > restore the debugfs health check just in case new issues with similar > symptoms appear in the future and start affecting subsequent tests > silently. > > Thanks, > Janusz > > tests/core_hotunplug.c | 68 ++++++++++++++++++++++++++++++++++++++ > ---- > 1 file changed, 62 insertions(+), 6 deletions(-) > > diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c > index 70669c590..cdc07c85d 100644 > --- a/tests/core_hotunplug.c > +++ b/tests/core_hotunplug.c > @@ -308,7 +308,7 @@ static void node_healthcheck(struct hotunplug > *priv, unsigned flags) > priv->failure = "Unrecoverable test failure"; > if (local_i915_healthcheck(fd_drm, "") && > (!(flags & FLAG_RECOVER) || > local_i915_recover(fd_drm))) > - priv->failure = "Healthcheck failure!"; > + priv->failure = "GPU healthcheck failure!"; > else > priv->failure = NULL; > > @@ -317,6 +317,16 @@ static void node_healthcheck(struct hotunplug > *priv, unsigned flags) > priv->failure = NULL; > } > > + if (!priv->failure) { > + char path[200]; > + > + priv->failure = "Device sysfs healthckeck failure!"; > + local_debug("%s\n", "running device sysfs > healthcheck"); > + igt_assert(igt_sysfs_path(fd_drm, path, sizeof(path))); > + igt_assert(igt_debugfs_path(fd_drm, path, > sizeof(path))); > + priv->failure = NULL; > + } > + LGTM, Reviewed-by: Marcin Bernatowicz <marcin.bernatowicz@xxxxxxxxxxxxxxx> > fd_drm = close_device(fd_drm, "", "health checked "); > if (closed || fd_drm < -1) /* update status for > post_healthcheck */ > priv->fd.drm_hc = fd_drm; > @@ -437,7 +447,7 @@ static void hotunplug_rescan(struct hotunplug > *priv) > healthcheck(priv, false); > } > > -static void hotrebind_lateclose(struct hotunplug *priv) > +static void hotrebind(struct hotunplug *priv) > { > igt_assert_eq(priv->fd.drm, -1); > igt_assert_eq(priv->fd.drm_hc, -1); > @@ -448,6 +458,30 @@ static void hotrebind_lateclose(struct hotunplug > *priv) > driver_bind(priv, 0); > > healthcheck(priv, false); > +} > + > +static void hotreplug(struct hotunplug *priv) > +{ > + igt_assert_eq(priv->fd.drm, -1); > + igt_assert_eq(priv->fd.drm_hc, -1); > + priv->fd.drm = local_drm_open_driver(false, "", " for hot > replug"); > + > + device_unplug(priv, "hot ", 60); > + > + bus_rescan(priv, 0); > + > + healthcheck(priv, false); > +} > + > +static void hotrebind_lateclose(struct hotunplug *priv) > +{ > + igt_assert_eq(priv->fd.drm, -1); > + igt_assert_eq(priv->fd.drm_hc, -1); > + priv->fd.drm = local_drm_open_driver(false, "", " for hot > rebind"); > + > + driver_unbind(priv, "hot ", 60); > + > + driver_bind(priv, 0); > > priv->fd.drm = close_device(priv->fd.drm, "late ", "unbound "); > igt_assert_eq(priv->fd.drm, -1); > @@ -465,8 +499,6 @@ static void hotreplug_lateclose(struct hotunplug > *priv) > > bus_rescan(priv, 0); > > - healthcheck(priv, false); > - > priv->fd.drm = close_device(priv->fd.drm, "late ", "removed "); > igt_assert_eq(priv->fd.drm, -1); > > @@ -570,7 +602,31 @@ igt_main > post_healthcheck(&priv); > > igt_subtest_group { > - igt_describe("Check if the driver hot unbound from a > still open device can be cleanly rebound, then the old instance > released"); > + igt_describe("Check if the driver can be cleanly > rebound to a device with a still open hot unbound driver instance"); > + igt_subtest("hotrebind") > + hotrebind(&priv); > + > + igt_fixture > + recover(&priv); > + } > + > + igt_fixture > + post_healthcheck(&priv); > + > + igt_subtest_group { > + igt_describe("Check if a hot unplugged and still open > device can be cleanly restored"); > + igt_subtest("hotreplug") > + hotreplug(&priv); > + > + igt_fixture > + recover(&priv); > + } > + > + igt_fixture > + post_healthcheck(&priv); > + > + igt_subtest_group { > + igt_describe("Check if a hot unbound driver instance > still open after hot rebind can be cleanly released"); > igt_subtest("hotrebind-lateclose") > hotrebind_lateclose(&priv); > > @@ -582,7 +638,7 @@ igt_main > post_healthcheck(&priv); > > igt_subtest_group { > - igt_describe("Check if a still open while hot unplugged > device can be cleanly restored, then the old instance released"); > + igt_describe("Check if an instance of a still open > while hot replugged device can be cleanly released"); > igt_subtest("hotreplug-lateclose") > hotreplug_lateclose(&priv); > _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx