On Thu, Oct 20, 2022 at 02:08:41PM +0100, Tvrtko Ursulin wrote: > From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > > Since a7c01fa93aeb ("signal: break out of wait loops on kthread_stop()") > kthread_stop() started asserting a pending signal which wreaks havoc with > a few of our selftests. Mainly because they are not fully expecting to > handle signals, but also cutting the intended test runtimes short due > signal_pending() now returning true (via __igt_timeout), which therefore > breaks both the patterns of: > > kthread_run() > ..sleep for igt_timeout_ms to allow test to exercise stuff.. > kthread_stop() > > And check for errors recorded in the thread. > > And also: > > Main thread | Test thread > ---------------+------------------------------ > kthread_run() | > kthread_stop() | do stuff until __igt_timeout > | -- exits early due signal -- > > Where this kthread_stop() was assume would have a "join" semantics, which > it would have had if not the new signal assertion issue. > > To recap, threads are now likely to catch a previously impossible > ERESTARTSYS or EINTR, marking the test as failed, or have a pointlessly > short run time. > > To work around this start using kthread_work(er) API which provides > an explicit way of waiting for threads to exit. And for cases where > parent controls the test duration we add explicit signaling which threads > will now use instead of relying on kthread_should_stop(). > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Cc: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> > --- > .../drm/i915/gem/selftests/i915_gem_context.c | 118 ++++---- > drivers/gpu/drm/i915/gt/selftest_execlists.c | 48 ++-- > drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 51 ++-- > drivers/gpu/drm/i915/selftests/i915_request.c | 252 +++++++++++------- > 4 files changed, 281 insertions(+), 188 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c > index c6ad67b90e8a..d8864444432b 100644 > --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c > @@ -179,97 +179,108 @@ static int live_nop_switch(void *arg) > } > > struct parallel_switch { > - struct task_struct *tsk; > + struct kthread_worker *worker; > + struct kthread_work work; > struct intel_context *ce[2]; > + int result; > }; > > -static int __live_parallel_switch1(void *data) > +static void __live_parallel_switch1(struct kthread_work *work) > { > - struct parallel_switch *arg = data; > + struct parallel_switch *arg = > + container_of(work, typeof(*arg), work); > IGT_TIMEOUT(end_time); > unsigned long count; > > count = 0; > + arg->result = 0; > do { > struct i915_request *rq = NULL; > - int err, n; > + int n; > > - err = 0; > - for (n = 0; !err && n < ARRAY_SIZE(arg->ce); n++) { > + for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) { > struct i915_request *prev = rq; > > rq = i915_request_create(arg->ce[n]); > if (IS_ERR(rq)) { > i915_request_put(prev); > - return PTR_ERR(rq); > + arg->result = PTR_ERR(rq); > + break; > } > > i915_request_get(rq); > if (prev) { > - err = i915_request_await_dma_fence(rq, &prev->fence); > + arg->result = > + i915_request_await_dma_fence(rq, > + &prev->fence); > i915_request_put(prev); > } > > i915_request_add(rq); > } > + > + if (IS_ERR_OR_NULL(rq)) > + break; > + > if (i915_request_wait(rq, 0, HZ) < 0) > - err = -ETIME; > + arg->result = -ETIME; > + > i915_request_put(rq); > - if (err) > - return err; > > count++; > - } while (!__igt_timeout(end_time, NULL)); > + } while (!arg->result && !__igt_timeout(end_time, NULL)); > > - pr_info("%s: %lu switches (sync)\n", arg->ce[0]->engine->name, count); > - return 0; > + pr_info("%s: %lu switches (sync) <%d>\n", > + arg->ce[0]->engine->name, count, arg->result); > } > > -static int __live_parallel_switchN(void *data) > +static void __live_parallel_switchN(struct kthread_work *work) > { > - struct parallel_switch *arg = data; > + struct parallel_switch *arg = > + container_of(work, typeof(*arg), work); > struct i915_request *rq = NULL; > IGT_TIMEOUT(end_time); > unsigned long count; > int n; > > count = 0; > + arg->result = 0; > do { > - for (n = 0; n < ARRAY_SIZE(arg->ce); n++) { > + for (n = 0; !arg->result && n < ARRAY_SIZE(arg->ce); n++) { > struct i915_request *prev = rq; > - int err = 0; > > rq = i915_request_create(arg->ce[n]); > if (IS_ERR(rq)) { > i915_request_put(prev); > - return PTR_ERR(rq); > + arg->result = PTR_ERR(rq); > + break; > } > > i915_request_get(rq); > if (prev) { > - err = i915_request_await_dma_fence(rq, &prev->fence); > + arg->result = > + i915_request_await_dma_fence(rq, > + &prev->fence); > i915_request_put(prev); > } > > i915_request_add(rq); > - if (err) { > - i915_request_put(rq); > - return err; > - } > } > > count++; > - } while (!__igt_timeout(end_time, NULL)); > - i915_request_put(rq); > + } while (!arg->result && !__igt_timeout(end_time, NULL)); > > - pr_info("%s: %lu switches (many)\n", arg->ce[0]->engine->name, count); > - return 0; > + if (!IS_ERR_OR_NULL(rq)) > + i915_request_put(rq); > + > + pr_info("%s: %lu switches (many) <%d>\n", > + arg->ce[0]->engine->name, count, arg->result); > } > > static int live_parallel_switch(void *arg) > { > struct drm_i915_private *i915 = arg; > - static int (* const func[])(void *arg) = { > + static void (* const func[])(struct kthread_work *) = { > __live_parallel_switch1, > __live_parallel_switchN, > NULL, > @@ -277,7 +288,7 @@ static int live_parallel_switch(void *arg) > struct parallel_switch *data = NULL; > struct i915_gem_engines *engines; > struct i915_gem_engines_iter it; > - int (* const *fn)(void *arg); > + void (* const *fn)(struct kthread_work *); > struct i915_gem_context *ctx; > struct intel_context *ce; > struct file *file; > @@ -348,9 +359,22 @@ static int live_parallel_switch(void *arg) > } > } > > + for (n = 0; n < count; n++) { > + struct kthread_worker *worker; > + > + if (!data[n].ce[0]) > + continue; > + > + worker = kthread_create_worker(0, "igt/parallel:%s", > + data[n].ce[0]->engine->name); > + if (IS_ERR(worker)) > + goto out; > + > + data[n].worker = worker; > + } > + > for (fn = func; !err && *fn; fn++) { > struct igt_live_test t; > - int n; > > err = igt_live_test_begin(&t, i915, __func__, ""); > if (err) > @@ -360,30 +384,17 @@ static int live_parallel_switch(void *arg) > if (!data[n].ce[0]) > continue; > > - data[n].tsk = kthread_run(*fn, &data[n], > - "igt/parallel:%s", > - data[n].ce[0]->engine->name); > - if (IS_ERR(data[n].tsk)) { > - err = PTR_ERR(data[n].tsk); > - break; > - } > - get_task_struct(data[n].tsk); > + data[n].result = 0; > + kthread_init_work(&data[n].work, *fn); > + kthread_queue_work(data[n].worker, &data[n].work); > } > > - yield(); /* start all threads before we kthread_stop() */ > - > for (n = 0; n < count; n++) { > - int status; > - > - if (IS_ERR_OR_NULL(data[n].tsk)) > - continue; > - > - status = kthread_stop(data[n].tsk); > - if (status && !err) > - err = status; > - > - put_task_struct(data[n].tsk); > - data[n].tsk = NULL; > + if (data[n].ce[0]) { > + kthread_flush_work(&data[n].work); > + if (data[n].result && !err) > + err = data[n].result; > + } > } > > if (igt_live_test_end(&t)) > @@ -399,6 +410,9 @@ static int live_parallel_switch(void *arg) > intel_context_unpin(data[n].ce[m]); > intel_context_put(data[n].ce[m]); > } > + > + if (data[n].worker) > + kthread_destroy_worker(data[n].worker); > } > kfree(data); > out_file: > diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c > index 56b7d5b5fea0..2c7c053a8808 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c > +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c > @@ -3473,12 +3473,14 @@ static int random_priority(struct rnd_state *rnd) > > struct preempt_smoke { > struct intel_gt *gt; > + struct kthread_work work; > struct i915_gem_context **contexts; > struct intel_engine_cs *engine; > struct drm_i915_gem_object *batch; > unsigned int ncontext; > struct rnd_state prng; > unsigned long count; > + int result; > }; > > static struct i915_gem_context *smoke_context(struct preempt_smoke *smoke) > @@ -3538,34 +3540,31 @@ static int smoke_submit(struct preempt_smoke *smoke, > return err; > } > > -static int smoke_crescendo_thread(void *arg) > +static void smoke_crescendo_work(struct kthread_work *work) > { > - struct preempt_smoke *smoke = arg; > + struct preempt_smoke *smoke = container_of(work, typeof(*smoke), work); > IGT_TIMEOUT(end_time); > unsigned long count; > > count = 0; > do { > struct i915_gem_context *ctx = smoke_context(smoke); > - int err; > > - err = smoke_submit(smoke, > - ctx, count % I915_PRIORITY_MAX, > - smoke->batch); > - if (err) > - return err; > + smoke->result = smoke_submit(smoke, ctx, > + count % I915_PRIORITY_MAX, > + smoke->batch); > > count++; > - } while (count < smoke->ncontext && !__igt_timeout(end_time, NULL)); > + } while (!smoke->result && count < smoke->ncontext && > + !__igt_timeout(end_time, NULL)); > > smoke->count = count; > - return 0; > } > > static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) > #define BATCH BIT(0) > { > - struct task_struct *tsk[I915_NUM_ENGINES] = {}; > + struct kthread_worker *worker[I915_NUM_ENGINES] = {}; > struct preempt_smoke *arg; > struct intel_engine_cs *engine; > enum intel_engine_id id; > @@ -3576,6 +3575,8 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) > if (!arg) > return -ENOMEM; > > + memset(arg, 0, I915_NUM_ENGINES * sizeof(*arg)); kcalloc()? > + > for_each_engine(engine, smoke->gt, id) { > arg[id] = *smoke; > arg[id].engine = engine; > @@ -3583,31 +3584,28 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) > arg[id].batch = NULL; > arg[id].count = 0; > > - tsk[id] = kthread_run(smoke_crescendo_thread, arg, > - "igt/smoke:%d", id); > - if (IS_ERR(tsk[id])) { > - err = PTR_ERR(tsk[id]); > + worker[id] = kthread_create_worker(0, "igt/smoke:%d", id); > + if (IS_ERR(worker[id])) { > + err = PTR_ERR(worker[id]); > break; > } > - get_task_struct(tsk[id]); > - } > > - yield(); /* start all threads before we kthread_stop() */ > + kthread_init_work(&arg[id].work, smoke_crescendo_work); > + kthread_queue_work(worker[id], &arg[id].work); > + } > > count = 0; > for_each_engine(engine, smoke->gt, id) { > - int status; > - > - if (IS_ERR_OR_NULL(tsk[id])) > + if (IS_ERR_OR_NULL(worker[id])) > continue; > > - status = kthread_stop(tsk[id]); > - if (status && !err) > - err = status; > + kthread_flush_work(&arg[id].work); > + if (arg[id].result && !err) > + err = arg[id].result; > > count += arg[id].count; > > - put_task_struct(tsk[id]); > + kthread_destroy_worker(worker[id]); > } > > pr_info("Submitted %lu crescendo:%x requests across %d engines and %d contexts\n", > diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > index 7f3bb1d34dfb..71263058a7b0 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c > @@ -866,10 +866,13 @@ static int igt_reset_active_engine(void *arg) > } > > struct active_engine { > - struct task_struct *task; > + struct kthread_worker *worker; > + struct kthread_work work; > struct intel_engine_cs *engine; > unsigned long resets; > unsigned int flags; > + bool stop; > + int result; > }; > > #define TEST_ACTIVE BIT(0) > @@ -900,10 +903,10 @@ static int active_request_put(struct i915_request *rq) > return err; > } > > -static int active_engine(void *data) > +static void active_engine(struct kthread_work *work) > { > I915_RND_STATE(prng); > - struct active_engine *arg = data; > + struct active_engine *arg = container_of(work, typeof(*arg), work); > struct intel_engine_cs *engine = arg->engine; > struct i915_request *rq[8] = {}; > struct intel_context *ce[ARRAY_SIZE(rq)]; > @@ -913,16 +916,17 @@ static int active_engine(void *data) > for (count = 0; count < ARRAY_SIZE(ce); count++) { > ce[count] = intel_context_create(engine); > if (IS_ERR(ce[count])) { > - err = PTR_ERR(ce[count]); > - pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err); > + arg->result = PTR_ERR(ce[count]); > + pr_err("[%s] Create context #%ld failed: %d!\n", > + engine->name, count, arg->result); > while (--count) > intel_context_put(ce[count]); > - return err; > + return; > } > } > > count = 0; > - while (!kthread_should_stop()) { > + while (!READ_ONCE(arg->stop)) { > unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); > struct i915_request *old = rq[idx]; > struct i915_request *new; > @@ -967,7 +971,7 @@ static int active_engine(void *data) > intel_context_put(ce[count]); > } > > - return err; > + arg->result = err; > } > > static int __igt_reset_engines(struct intel_gt *gt, > @@ -1022,7 +1026,7 @@ static int __igt_reset_engines(struct intel_gt *gt, > > memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES); > for_each_engine(other, gt, tmp) { > - struct task_struct *tsk; > + struct kthread_worker *worker; > > threads[tmp].resets = > i915_reset_engine_count(global, other); > @@ -1036,19 +1040,21 @@ static int __igt_reset_engines(struct intel_gt *gt, > threads[tmp].engine = other; > threads[tmp].flags = flags; > > - tsk = kthread_run(active_engine, &threads[tmp], > - "igt/%s", other->name); > - if (IS_ERR(tsk)) { > - err = PTR_ERR(tsk); > - pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err); > + worker = kthread_create_worker(0, "igt/%s", > + other->name); > + if (IS_ERR(worker)) { > + err = PTR_ERR(worker); > + pr_err("[%s] Worker create failed: %d!\n", > + engine->name, err); > goto unwind; > } > > - threads[tmp].task = tsk; > - get_task_struct(tsk); > - } > + threads[tmp].worker = worker; > > - yield(); /* start all threads before we begin */ > + kthread_init_work(&threads[tmp].work, active_engine); > + kthread_queue_work(threads[tmp].worker, > + &threads[tmp].work); > + } > > st_engine_heartbeat_disable_no_pm(engine); > GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id, > @@ -1197,17 +1203,20 @@ static int __igt_reset_engines(struct intel_gt *gt, > for_each_engine(other, gt, tmp) { > int ret; > > - if (!threads[tmp].task) > + if (!threads[tmp].worker) > continue; > > - ret = kthread_stop(threads[tmp].task); > + WRITE_ONCE(threads[tmp].stop, true); > + kthread_flush_work(&threads[tmp].work); Could perhaps optimize things a bit flagging all the threads to stop before doing any flush_work()s. And maybe also do all the init stuff before queuing any works. But dunno if that's worth the hassle. Looks reasonable enough to me. Reviewed-by: Ville Syrjälä <ville.syrjala@xxxxxxxxxxxxxxx> -- Ville Syrjälä Intel