On 16/05/2023 12:30, Oded Gabbay wrote: > If a workload got stuck, we print an error to the kernel log about it. > Add to that print the configured max timeout value, as that value is > not fixed between ASICs and in addition it can be configured using > a kernel module parameter. > > Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx> > --- > .../habanalabs/common/command_submission.c | 26 +++++++++++-------- > 1 file changed, 15 insertions(+), 11 deletions(-) > > diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c > index ccf68f482948..4ec28af3ed78 100644 > --- a/drivers/accel/habanalabs/common/command_submission.c > +++ b/drivers/accel/habanalabs/common/command_submission.c > @@ -804,12 +804,14 @@ static void cs_do_release(struct kref *ref) > > static void cs_timedout(struct work_struct *work) > { > + struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); > + bool skip_reset_on_timeout, device_reset = false; > struct hl_device *hdev; > u64 event_mask = 0x0; > + uint timeout_sec; > int rc; > - struct hl_cs *cs = container_of(work, struct hl_cs, > - work_tdr.work); > - bool skip_reset_on_timeout = cs->skip_reset_on_timeout, device_reset = false; > + > + skip_reset_on_timeout = cs->skip_reset_on_timeout; > > rc = cs_get_unless_zero(cs); > if (!rc) > @@ -840,29 +842,31 @@ static void cs_timedout(struct work_struct *work) > event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; > } > > + timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000; > + > switch (cs->type) { > case CS_TYPE_SIGNAL: > dev_err(hdev->dev, > - "Signal command submission %llu has not finished in time!\n", > - cs->sequence); > + "Signal command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > > case CS_TYPE_WAIT: > dev_err(hdev->dev, > - "Wait command submission %llu has not finished in time!\n", > - cs->sequence); > + "Wait command submission %llu has not finished in %u 
seconds!\n", > + cs->sequence, timeout_sec); > break; > > case CS_TYPE_COLLECTIVE_WAIT: > dev_err(hdev->dev, > - "Collective Wait command submission %llu has not finished in time!\n", > - cs->sequence); > + "Collective Wait command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > > default: > dev_err(hdev->dev, > - "Command submission %llu has not finished in time!\n", > - cs->sequence); > + "Command submission %llu has not finished in %u seconds!\n", > + cs->sequence, timeout_sec); > break; > } > Reviewed-by: Ofir Bitton <obitton@xxxxxxxxx>