Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.

"Grodzovsky, Andrey" <Andrey.Grodzovsky@xxxxxxx> · Tue, 23 Apr 2019 14:12:34 +0000

On 4/23/19 8:32 AM, Koenig, Christian wrote:

> Well you at least have to give me time till after the holidays to get
> going again :)
>
> Not sure exactly yet why we need patch number 5.

Probably you missed the mail where I pointed out a bug I found during
testing - I am reattaching the mail and the KASAN dump.

Andrey


>
> And we should probably commit patch #1 and #2.
>
> Christian.
>
> Am 22.04.19 um 13:54 schrieb Grodzovsky, Andrey:
>> Ping for patches 3, new patch 5 and patch 6.
>>
>> Andrey
>>
>> On 4/18/19 11:00 AM, Andrey Grodzovsky wrote:
>>> Also reject TDRs if another one is already running.
>>>
>>> v2:
>>> Stop all schedulers across device and entire XGMI hive before
>>> force signaling HW fences.
>>> Avoid passing job_signaled to helper functions to keep all the decision
>>> making about skipping HW reset in one place.
>>>
>>> v3:
>>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>>> against its decrement in drm_sched_stop in non HW reset case.
>>> v4: rebase
>>> v5: Revert v3 as we do it now in scheduler code.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>>> ---
>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 +++++++++++++++++++----------
>>>     1 file changed, 95 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index a0e165c..85f8792 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3334,8 +3334,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     		if (!ring || !ring->sched.thread)
>>>     			continue;
>>>     
>>> -		drm_sched_stop(&ring->sched, &job->base);
>>> -
>>>     		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
>>>     		amdgpu_fence_driver_force_completion(ring);
>>>     	}
>>> @@ -3343,6 +3341,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>>     	if(job)
>>>     		drm_sched_increase_karma(&job->base);
>>>     
>>> +	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
>>>     	if (!amdgpu_sriov_vf(adev)) {
>>>     
>>>     		if (!need_full_reset)
>>> @@ -3480,37 +3479,21 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
>>>     	return r;
>>>     }
>>>     
>>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>> +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
>>>     {
>>> -	int i;
>>> -
>>> -	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> -		struct amdgpu_ring *ring = adev->rings[i];
>>> -
>>> -		if (!ring || !ring->sched.thread)
>>> -			continue;
>>> -
>>> -		if (!adev->asic_reset_res)
>>> -			drm_sched_resubmit_jobs(&ring->sched);
>>> +	if (trylock) {
>>> +		if (!mutex_trylock(&adev->lock_reset))
>>> +			return false;
>>> +	} else
>>> +		mutex_lock(&adev->lock_reset);
>>>     
>>> -		drm_sched_start(&ring->sched, !adev->asic_reset_res);
>>> -	}
>>> -
>>> -	if (!amdgpu_device_has_dc_support(adev)) {
>>> -		drm_helper_resume_force_mode(adev->ddev);
>>> -	}
>>> -
>>> -	adev->asic_reset_res = 0;
>>> -}
>>> -
>>> -static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
>>> -{
>>> -	mutex_lock(&adev->lock_reset);
>>>     	atomic_inc(&adev->gpu_reset_counter);
>>>     	adev->in_gpu_reset = 1;
>>>     	/* Block kfd: SRIOV would do it separately */
>>>     	if (!amdgpu_sriov_vf(adev))
>>>                     amdgpu_amdkfd_pre_reset(adev);
>>> +
>>> +	return true;
>>>     }
>>>     
>>>     static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> @@ -3538,40 +3521,42 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>>     int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			      struct amdgpu_job *job)
>>>     {
>>> -	int r;
>>> +	struct list_head device_list, *device_list_handle =  NULL;
>>> +	bool need_full_reset, job_signaled;
>>>     	struct amdgpu_hive_info *hive = NULL;
>>> -	bool need_full_reset = false;
>>>     	struct amdgpu_device *tmp_adev = NULL;
>>> -	struct list_head device_list, *device_list_handle =  NULL;
>>> +	int i, r = 0;
>>>     
>>> +	need_full_reset = job_signaled = false;
>>>     	INIT_LIST_HEAD(&device_list);
>>>     
>>>     	dev_info(adev->dev, "GPU reset begin!\n");
>>>     
>>> +	hive = amdgpu_get_xgmi_hive(adev, false);
>>> +
>>>     	/*
>>> -	 * In case of XGMI hive disallow concurrent resets to be triggered
>>> -	 * by different nodes. No point also since the one node already executing
>>> -	 * reset will also reset all the other nodes in the hive.
>>> +	 * Here we trylock to avoid chain of resets executing from
>>> +	 * either trigger by jobs on different adevs in XGMI hive or jobs on
>>> +	 * different schedulers for same device while this TO handler is running.
>>> +	 * We always reset all schedulers for device and all devices for XGMI
>>> +	 * hive so that should take care of them too.
>>>     	 */
>>> -	hive = amdgpu_get_xgmi_hive(adev, 0);
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
>>> -	    !mutex_trylock(&hive->reset_lock))
>>> +
>>> +	if (hive && !mutex_trylock(&hive->reset_lock)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
>>> +			 job->base.id, hive->hive_id);
>>>     		return 0;
>>> +	}
>>>     
>>>     	/* Start with adev pre asic reset first for soft reset check.*/
>>> -	amdgpu_device_lock_adev(adev);
>>> -	r = amdgpu_device_pre_asic_reset(adev,
>>> -					 job,
>>> -					 &need_full_reset);
>>> -	if (r) {
>>> -		/*TODO Should we stop ?*/
>>> -		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> -			  r, adev->ddev->unique);
>>> -		adev->asic_reset_res = r;
>>> +	if (!amdgpu_device_lock_adev(adev, !hive)) {
>>> +		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
>>> +					 job->base.id);
>>> +		return 0;
>>>     	}
>>>     
>>>     	/* Build list of devices to reset */
>>> -	if  (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
>>> +	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>>>     		if (!hive) {
>>>     			amdgpu_device_unlock_adev(adev);
>>>     			return -ENODEV;
>>> @@ -3588,13 +3573,56 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		device_list_handle = &device_list;
>>>     	}
>>>     
>>> +	/* block all schedulers and reset given job's ring */
>>> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			drm_sched_stop(&ring->sched, &job->base);
>>> +		}
>>> +	}
>>> +
>>> +
>>> +	/*
>>> +	 * Must check guilty signal here since after this point all old
>>> +	 * HW fences are force signaled.
>>> +	 *
>>> +	 * job->base holds a reference to parent fence
>>> +	 */
>>> +	if (job && job->base.s_fence->parent &&
>>> +	    dma_fence_is_signaled(job->base.s_fence->parent))
>>> +		job_signaled = true;
>>> +
>>> +	if (!amdgpu_device_ip_need_full_reset(adev))
>>> +		device_list_handle = &device_list;
>>> +
>>> +	if (job_signaled) {
>>> +		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>> +		goto skip_hw_reset;
>>> +	}
>>> +
>>> +
>>> +	/* Guilty job will be freed after this*/
>>> +	r = amdgpu_device_pre_asic_reset(adev,
>>> +					 job,
>>> +					 &need_full_reset);
>>> +	if (r) {
>>> +		/*TODO Should we stop ?*/
>>> +		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
>>> +			  r, adev->ddev->unique);
>>> +		adev->asic_reset_res = r;
>>> +	}
>>> +
>>>     retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>>     
>>>     		if (tmp_adev == adev)
>>>     			continue;
>>>     
>>> -		amdgpu_device_lock_adev(tmp_adev);
>>> +		amdgpu_device_lock_adev(tmp_adev, false);
>>>     		r = amdgpu_device_pre_asic_reset(tmp_adev,
>>>     						 NULL,
>>>     						 &need_full_reset);
>>> @@ -3618,9 +3646,28 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     			goto retry;
>>>     	}
>>>     
>>> +skip_hw_reset:
>>> +
>>>     	/* Post ASIC reset for all devs .*/
>>>     	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>>> -		amdgpu_device_post_asic_reset(tmp_adev);
>>> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>> +			struct amdgpu_ring *ring = tmp_adev->rings[i];
>>> +
>>> +			if (!ring || !ring->sched.thread)
>>> +				continue;
>>> +
>>> +			/* No point to resubmit jobs if we didn't HW reset*/
>>> +			if (!tmp_adev->asic_reset_res && !job_signaled)
>>> +				drm_sched_resubmit_jobs(&ring->sched);
>>> +
>>> +			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
>>> +		}
>>> +
>>> +		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
>>> +			drm_helper_resume_force_mode(tmp_adev->ddev);
>>> +		}
>>> +
>>> +		tmp_adev->asic_reset_res = 0;
>>>     
>>>     		if (r) {
>>>     			/* bad news, how to tell it to userspace ? */
>>> @@ -3633,7 +3680,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>>     		amdgpu_device_unlock_adev(tmp_adev);
>>>     	}
>>>     
>>> -	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
>>> +	if (hive)
>>>     		mutex_unlock(&hive->reset_lock);
>>>     
>>>     	if (r)
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
  121.189757 <    0.000171>] amdgpu 0000:01:00.0: GPU reset(5) succeeded!
passed[  121.189894 <    0.000137>] ==================================================================


[  121.189951 <    0.000057>] BUG: KASAN: use-after-free in drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
Run Summary:    Type  Total    Ran Passed Failed Inactive
              suites      8      0    n/a      0        0
               tests     39      1      1      0        0
             asserts      8      8      8      0      n/a

Elapsed time =    0.001 seconds[  121.189956 <    0.000005>] Read of size 8 at addr ffff88840389a8b0 by task kworker/2:2/1140


[  121.189969 <    0.000013>] CPU: 2 PID: 1140 Comm: kworker/2:2 Tainted: G           OE     5.1.0-rc2-misc+ #1
[  121.189972 <    0.000003>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[  121.189977 <    0.000005>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[  121.189980 <    0.000003>] Call Trace:
[  121.189985 <    0.000005>]  dump_stack+0x9b/0xf5
[  121.189992 <    0.000007>]  print_address_description+0x70/0x290
[  121.189997 <    0.000005>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190002 <    0.000005>]  kasan_report+0x134/0x191
[  121.190006 <    0.000004>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190014 <    0.000008>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190019 <    0.000005>]  __asan_load8+0x54/0x90
[  121.190024 <    0.000005>]  drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190034 <    0.000010>]  process_one_work+0x466/0xb00
[  121.190046 <    0.000012>]  ? queue_work_node+0x180/0x180
[  121.190061 <    0.000015>]  worker_thread+0x83/0x6c0
[  121.190075 <    0.000014>]  kthread+0x1a9/0x1f0
[  121.190079 <    0.000004>]  ? rescuer_thread+0x760/0x760
[  121.190081 <    0.000002>]  ? kthread_cancel_delayed_work_sync+0x20/0x20
[  121.190088 <    0.000007>]  ret_from_fork+0x3a/0x50

[  121.190105 <    0.000017>] Allocated by task 1421:
[  121.190110 <    0.000005>]  save_stack+0x46/0xd0
[  121.190112 <    0.000002>]  __kasan_kmalloc+0xab/0xe0
[  121.190115 <    0.000003>]  kasan_kmalloc+0xf/0x20
[  121.190117 <    0.000002>]  __kmalloc+0x167/0x390
[  121.190210 <    0.000093>]  amdgpu_job_alloc+0x47/0x170 [amdgpu]
[  121.190289 <    0.000079>]  amdgpu_cs_ioctl+0x9bd/0x2e70 [amdgpu]
[  121.190312 <    0.000023>]  drm_ioctl_kernel+0x17e/0x1d0 [drm]
[  121.190334 <    0.000022>]  drm_ioctl+0x5e1/0x640 [drm]
[  121.190409 <    0.000075>]  amdgpu_drm_ioctl+0x78/0xd0 [amdgpu]
[  121.190413 <    0.000004>]  do_vfs_ioctl+0x152/0xa30
[  121.190415 <    0.000002>]  ksys_ioctl+0x6d/0x80
[  121.190418 <    0.000003>]  __x64_sys_ioctl+0x43/0x50
[  121.190425 <    0.000007>]  do_syscall_64+0x7d/0x240
[  121.190430 <    0.000005>]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  121.190440 <    0.000010>] Freed by task 1242:
[  121.190448 <    0.000008>]  save_stack+0x46/0xd0
[  121.190453 <    0.000005>]  __kasan_slab_free+0x13c/0x1a0
[  121.190458 <    0.000005>]  kasan_slab_free+0xe/0x10
[  121.190462 <    0.000004>]  kfree+0xfa/0x2e0
[  121.190584 <    0.000122>]  amdgpu_job_free_cb+0x7f/0x90 [amdgpu]
[  121.190589 <    0.000005>]  drm_sched_cleanup_jobs.part.10+0xcf/0x1a0 [gpu_sched]
[  121.190594 <    0.000005>]  drm_sched_main+0x38a/0x430 [gpu_sched]
[  121.190596 <    0.000002>]  kthread+0x1a9/0x1f0
[  121.190599 <    0.000003>]  ret_from_fork+0x3a/0x50


--- Begin Message ---

Subject: Re: [PATCH v3 1/5] drm/scheduler: rework job destruction
From: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
Date: Wed, 17 Apr 2019 17:06:31 -0400
Cc: "Kazlauskas, Nicholas" <Nicholas.Kazlauskas@xxxxxxx>
In-reply-to: <8344ae6b-03a8-0a9b-aa02-66b59b580f37@amd.com>
References: <1555357403-30813-1-git-send-email-andrey.grodzovsky@amd.com> <878swb6k9a.fsf@anholt.net> <b62ea5d8-1dbe-7e67-7e62-1f452ed860c2@gmail.com> <84200035-bb50-310d-7d48-20dbe072621c@amd.com> <4d7c99ae-eec4-f6af-6865-7c844475078c@amd.com> <e8255594-6a3f-ba83-e6ff-d6aae4e9e9ba@amd.com> <93c03b16-b01f-d17e-49d3-4cce515052aa@amd.com> <8344ae6b-03a8-0a9b-aa02-66b59b580f37@amd.com>
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.4.0


On 4/16/19 12:00 PM, Koenig, Christian wrote:
Am 16.04.19 um 17:42 schrieb Grodzovsky, Andrey:
On 4/16/19 10:58 AM, Grodzovsky, Andrey wrote:
On 4/16/19 10:43 AM, Koenig, Christian wrote:
Am 16.04.19 um 16:36 schrieb Grodzovsky, Andrey:
On 4/16/19 5:47 AM, Christian König wrote:
Am 15.04.19 um 23:17 schrieb Eric Anholt:
Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> writes:

From: Christian König <christian.koenig@xxxxxxx>

We now destroy finished jobs from the worker thread to make sure that
we never destroy a job currently in timeout processing.
By this we avoid holding lock around ring mirror list in drm_sched_stop
which should solve a deadlock reported by a user.

v2: Remove unused variable.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692

Signed-off-by: Christian König <christian.koenig@xxxxxxx>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
       drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  17 ++--
       drivers/gpu/drm/etnaviv/etnaviv_dump.c     |   4 -
       drivers/gpu/drm/etnaviv/etnaviv_sched.c    |   9 +-
       drivers/gpu/drm/scheduler/sched_main.c     | 138
+++++++++++++++++------------
       drivers/gpu/drm/v3d/v3d_sched.c            |   9 +-
Missing corresponding panfrost and lima updates.  You should probably
pull in drm-misc for hacking on the scheduler.

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
b/drivers/gpu/drm/v3d/v3d_sched.c
index ce7c737b..8efb091 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -232,11 +232,18 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d,
struct drm_sched_job *sched_job)
             /* block scheduler */
           for (q = 0; q < V3D_MAX_QUEUES; q++)
-        drm_sched_stop(&v3d->queue[q].sched);
+        drm_sched_stop(&v3d->queue[q].sched, sched_job);
             if(sched_job)
               drm_sched_increase_karma(sched_job);
       +    /*
+     * Guilty job did complete and hence needs to be manually removed
+     * See drm_sched_stop doc.
+     */
+    if (list_empty(&sched_job->node))
+        sched_job->sched->ops->free_job(sched_job);
If the if (sched_job) is necessary up above, then this should clearly be
under it.

But, can we please have a core scheduler thing we call here instead of
drivers all replicating it?
Yeah that's also something I noted before.

Essential problem is that we remove finished jobs from the mirror list
and so need to destruct them because we otherwise leak them.

Alternative approach here would be to keep the jobs on the ring mirror
list, but not submit them again.

Regards,
Christian.
I really prefer to avoid this, it means adding extra flag to sched_job
to check in each iteration of the ring mirror list.
Mhm, why actually? We just need to check if the scheduler fence is signaled.
OK, I see it's equivalent, but this is still an extra check for all the
iterations.

What about changing
signature of drm_sched_backend_ops.timedout_job to return drm_sched_job*
instead of void, this way we can return the guilty job back from the
driver specific handler to the generic drm_sched_job_timedout and
release it there.
Well the timeout handler already has the job, so returning it doesn't
make much sense.

The problem is rather that the timeout handler doesn't know if it should
destroy the job or not.
But the driver specific handler does, and actually returning back either
the pointer to the job or null will give an indication of that. We can
even return bool.

Andrey
Thinking a bit more about this - the way this check is done now "if
(list_empty(&sched_job->node)) then free the sched_job" actually makes
it possible to just move this as is from driver specific callbacks into
drm_sched_job_timedout without any other changes.
Oh, well that sounds like a good idea off hand.

Need to see the final code, but at least the best idea so far.

Christian.

Unfortunately, it looks like it's not that good an idea in the end - take a look
at the attached KASAN print: the sched thread's cleanup function races
against the TDR handler and removes the guilty job from the mirror list, and we
have no way of differentiating whether the job was removed from within the
TDR handler or from the sched. thread's clean-up function. So it looks like
we either need to 'keep the jobs on the ring mirror list, but not submit
them again' as you suggested before, or add a flag to sched_job to hint
to drm_sched_job_timedout that the guilty job requires manual removal. Your
suggestion implies we will need an extra check in almost every place that
traverses the ring mirror list to avoid handling signaled jobs, while mine
requires an extra flag in the sched_job struct. I feel that keeping completed
jobs in the mirror list when they actually don't belong there any more
is confusing and an opening for future bugs.

Andrey


Andrey

Christian.

Andrey

+
           /* get the GPU back into the init state */
           v3d_reset(v3d);
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
  121.189757 <    0.000171>] amdgpu 0000:01:00.0: GPU reset(5) succeeded!
passed[  121.189894 <    0.000137>] ==================================================================


[  121.189951 <    0.000057>] BUG: KASAN: use-after-free in drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
Run Summary:    Type  Total    Ran Passed Failed Inactive
              suites      8      0    n/a      0        0
               tests     39      1      1      0        0
             asserts      8      8      8      0      n/a

Elapsed time =    0.001 seconds[  121.189956 <    0.000005>] Read of size 8 at addr ffff88840389a8b0 by task kworker/2:2/1140


[  121.189969 <    0.000013>] CPU: 2 PID: 1140 Comm: kworker/2:2 Tainted: G           OE     5.1.0-rc2-misc+ #1
[  121.189972 <    0.000003>] Hardware name: System manufacturer System Product Name/Z170-PRO, BIOS 1902 06/27/2016
[  121.189977 <    0.000005>] Workqueue: events drm_sched_job_timedout [gpu_sched]
[  121.189980 <    0.000003>] Call Trace:
[  121.189985 <    0.000005>]  dump_stack+0x9b/0xf5
[  121.189992 <    0.000007>]  print_address_description+0x70/0x290
[  121.189997 <    0.000005>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190002 <    0.000005>]  kasan_report+0x134/0x191
[  121.190006 <    0.000004>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190014 <    0.000008>]  ? drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190019 <    0.000005>]  __asan_load8+0x54/0x90
[  121.190024 <    0.000005>]  drm_sched_job_timedout+0x7a/0xf0 [gpu_sched]
[  121.190034 <    0.000010>]  process_one_work+0x466/0xb00
[  121.190046 <    0.000012>]  ? queue_work_node+0x180/0x180
[  121.190061 <    0.000015>]  worker_thread+0x83/0x6c0
[  121.190075 <    0.000014>]  kthread+0x1a9/0x1f0
[  121.190079 <    0.000004>]  ? rescuer_thread+0x760/0x760
[  121.190081 <    0.000002>]  ? kthread_cancel_delayed_work_sync+0x20/0x20
[  121.190088 <    0.000007>]  ret_from_fork+0x3a/0x50

[  121.190105 <    0.000017>] Allocated by task 1421:
[  121.190110 <    0.000005>]  save_stack+0x46/0xd0
[  121.190112 <    0.000002>]  __kasan_kmalloc+0xab/0xe0
[  121.190115 <    0.000003>]  kasan_kmalloc+0xf/0x20
[  121.190117 <    0.000002>]  __kmalloc+0x167/0x390
[  121.190210 <    0.000093>]  amdgpu_job_alloc+0x47/0x170 [amdgpu]
[  121.190289 <    0.000079>]  amdgpu_cs_ioctl+0x9bd/0x2e70 [amdgpu]
[  121.190312 <    0.000023>]  drm_ioctl_kernel+0x17e/0x1d0 [drm]
[  121.190334 <    0.000022>]  drm_ioctl+0x5e1/0x640 [drm]
[  121.190409 <    0.000075>]  amdgpu_drm_ioctl+0x78/0xd0 [amdgpu]
[  121.190413 <    0.000004>]  do_vfs_ioctl+0x152/0xa30
[  121.190415 <    0.000002>]  ksys_ioctl+0x6d/0x80
[  121.190418 <    0.000003>]  __x64_sys_ioctl+0x43/0x50
[  121.190425 <    0.000007>]  do_syscall_64+0x7d/0x240
[  121.190430 <    0.000005>]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[  121.190440 <    0.000010>] Freed by task 1242:
[  121.190448 <    0.000008>]  save_stack+0x46/0xd0
[  121.190453 <    0.000005>]  __kasan_slab_free+0x13c/0x1a0
[  121.190458 <    0.000005>]  kasan_slab_free+0xe/0x10
[  121.190462 <    0.000004>]  kfree+0xfa/0x2e0
[  121.190584 <    0.000122>]  amdgpu_job_free_cb+0x7f/0x90 [amdgpu]
[  121.190589 <    0.000005>]  drm_sched_cleanup_jobs.part.10+0xcf/0x1a0 [gpu_sched]
[  121.190594 <    0.000005>]  drm_sched_main+0x38a/0x430 [gpu_sched]
[  121.190596 <    0.000002>]  kthread+0x1a9/0x1f0
[  121.190599 <    0.000003>]  ret_from_fork+0x3a/0x50



--- End Message ---
_______________________________________________
dri-devel mailing list
dri-devel@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/dri-devel