> During GPU reset, IOCTL still submit command to GPU. No it won't. We disable the GPU scheduler during GPU reset, so that isn't an issue. Do you have a concrete problem (e.g. dmesg?) you ran into? Regards, Christian. Am 21.04.2017 um 11:00 schrieb He, Hongbo: > Hi Christian: > > During GPU reset, IOCTL still submit command to GPU. > That will result in lots of strange problems when we test this feature. > > I have no better idea to handle this case. At that time, I have tried with return error directly rather than waiting GPU reset completion. > But that doesn't work and it lead to other problems. > > Thanks > Roger(Hongbo.He) > > -----Original Message----- > From: Christian König [mailto:deathsimple at vodafone.de] > Sent: Friday, April 21, 2017 4:27 PM > To: He, Hongbo; amd-gfx at lists.freedesktop.org > Cc: Zhou, David(ChunMing) > Subject: Re: [PATCH 3/3] drm/amdgpu: fix gpu reset issue > > NAK, that is exactly what we wanted to avoid. > > We used to have an exclusive lock for this and it cause a whole bunch of problems. > > Please elaborate why that should be necessary. > > Regards, > Christian. > > Am 21.04.2017 um 09:08 schrieb Roger.He: >> Change-Id: Ib77d33a09f348ebf2e3a9d7861411f4b951ebf7c >> Signed-off-by: Roger.He <Hongbo.He at amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 6 ++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c | 7 ++++++- >> 4 files changed, 31 insertions(+), 2 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 71364f5..ab0ffa8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -1526,6 +1526,7 @@ struct amdgpu_device { >> atomic64_t num_bytes_moved; >> atomic64_t num_evictions; >> atomic_t gpu_reset_counter; >> + atomic_t gpu_reset_state; >> >> /* data for buffer migration throttling */ >> struct { >> @@ -1851,6 +1852,11 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring) >> #define amdgpu_psp_check_fw_loading_status(adev, i) >> (adev)->firmware.funcs->check_fw_loading_status((adev), (i)) >> >> /* Common functions */ >> +static inline int amdgpu_device_is_reset(struct amdgpu_device *adev) >> +{ >> + return atomic_read(&adev->gpu_reset_state); >> +} >> + >> int amdgpu_gpu_reset(struct amdgpu_device *adev); >> bool amdgpu_need_backup(struct amdgpu_device *adev); >> void amdgpu_pci_config_reset(struct amdgpu_device *adev); diff --git >> a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index f882496..0fb4716 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -1894,6 +1894,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, >> mutex_init(&adev->grbm_idx_mutex); >> mutex_init(&adev->mn_lock); >> hash_init(adev->mn_hash); >> + atomic_set(&adev->gpu_reset_state, 0); >> >> amdgpu_check_arguments(adev); >> >> @@ -2655,6 +2656,18 @@ int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, bool voluntary) >> } >> >> /** >> + * amdgpu_device_set_reset_state - set gpu reset state >> + * >> + * @adev: amdgpu device pointer >> + * @state: true when start to reset gpu; false: reset done */ static >> +inline void amdgpu_device_set_reset_state(struct amdgpu_device *adev, >> + bool state) >> +{ >> + atomic_set(&adev->gpu_reset_state, state); } >> + >> +/** >> * amdgpu_gpu_reset - reset the asic >> * >> * @adev: amdgpu device pointer >> @@ -2678,7 +2691,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) >> } >> >> atomic_inc(&adev->gpu_reset_counter); >> - >> + amdgpu_device_set_reset_state(adev, true); >> /* block TTM */ >> resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev); >> /* store modesetting */ >> @@ -2811,6 +2824,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) >> dev_info(adev->dev, "GPU reset failed\n"); >> } >> >> + amdgpu_device_set_reset_state(adev, false); >> return r; >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index ead00d7..8cc14af 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -679,11 +679,15 @@ long amdgpu_drm_ioctl(struct file *filp, >> struct drm_file *file_priv = filp->private_data; >> struct drm_device *dev; >> long ret; >> + >> dev = file_priv->minor->dev; >> ret = pm_runtime_get_sync(dev->dev); >> if (ret < 0) >> return ret; >> >> + while (amdgpu_device_is_reset(dev->dev_private)) >> + msleep(100); >> + >> ret = drm_ioctl(filp, cmd, arg); >> >> pm_runtime_mark_last_busy(dev->dev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c >> index 2648291..22b8059 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ioc32.c >> @@ -36,10 +36,15 @@ >> long amdgpu_kms_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) >> { >> unsigned int nr = DRM_IOCTL_NR(cmd); >> + struct drm_file *file_priv = filp->private_data; >> + struct amdgpu_device *adev = file_priv->minor->dev->dev_private; >> int ret; >> >> - if (nr < DRM_COMMAND_BASE) >> + if (nr < DRM_COMMAND_BASE) { >> + while (amdgpu_device_is_reset(adev)) >> + msleep(100); >> return drm_compat_ioctl(filp, cmd, arg); >> + } >> >> ret = amdgpu_drm_ioctl(filp, cmd, arg); >> >