On 2017年05月16日 16:13, Christian König wrote: > Am 16.05.2017 um 08:44 schrieb Chunming Zhou: >> below ioctl will return -ENODEV: >> amdgpu_cs_ioctl >> amdgpu_cs_wait_ioctl >> amdgpu_cs_wait_fences_ioctl >> amdgpu_gem_va_ioctl >> amdgpu_info_ioctl >> >> Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e >> Signed-off-by: Chunming Zhou <David1.Zhou at amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ >> 5 files changed, 28 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index f9da215..dcd6203 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -855,6 +855,7 @@ struct amdgpu_fpriv { >> struct amdgpu_ctx_mgr ctx_mgr; >> spinlock_t sem_handles_lock; >> struct idr sem_handles; >> + u32 vram_lost_counter; >> }; >> /* >> @@ -1607,6 +1608,7 @@ struct amdgpu_device { >> atomic64_t num_bytes_moved; >> atomic64_t num_evictions; >> atomic_t gpu_reset_counter; >> + atomic_t vram_lost_counter; >> /* data for buffer migration throttling */ >> struct { >> @@ -2005,6 +2007,8 @@ static inline void >> amdgpu_unregister_atpx_handler(void) {} >> extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; >> extern const int amdgpu_max_kms_ioctl; >> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >> + struct amdgpu_fpriv *fpriv); >> int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long >> flags); >> int amdgpu_driver_unload_kms(struct drm_device *dev); >> void amdgpu_driver_lastclose_kms(struct drm_device *dev); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index b803412..911aa02 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct >> amdgpu_cs_parser *p, >> int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct >> drm_file *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> union drm_amdgpu_cs *cs = data; >> struct amdgpu_cs_parser parser = {}; >> bool reserved_buffers = false; >> @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, >> void *data, struct drm_file *filp) >> if (!adev->accel_working) >> return -EBUSY; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> parser.adev = adev; >> parser.filp = filp; >> @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device >> *dev, void *data, >> { >> union drm_amdgpu_wait_cs *wait = data; >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); >> struct amdgpu_ring *ring = NULL; >> struct amdgpu_ctx *ctx; >> struct fence *fence; >> long r; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> r = amdgpu_cs_get_ring(adev, wait->in.ip_type, >> wait->in.ip_instance, >> wait->in.ring, &ring); >> if (r) >> @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct >> drm_device *dev, void *data, >> struct drm_file *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> union drm_amdgpu_wait_fences *wait = data; >> uint32_t fence_count = wait->in.fence_count; >> struct drm_amdgpu_fence *fences_user; >> struct drm_amdgpu_fence *fences; >> int r; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> /* Get the fences from userspace */ >> fences = kmalloc_array(fence_count, sizeof(struct >> drm_amdgpu_fence), >> GFP_KERNEL); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 
58af9ea..417b8f8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -2919,8 +2919,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) >> if (r) >> goto out; >> vram_lost = amdgpu_check_vram_lost(adev); >> - if (vram_lost) >> + if (vram_lost) { >> DRM_ERROR("VRAM is lost!\n"); >> + atomic_inc(&adev->vram_lost_counter); >> + } >> r = amdgpu_ttm_recover_gart(adev); >> if (r) >> goto out; >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> index d8275ef..0f0b736 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c >> @@ -776,6 +776,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, >> void *data, >> if (!adev->vm_manager.enabled) >> return -ENOTTY; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; > > We should only block AMDGPU_VA_OP_MAP and AMDGPU_VA_OP_REPLACE here > and still allow AMDGPU_VA_OP_UNMAP and AMDGPU_VA_OP_CLEAR. > > BTW: How should the UMD recover from that situation? Completely close > the fd and recreate it? > > That might be tricky for processes like X or the Compositor. Should we > have an IOCTL to reset the vram_lost counter for an fd? If there is no NAK, I can add it. Regards, David Zhou > > Christian. 
> >> if (args->va_address < AMDGPU_VA_RESERVED_SIZE) { >> dev_err(&dev->pdev->dev, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> index 368829a..a231aa1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c >> @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct >> drm_amdgpu_info_firmware *fw_info, >> static int amdgpu_info_ioctl(struct drm_device *dev, void *data, >> struct drm_file *filp) >> { >> struct amdgpu_device *adev = dev->dev_private; >> + struct amdgpu_fpriv *fpriv = filp->driver_priv; >> struct drm_amdgpu_info *info = data; >> struct amdgpu_mode_info *minfo = &adev->mode_info; >> void __user *out = (void __user *)(uintptr_t)info->return_pointer; >> @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device >> *dev, void *data, struct drm_file >> if (!info->return_size || !info->return_pointer) >> return -EINVAL; >> + if (amdgpu_kms_vram_lost(adev, fpriv)) >> + return -ENODEV; >> switch (info->query) { >> case AMDGPU_INFO_VIRTUAL_RANGE: { >> @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct >> drm_device *dev) >> vga_switcheroo_process_delayed_switch(); >> } >> +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, >> + struct amdgpu_fpriv *fpriv) >> +{ >> + return fpriv->vram_lost_counter != >> atomic_read(&adev->vram_lost_counter); >> +} >> + >> /** >> * amdgpu_driver_open_kms - drm callback for open >> * >> @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device >> *dev, struct drm_file *file_priv) >> amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); >> + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); >> file_priv->driver_priv = fpriv; >> out_suspend: > >