Am 16.05.2017 um 08:44 schrieb Chunming Zhou: > below ioctl will return -ENODEV: > amdgpu_cs_ioctl > amdgpu_cs_wait_ioctl > amdgpu_cs_wait_fences_ioctl > amdgpu_gem_va_ioctl > amdgpu_info_ioctl > > Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e > Signed-off-by: Chunming Zhou <David1.Zhou at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +++- > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ++++++++++ > 5 files changed, 28 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index f9da215..dcd6203 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -855,6 +855,7 @@ struct amdgpu_fpriv { > struct amdgpu_ctx_mgr ctx_mgr; > spinlock_t sem_handles_lock; > struct idr sem_handles; > + u32 vram_lost_counter; > }; > > /* > @@ -1607,6 +1608,7 @@ struct amdgpu_device { > atomic64_t num_bytes_moved; > atomic64_t num_evictions; > atomic_t gpu_reset_counter; > + atomic_t vram_lost_counter; > > /* data for buffer migration throttling */ > struct { > @@ -2005,6 +2007,8 @@ static inline void amdgpu_unregister_atpx_handler(void) {} > extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; > extern const int amdgpu_max_kms_ioctl; > > +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > + struct amdgpu_fpriv *fpriv); > int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags); > int amdgpu_driver_unload_kms(struct drm_device *dev); > void amdgpu_driver_lastclose_kms(struct drm_device *dev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index b803412..911aa02 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_cs *cs = data; > struct amdgpu_cs_parser parser = {}; > bool reserved_buffers = false; > @@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > > if (!adev->accel_working) > return -EBUSY; > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > > parser.adev = adev; > parser.filp = filp; > @@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, > { > union drm_amdgpu_wait_cs *wait = data; > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); > struct amdgpu_ring *ring = NULL; > struct amdgpu_ctx *ctx; > struct fence *fence; > long r; > > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance, > wait->in.ring, &ring); > if (r) > @@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, > struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_wait_fences *wait = data; > uint32_t fence_count = wait->in.fence_count; > struct drm_amdgpu_fence *fences_user; > struct drm_amdgpu_fence *fences; > int r; > > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > /* Get the fences from userspace */ > fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence), > GFP_KERNEL); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 58af9ea..417b8f8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2919,8 +2919,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) > if (r) > goto out; > vram_lost = amdgpu_check_vram_lost(adev); > - if (vram_lost) > + if (vram_lost) { > DRM_ERROR("VRAM is lost!\n"); > + atomic_inc(&adev->vram_lost_counter); > + } > r = amdgpu_ttm_recover_gart(adev); > if (r) > goto out; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > index d8275ef..0f0b736 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > @@ -776,6 +776,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, > > if (!adev->vm_manager.enabled) > return -ENOTTY; > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; We should only block AMDGPU_VA_OP_MAP and AMDGPU_VA_OP_REPLACE here and still allow AMDGPU_VA_OP_UNMAP and AMDGPU_VA_OP_CLEAR. BTW: How should the UMD recover from that situation? Completely close the fd and recreate it? That might be tricky for processes like X or the Compositor. Should we have an IOCTL to reset the vram_lost counter for an fd? Christian. > > if (args->va_address < AMDGPU_VA_RESERVED_SIZE) { > dev_err(&dev->pdev->dev, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 368829a..a231aa1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, > static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > struct drm_amdgpu_info *info = data; > struct amdgpu_mode_info *minfo = &adev->mode_info; > void __user *out = (void __user *)(uintptr_t)info->return_pointer; > @@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file > > if (!info->return_size || !info->return_pointer) > return -EINVAL; > + if (amdgpu_kms_vram_lost(adev, fpriv)) > + return -ENODEV; > > switch (info->query) { > case AMDGPU_INFO_VIRTUAL_RANGE: { > @@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev) > vga_switcheroo_process_delayed_switch(); > } > > +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > + struct amdgpu_fpriv *fpriv) > +{ > + return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter); > +} > + > /** > * amdgpu_driver_open_kms - drm callback for open > * > @@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) > > amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); > > + fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > file_priv->driver_priv = fpriv; > > out_suspend: