On 09.10.2017 10:16, Christian König wrote: > From: Christian König <christian.koenig at amd.com> > > Revert "drm/amdgpu: skip all jobs of guilty vm" and > "drm/amdgpu: return -ENODEV to user space when vram is lost v2" > > Forcing userspace to restart without a chance to recover in case of a GPU reset > doesn't make much sense and just completely breaks GPU reset handling and makes > the system unusable after a reset. > > Signed-off-by: Christian König <christian.koenig at amd.com> Acked-by: Nicolai Hähnle <nicolai.haehnle at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ---- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 14 -------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +--- > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 5 ----- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 15 ++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 ---------- > 6 files changed, 5 insertions(+), 47 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 71e971f..81dd5ef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -772,7 +772,6 @@ struct amdgpu_fpriv { > struct mutex bo_list_lock; > struct idr bo_list_handles; > struct amdgpu_ctx_mgr ctx_mgr; > - u32 vram_lost_counter; > }; > > /* > @@ -1501,7 +1500,6 @@ struct amdgpu_device { > atomic64_t num_evictions; > atomic64_t num_vram_cpu_page_faults; > atomic_t gpu_reset_counter; > - atomic_t vram_lost_counter; > > /* data for buffer migration throttling */ > struct { > @@ -1845,8 +1843,6 @@ static inline bool amdgpu_has_atpx(void) { return false; } > extern const struct drm_ioctl_desc amdgpu_ioctls_kms[]; > extern const int amdgpu_max_kms_ioctl; > > -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > - struct amdgpu_fpriv *fpriv); > int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags); > void amdgpu_driver_unload_kms(struct drm_device *dev); > void 
amdgpu_driver_lastclose_kms(struct drm_device *dev); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index ab83dfc..adb0c1c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -1189,7 +1189,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_cs *cs = data; > struct amdgpu_cs_parser parser = {}; > bool reserved_buffers = false; > @@ -1197,8 +1196,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > > if (!adev->accel_working) > return -EBUSY; > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > > parser.adev = adev; > parser.filp = filp; > @@ -1257,16 +1254,12 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data, > { > union drm_amdgpu_wait_cs *wait = data; > struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout); > struct amdgpu_ring *ring = NULL; > struct amdgpu_ctx *ctx; > struct dma_fence *fence; > long r; > > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > - > ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id); > if (ctx == NULL) > return -EINVAL; > @@ -1335,16 +1328,12 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data, > struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_fence_to_handle *info = data; > struct dma_fence *fence; > struct drm_syncobj *syncobj; > struct sync_file *sync_file; > int fd, r; > > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > - > fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence); > if (IS_ERR(fence)) > 
return PTR_ERR(fence); > @@ -1506,15 +1495,12 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data, > struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > union drm_amdgpu_wait_fences *wait = data; > uint32_t fence_count = wait->in.fence_count; > struct drm_amdgpu_fence *fences_user; > struct drm_amdgpu_fence *fences; > int r; > > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > /* Get the fences from userspace */ > fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence), > GFP_KERNEL); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 24f6e3c..6d641e0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2951,10 +2951,8 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev) > if (r) > goto out; > vram_lost = amdgpu_check_vram_lost(adev); > - if (vram_lost) { > + if (vram_lost) > DRM_ERROR("VRAM is lost!\n"); > - atomic_inc(&adev->vram_lost_counter); > - } > r = amdgpu_ttm_recover_gart(adev); > if (r) > goto out; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > index b0d45c8..c69048c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > @@ -577,11 +577,6 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data, > args->operation); > return -EINVAL; > } > - if ((args->operation == AMDGPU_VA_OP_MAP) || > - (args->operation == AMDGPU_VA_OP_REPLACE)) { > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > - } > > INIT_LIST_HEAD(&list); > INIT_LIST_HEAD(&duplicates); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 4510627..63bd372 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -178,7 +178,6 @@ static 
struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > { > struct dma_fence *fence = NULL; > struct amdgpu_job *job; > - struct amdgpu_fpriv *fpriv = NULL; > int r; > > if (!sched_job) { > @@ -190,16 +189,10 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job) > BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > > trace_amdgpu_sched_run_job(job); > - if (job->vm) > - fpriv = container_of(job->vm, struct amdgpu_fpriv, vm); > - /* skip ib schedule when vram is lost */ > - if (fpriv && amdgpu_kms_vram_lost(job->adev, fpriv)) > - DRM_ERROR("Skip scheduling IBs!\n"); > - else { > - r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence); > - if (r) > - DRM_ERROR("Error scheduling IBs (%d)\n", r); > - } > + r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job, &fence); > + if (r) > + DRM_ERROR("Error scheduling IBs (%d)\n", r); > + > /* if gpu reset, hw fence will be replaced here */ > dma_fence_put(job->fence); > job->fence = dma_fence_get(fence); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 8c6fd56..0fc36b2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -269,7 +269,6 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, > static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > { > struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > struct drm_amdgpu_info *info = data; > struct amdgpu_mode_info *minfo = &adev->mode_info; > void __user *out = (void __user *)(uintptr_t)info->return_pointer; > @@ -282,8 +281,6 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file > > if (!info->return_size || !info->return_pointer) > return -EINVAL; > - if (amdgpu_kms_vram_lost(adev, fpriv)) > - return -ENODEV; > > switch (info->query) { > case AMDGPU_INFO_ACCEL_WORKING: > @@ 
-791,12 +788,6 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev) > vga_switcheroo_process_delayed_switch(); > } > > -bool amdgpu_kms_vram_lost(struct amdgpu_device *adev, > - struct amdgpu_fpriv *fpriv) > -{ > - return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter); > -} > - > /** > * amdgpu_driver_open_kms - drm callback for open > * > @@ -853,7 +844,6 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) > > amdgpu_ctx_mgr_init(&fpriv->ctx_mgr); > > - fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > file_priv->driver_priv = fpriv; > > out_suspend: > -- Lerne, wie die Welt wirklich ist, Aber vergiss niemals, wie sie sein sollte.