On 23.03.2017 at 18:22, Nicolai Hähnle wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Also, add the fence of the clear operations to the BO to ensure that
> the underlying memory can only be re-used after all PTEs pointing to
> it have been cleared.
>
> This avoids the following sequence of events that could be triggered
> by user space:
>
> 1. Submit a CS that accesses some BO _without_ adding that BO to the
>    buffer list.
> 2. Free that BO.
> 3. Some other task re-uses the memory underlying the BO.
> 4. The CS is submitted to the hardware and accesses memory that is
>    now already in use by somebody else.
>
> By clearing the page tables immediately in step 2, a GPU VM fault will
> be triggered in step 4 instead of wild memory accesses.

First of all, please split adding the fence parameter to
amdgpu_vm_clear_freed() into a separate patch. Not a must-have, but that
should make it easier to review.

>
> Signed-off-by: Nicolai Hähnle <nicolai.haehnle at amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 13 +++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 20 ++++++++++++++------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  3 ++-
>  4 files changed, 28 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 55d553a..85e6070 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -778,21 +778,21 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p)
>  	int i, r;
>
>  	r = amdgpu_vm_update_page_directory(adev, vm);
>  	if (r)
>  		return r;
>
>  	r = amdgpu_sync_fence(adev, &p->job->sync, vm->page_directory_fence);
>  	if (r)
>  		return r;
>
> -	r = amdgpu_vm_clear_freed(adev, vm);
> +	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>  	if (r)
>  		return r;
>
>  	r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
>  	if (r)
>  		return r;
>
>  	r = amdgpu_sync_fence(adev, &p->job->sync,
>  			      fpriv->prt_va->last_pt_update);
>  	if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index be9fb2c..bd2daef 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -145,20 +145,21 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>  	struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
>  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
>  	struct amdgpu_fpriv *fpriv = file_priv->driver_priv;
>  	struct amdgpu_vm *vm = &fpriv->vm;
>
>  	struct amdgpu_bo_list_entry vm_pd;
>  	struct list_head list, duplicates;
>  	struct ttm_validate_buffer tv;
>  	struct ww_acquire_ctx ticket;
>  	struct amdgpu_bo_va *bo_va;
> +	struct fence *fence = NULL;
>  	int r;
>
>  	INIT_LIST_HEAD(&list);
>  	INIT_LIST_HEAD(&duplicates);
>
>  	tv.bo = &bo->tbo;
>  	tv.shared = true;
>  	list_add(&tv.head, &list);
>
>  	amdgpu_vm_get_pd_bo(vm, &list, &vm_pd);
> @@ -166,23 +167,31 @@ void amdgpu_gem_object_close(struct drm_gem_object *obj,
>  	r = ttm_eu_reserve_buffers(&ticket, &list, false, &duplicates);
>  	if (r) {
>  		dev_err(adev->dev, "leaking bo va because "
>  			"we fail to reserve bo (%d)\n", r);
>  		return;
>  	}
>  	bo_va = amdgpu_vm_bo_find(vm, bo);
>  	if (bo_va) {
>  		if (--bo_va->ref_count == 0) {
>  			amdgpu_vm_bo_rmv(adev, bo_va);
> +
> +			amdgpu_vm_clear_freed(adev, vm, &fence);
>  		}
>  	}
> -	ttm_eu_backoff_reservation(&ticket, &list);
> +
> +	if (fence) {
> +		ttm_eu_fence_buffer_objects(&ticket, &list, fence);
> +		fence_put(fence);
> +	} else {
> +		ttm_eu_backoff_reservation(&ticket, &list);
> +	}

Ah, now I remember why we didn't do that before. We could run into problems
because amdgpu_gem_object_close() can't fail, but adding the fence needs to
allocate memory.

Anyway, "tv.shared = true;" was there before, so your patch doesn't make it
any worse.

But I suggest using amdgpu_bo_fence() instead of
ttm_eu_fence_buffer_objects(). This way we don't try to add the fence to the
page directory as well (see the untested sketch at the end of this mail for
roughly what I mean).

Apart from that, the patch looks good to me,
Christian.

>  }
>
>  static int amdgpu_gem_handle_lockup(struct amdgpu_device *adev, int r)
>  {
>  	if (r == -EDEADLK) {
>  		r = amdgpu_gpu_reset(adev);
>  		if (!r)
>  			r = -EAGAIN;
>  	}
>  	return r;
> @@ -535,21 +544,21 @@ static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
>
>  	r = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_gem_va_check,
>  				      NULL);
>  	if (r)
>  		goto error;
>
>  	r = amdgpu_vm_update_page_directory(adev, vm);
>  	if (r)
>  		goto error;
>
> -	r = amdgpu_vm_clear_freed(adev, vm);
> +	r = amdgpu_vm_clear_freed(adev, vm, NULL);
>  	if (r)
>  		goto error;
>
>  	if (operation == AMDGPU_VA_OP_MAP ||
>  	    operation == AMDGPU_VA_OP_REPLACE)
>  		r = amdgpu_vm_bo_update(adev, bo_va, false);
>
>  error:
>  	if (r && r != -ERESTARTSYS)
>  		DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index dd7df45..b6029ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -1397,48 +1397,56 @@ static void amdgpu_vm_prt_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>  	}
>
>  	kfree(shared);
>  }
>
>  /**
>   * amdgpu_vm_clear_freed - clear freed BOs in the PT
>   *
>   * @adev: amdgpu_device pointer
>   * @vm: requested vm
> + * @fence: optional resulting fence
>   *
>   * Make sure all freed BOs are cleared in the PT.
>   * Returns 0 for success.
>   *
>   * PTs have to be reserved and mutex must be locked!
>   */
>  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
> -			  struct amdgpu_vm *vm)
> +			  struct amdgpu_vm *vm,
> +			  struct fence **fence)
>  {
>  	struct amdgpu_bo_va_mapping *mapping;
> -	struct fence *fence = NULL;
> +	struct fence *f = NULL;
>  	int r;
>
>  	while (!list_empty(&vm->freed)) {
>  		mapping = list_first_entry(&vm->freed,
>  			struct amdgpu_bo_va_mapping, list);
>  		list_del(&mapping->list);
>
>  		r = amdgpu_vm_bo_split_mapping(adev, NULL, 0, NULL, vm, mapping,
> -					       0, 0, &fence);
> -		amdgpu_vm_free_mapping(adev, vm, mapping, fence);
> +					       0, 0, &f);
> +		amdgpu_vm_free_mapping(adev, vm, mapping, f);
>  		if (r) {
> -			fence_put(fence);
> +			fence_put(f);
>  			return r;
>  		}
> +	}
>
> +	if (fence && f) {
> +		fence_put(*fence);
> +		*fence = f;
> +	} else {
> +		fence_put(f);
>  	}
> -	fence_put(fence);
> +
>  	return 0;
>
>  }
>
>  /**
>   * amdgpu_vm_clear_invalids - clear invalidated BOs in the PT
>   *
>   * @adev: amdgpu_device pointer
>   * @vm: requested vm
>   *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index ff10fa5..9d5a572 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -187,21 +187,22 @@ int amdgpu_vm_alloc_pts(struct amdgpu_device *adev,
>  			struct amdgpu_vm *vm,
>  			uint64_t saddr, uint64_t size);
>  int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
>  		      struct amdgpu_sync *sync, struct fence *fence,
>  		      struct amdgpu_job *job);
>  int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
>  void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vm_id);
>  int amdgpu_vm_update_page_directory(struct amdgpu_device *adev,
>  				    struct amdgpu_vm *vm);
>  int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
> -			  struct amdgpu_vm *vm);
> +			  struct amdgpu_vm *vm,
> +			  struct fence **fence);
>  int amdgpu_vm_clear_invalids(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  			     struct amdgpu_sync *sync);
>  int amdgpu_vm_bo_update(struct amdgpu_device *adev,
>  			struct amdgpu_bo_va *bo_va,
>  			bool clear);
>  void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
>  			     struct amdgpu_bo *bo);
>  struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
>  				       struct amdgpu_bo *bo);
>  struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
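
PS: To illustrate the amdgpu_bo_fence() suggestion above, here is a rough,
untested sketch of how the tail of amdgpu_gem_object_close() could look with
it (variable names as in your patch; only meant as an illustration, not as
the final code):

	bo_va = amdgpu_vm_bo_find(vm, bo);
	if (bo_va) {
		if (--bo_va->ref_count == 0) {
			amdgpu_vm_bo_rmv(adev, bo_va);

			amdgpu_vm_clear_freed(adev, vm, &fence);
		}
	}

	if (fence) {
		/* Attach the clear fence only to the BO being closed, so
		 * the page directory doesn't pick it up as well. */
		amdgpu_bo_fence(bo, fence, true);
		fence_put(fence);
	}

	ttm_eu_backoff_reservation(&ticket, &list);

That keeps the normal backoff path for all reserved buffers and still makes
sure the BO's memory is only reused after the PTE clear has completed.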