[Public] Ping.. > -----Original Message----- > From: Chen, Guchun <Guchun.Chen@xxxxxxx> > Sent: Wednesday, May 24, 2023 5:23 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander > <Alexander.Deucher@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Yang, Philip > <Philip.Yang@xxxxxxx>; Kuehling, Felix <Felix.Kuehling@xxxxxxx> > Cc: Chen, Guchun <Guchun.Chen@xxxxxxx> > Subject: [PATCH] drm/amdgpu: add a flag to indicate if a VM is attached to > fpriv > > Recent code stores xcp_id to amdgpu bo for accounting memory usage or > find correct KFD node, and this xcp_id is from file private data after opening > device. However, not all VMs are attached to this fpriv structure like the case > in amdgpu_mes_self_test. > So add a flag to differentiate the cases. Otherwise, KASAN will complain out > of bound access. > > [ 77.292314] BUG: KASAN: slab-out-of-bounds in > amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] > [ 77.293845] Read of size 4 at addr ffff888102c48a48 by task > modprobe/1069 > [ 77.294146] Call Trace: > [ 77.294178] <TASK> > [ 77.294208] dump_stack_lvl+0x49/0x63 > [ 77.294260] print_report+0x16f/0x4a6 > [ 77.294307] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] > [ 77.295979] ? kasan_complete_mode_report_info+0x3c/0x200 > [ 77.296057] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] > [ 77.297556] kasan_report+0xb4/0x130 > [ 77.297609] ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] > [ 77.299202] __asan_load4+0x6f/0x90 > [ 77.299272] amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu] > [ 77.300796] ? amdgpu_init+0x6e/0x1000 [amdgpu] > [ 77.302222] ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu] > [ 77.303721] ? preempt_count_sub+0x18/0xc0 > [ 77.303786] amdgpu_vm_init+0x39e/0x870 [amdgpu] > [ 77.305186] ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu] > [ 77.306683] ? kasan_set_track+0x25/0x30 > [ 77.306737] ? kasan_save_alloc_info+0x1b/0x30 > [ 77.306795] ? __kasan_kmalloc+0x87/0xa0 > [ 77.306852] amdgpu_mes_self_test+0x169/0x620 [amdgpu] > > Fixes: ffc6deb773f7("drm/amdkfd: Store xcp partition id to amdgpu bo") > Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 ++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 5 ++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 12 +++++++++--- > 5 files changed, 19 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > index 41d047e5de69..79b80f9233db 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c > @@ -1229,7 +1229,7 @@ int amdgpu_driver_open_kms(struct drm_device > *dev, struct drm_file *file_priv) > pasid = 0; > } > > - r = amdgpu_vm_init(adev, &fpriv->vm); > + r = amdgpu_vm_init(adev, &fpriv->vm, true); > if (r) > goto error_pasid; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > index 49bb6c03d606..3be5219edf88 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > @@ -1345,7 +1345,7 @@ int amdgpu_mes_self_test(struct amdgpu_device > *adev) > goto error_pasid; > } > > - r = amdgpu_vm_init(adev, vm); > + r = amdgpu_vm_init(adev, vm, false); > if (r) { > DRM_ERROR("failed to initialize vm\n"); > goto error_pasid; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 37b9d8a8dbec..47ffaa1526a0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2099,13 +2099,15 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm > *vm, long timeout) > * > * @adev: amdgpu_device pointer > * @vm: requested vm > + * @vm_attach_to_fpriv: flag to tell if vm is attached to file private > + data > * > * Init @vm fields. > * > * Returns: > * 0 for success, error for failure. > */ > -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) > +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > + bool vm_attach_to_fpriv) > { > struct amdgpu_bo *root_bo; > struct amdgpu_bo_vm *root; > @@ -2131,6 +2133,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, > struct amdgpu_vm *vm) > > vm->pte_support_ats = false; > vm->is_compute_context = false; > + vm->vm_attach_to_fpriv = vm_attach_to_fpriv; > > vm->use_cpu_for_update = !!(adev- > >vm_manager.vm_update_mode & > AMDGPU_VM_USE_CPU_FOR_GFX); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index d551fca1780e..62ed14b1fc16 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -333,6 +333,9 @@ struct amdgpu_vm { > /* Flag to indicate if VM is used for compute */ > bool is_compute_context; > > + /* Flag to tell if VM is attached to file private data */ > + bool vm_attach_to_fpriv; > + > /* Memory partition number, -1 means any partition */ > int8_t mem_id; > }; > @@ -392,7 +395,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device > *adev, struct amdgpu_vm *vm, > u32 pasid); > > long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); -int > amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm); > +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, > +bool vm_attach_to_fpriv); > int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct > amdgpu_vm *vm); void amdgpu_vm_release_compute(struct > amdgpu_device *adev, struct amdgpu_vm *vm); void > amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); diff > --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > index cc3b1b596e56..16b3350aa896 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c > @@ -502,7 +502,7 @@ int amdgpu_vm_pt_clear(struct amdgpu_device > *adev, struct amdgpu_vm *vm, int amdgpu_vm_pt_create(struct > amdgpu_device *adev, struct amdgpu_vm *vm, > int level, bool immediate, struct amdgpu_bo_vm > **vmbo) { > - struct amdgpu_fpriv *fpriv = container_of(vm, struct amdgpu_fpriv, > vm); > + struct amdgpu_fpriv *fpriv; > struct amdgpu_bo_param bp; > struct amdgpu_bo *bo; > struct dma_resv *resv; > @@ -535,7 +535,10 @@ int amdgpu_vm_pt_create(struct amdgpu_device > *adev, struct amdgpu_vm *vm, > > bp.type = ttm_bo_type_kernel; > bp.no_wait_gpu = immediate; > - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; > + if (vm->vm_attach_to_fpriv) { > + fpriv = container_of(vm, struct amdgpu_fpriv, vm); > + bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; > + } > > if (vm->root.bo) > bp.resv = vm->root.bo->tbo.base.resv; @@ -561,7 +564,10 > @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct > amdgpu_vm *vm, > bp.type = ttm_bo_type_kernel; > bp.resv = bo->tbo.base.resv; > bp.bo_ptr_size = sizeof(struct amdgpu_bo); > - bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; > + if (vm->vm_attach_to_fpriv) { > + fpriv = container_of(vm, struct amdgpu_fpriv, vm); > + bp.xcp_id_plus1 = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id + 1; > + } > > r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow); > > -- > 2.25.1