Thanks for this. A few comments and a question inline. On 2018-08-31 09:27 AM, Christian König wrote: > Since we have a lot of FAQ on the VM state machine try to improve the > documentation by adding functions for each state move. > > Signed-off-by: Christian König <christian.koenig at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 107 ++++++++++++++++++++++++--------- > 1 file changed, 79 insertions(+), 28 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index a9275a99d793..40c22635fefd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -204,6 +204,69 @@ static unsigned amdgpu_vm_bo_size(struct amdgpu_device *adev, unsigned level) > return AMDGPU_GPU_PAGE_ALIGN(amdgpu_vm_num_entries(adev, level) * 8); > } > > +/** > + * amdgpu_vm_bo_evicted - vm_bo is evicted > + * > + * @vm_bo: vm_bo which is evicted > + * > + * State for PDs/PTs and per VM BOs which are not at the location they should > + * be. > + */ > +static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) > +{ > + struct amdgpu_vm *vm = vm_bo->vm; > + struct amdgpu_bo *bo = vm_bo->bo; > + > + vm_bo->moved = true; > + if (bo->tbo.type == ttm_bo_type_kernel) > + list_move(&vm_bo->vm_status, &vm->evicted); > + else > + list_move_tail(&vm_bo->vm_status, &vm->evicted); > +} > + > +/** > + * amdgpu_vm_bo_relocated - vm_bo is reloacted > + * > + * @vm_bo: vm_bo which is relocated > + * > + * State for PDs/PTs which needs to update their parent PD. > + */ > +static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) > +{ > + list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); > +} > + > +/** > + * amdgpu_vm_bo_moved - vm_bo is moved > + * > + * @vm_bo: vm_bo which is moved > + * > + * State for per VM and normal BOs which are moved, but that change is not yet > + * reflected in the page tables. I have a question here. Why does amdgpu_cs_vm_handling call amdgpu_vm_bo_update manually for its BO list entries? Wouldn't it be enough to just call amdgpu_vm_handle_moved? > + */ > +static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) > +{ > + struct amdgpu_vm *vm = vm_bo->vm; > + > + spin_lock(&vm->moved_lock); > + list_move(&vm_bo->vm_status, &vm->moved); > + spin_unlock(&vm->moved_lock); If vm->moved_lock protects the moved list, do we also need to take it whenever something is moved from that list? That could potentially be any list_move operation that uses vm_bo->vm_status. I found one case below where that may not be handled correctly. > +} > + > +/** > + * amdgpu_vm_bo_idle - vm_bo is idle > + * > + * @vm_bo: vm_bo which is now idle > + * > + * State for PDs/PTs and per VM BOs which have gone through the state machine > + * and are now idle. > + */ > +static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) > +{ > + list_move(&vm_bo->vm_status, &vm_bo->vm->idle); > + vm_bo->moved = false; > +} > + > /** > * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm > * > @@ -232,9 +295,9 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, > > vm->bulk_moveable = false; > if (bo->tbo.type == ttm_bo_type_kernel) > - list_move(&base->vm_status, &vm->relocated); > + amdgpu_vm_bo_relocated(base); > else > - list_move(&base->vm_status, &vm->idle); > + amdgpu_vm_bo_idle(base); > > if (bo->preferred_domains & > amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type)) > @@ -245,8 +308,7 @@ static void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, > * is currently evicted. add the bo to the evicted list to make sure it > * is validated on next vm use to avoid fault. > * */ > - list_move_tail(&base->vm_status, &vm->evicted); > - base->moved = true; > + amdgpu_vm_bo_evicted(base); > } > > /** > @@ -342,9 +404,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, > break; > > if (bo->tbo.type != ttm_bo_type_kernel) { > - spin_lock(&vm->moved_lock); > - list_move(&bo_base->vm_status, &vm->moved); > - spin_unlock(&vm->moved_lock); > + amdgpu_vm_bo_moved(bo_base); > } else { > if (vm->use_cpu_for_update) > r = amdgpu_bo_kmap(bo, NULL); > @@ -352,7 +412,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, > r = amdgpu_ttm_alloc_gart(&bo->tbo); > if (r) > break; > - list_move(&bo_base->vm_status, &vm->relocated); > + amdgpu_vm_bo_relocated(bo_base); > } > } > > @@ -1123,8 +1183,7 @@ int amdgpu_vm_update_directories(struct amdgpu_device *adev, > bo_base = list_first_entry(&vm->relocated, > struct amdgpu_vm_bo_base, > vm_status); > - bo_base->moved = false; > - list_move(&bo_base->vm_status, &vm->idle); > + amdgpu_vm_bo_idle(bo_base); > > bo = bo_base->bo->parent; > if (!bo) > @@ -1243,7 +1302,7 @@ static void amdgpu_vm_handle_huge_pages(struct amdgpu_pte_update_params *p, > if (entry->huge) { > /* Add the entry to the relocated list to update it. */ > entry->huge = false; > - list_move(&entry->base.vm_status, &p->vm->relocated); > + amdgpu_vm_bo_relocated(&entry->base); > } > return; > } > @@ -1746,9 +1805,9 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, > uint32_t mem_type = bo->tbo.mem.mem_type; > > if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(mem_type))) > - list_add_tail(&bo_va->base.vm_status, &vm->evicted); > + amdgpu_vm_bo_evicted(&bo_va->base); > else > - list_add(&bo_va->base.vm_status, &vm->idle); > + amdgpu_vm_bo_idle(&bo_va->base); There is a small change in behaviour here for clearing bo_va->base.moved. Not sure if it matters. > } > > list_splice_init(&bo_va->invalids, &bo_va->valids); > @@ -2472,28 +2531,20 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev, > > list_for_each_entry(bo_base, &bo->va, bo_list) { > struct amdgpu_vm *vm = bo_base->vm; > - bool was_moved = bo_base->moved; > > - bo_base->moved = true; > if (evicted && bo->tbo.resv == vm->root.base.bo->tbo.resv) { > - if (bo->tbo.type == ttm_bo_type_kernel) > - list_move(&bo_base->vm_status, &vm->evicted); > - else > - list_move_tail(&bo_base->vm_status, > - &vm->evicted); > + amdgpu_vm_bo_evicted(bo_base); I think here it's possible that the BO was on the moved list. I think that means amdgpu_vm_bo_evicted should take the moved_lock just in case. Regards,  Felix > continue; > } > > - if (was_moved) > + if (bo_base->moved) > continue; > > - if (bo->tbo.type == ttm_bo_type_kernel) { > - list_move(&bo_base->vm_status, &vm->relocated); > - } else { > - spin_lock(&bo_base->vm->moved_lock); > - list_move(&bo_base->vm_status, &vm->moved); > - spin_unlock(&bo_base->vm->moved_lock); > - } > + bo_base->moved = true; > + if (bo->tbo.type == ttm_bo_type_kernel) > + amdgpu_vm_bo_relocated(bo_base); > + else > + amdgpu_vm_bo_moved(bo_base); > } > } >