On 2019-08-02 16:07, Jason Gunthorpe wrote: > When using mmu_notififer_unregister_no_release() the caller must ensure > there is a SRCU synchronize before the mn memory is freed, otherwise use > after free races are possible, for instance: > > CPU0 CPU1 > invalidate_range_start > hlist_for_each_entry_rcu(..) > mmu_notifier_unregister_no_release(&p->mn) > kfree(mn) > if (mn->ops->invalidate_range_end) > > The error unwind in amdkfd misses the SRCU synchronization. > > amdkfd keeps the kfd_process around until the mm is released, so split the > flow to fully initialize the kfd_process and register it for find_process, > and with the notifier. Past this point the kfd_process does not need to be > cleaned up as it is fully ready. > > The final failable step does a vm_mmap() and does not seem to impact the > kfd_process global state. Since it also cannot be undone (and already has > problems with undo if it internally fails), it has to be last. > > This way we don't have to try to unwind the mmu_notifier_register() and > avoid the problem with the SRCU. > > Along the way this also fixes various other error unwind bugs in the flow. > > Fixes: 45102048f77e ("amdkfd: Add process queue manager module") > Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 74 +++++++++++------------- > 1 file changed, 35 insertions(+), 39 deletions(-) > > amdkfd folks, this little bug is blocking some rework I have for the > mmu notifiers (ie mm/mmu_notifiers: remove unregister_no_release) > > Can I get your help to review and if needed polish this change? I'd > like to send this patch through the hmm tree along with the rework, > thanks Thanks. That's a nice cleanup of the error handling during KFD process creation. One nit-pick inline, otherwise this looks good to me. > > You can see the larger series here: > > https://github.com/jgunthorpe/linux/commits/mmu_notifier > > Jason > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 8f1076c0c88a25..81e3ee3f1813bf 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -62,8 +62,8 @@ static struct workqueue_struct *kfd_restore_wq; > > static struct kfd_process *find_process(const struct task_struct *thread); > static void kfd_process_ref_release(struct kref *ref); > -static struct kfd_process *create_process(const struct task_struct *thread, > - struct file *filep); > +static struct kfd_process *create_process(const struct task_struct *thread); > +static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep); > > static void evict_process_worker(struct work_struct *work); > static void restore_process_worker(struct work_struct *work); > @@ -289,7 +289,15 @@ struct kfd_process *kfd_create_process(struct file *filep) > if (process) { > pr_debug("Process already found\n"); > } else { > - process = create_process(thread, filep); > + process = create_process(thread); > + if (IS_ERR(process)) > + goto out; > + > + ret = kfd_process_init_cwsr_apu(process, filep); > + if (ret) { > + process = ERR_PTR(ret); > + goto out; > + } > > if (!procfs.kobj) > goto out; > @@ -609,64 +617,56 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) > return 0; > } > > -static struct kfd_process *create_process(const struct task_struct *thread, > - struct file *filep) > +/* > + * On return the kfd_process is fully operational and will be freed when the > + * mm is released > + */ > +static struct kfd_process *create_process(const struct task_struct *thread) > { > struct kfd_process *process; > int err = -ENOMEM; > > process = kzalloc(sizeof(*process), GFP_KERNEL); > - > if (!process) > goto err_alloc_process; > > - process->pasid = kfd_pasid_alloc(); > - if (process->pasid == 0) > - goto err_alloc_pasid; > - > - if (kfd_alloc_process_doorbells(process) < 0) > - goto err_alloc_doorbells; > - > kref_init(&process->ref); > - > mutex_init(&process->mutex); > - > process->mm = thread->mm; > - > - /* register notifier */ > - process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; > - err = mmu_notifier_register(&process->mmu_notifier, process->mm); > - if (err) > - goto err_mmu_notifier; > - > - hash_add_rcu(kfd_processes_table, &process->kfd_processes, > - (uintptr_t)process->mm); > - > process->lead_thread = thread->group_leader; > - get_task_struct(process->lead_thread); > - > INIT_LIST_HEAD(&process->per_device_data); > - > + INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); > + INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); > + process->last_restore_timestamp = get_jiffies_64(); > kfd_event_init_process(process); > + process->is_32bit_user_mode = in_compat_syscall(); > + > + process->pasid = kfd_pasid_alloc(); > + if (process->pasid == 0) > + goto err_alloc_pasid; > + > + if (kfd_alloc_process_doorbells(process) < 0) > + goto err_alloc_doorbells; > > err = pqm_init(&process->pqm, process); > if (err != 0) > goto err_process_pqm_init; > > /* init process apertures*/ > - process->is_32bit_user_mode = in_compat_syscall(); > err = kfd_init_apertures(process); > if (err != 0) > goto err_init_apertures; > > - INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); > - INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); > - process->last_restore_timestamp = get_jiffies_64(); > - > - err = kfd_process_init_cwsr_apu(process, filep); > + /* Must be last, have to use release destruction after this */ > + process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; > + err = mmu_notifier_register(&process->mmu_notifier, process->mm); > if (err) > goto err_init_cwsr; This label should be renamed to something like err_mmu_notifier. With that fixed this patch is Reviewed-by: Felix Kuehling <Felix.Kuehling@xxxxxxx> > > + get_task_struct(process->lead_thread); > + hash_add_rcu(kfd_processes_table, &process->kfd_processes, > + (uintptr_t)process->mm); > + > return process; > > err_init_cwsr: > @@ -675,15 +675,11 @@ static struct kfd_process *create_process(const struct task_struct *thread, > err_init_apertures: > pqm_uninit(&process->pqm); > err_process_pqm_init: > - hash_del_rcu(&process->kfd_processes); > - synchronize_rcu(); > - mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); > -err_mmu_notifier: > - mutex_destroy(&process->mutex); > kfd_free_process_doorbells(process); > err_alloc_doorbells: > kfd_pasid_free(process->pasid); > err_alloc_pasid: > + mutex_destroy(&process->mutex); > kfree(process); > err_alloc_process: > return ERR_PTR(err);