Am 21.10.2017 um 02:23 schrieb Felix Kuehling: > The kfd_process doesn't own a reference to the mm_struct, so it can > disappear without warning even while the kfd_process still exists. > In fact, the delayed kfd_process teardown is triggered by an MMU > notifier when the mm_struct is destroyed. Permanently holding a > reference to the mm_struct would prevent this from happening. > > Therefore, avoid dereferencing the kfd_process.mm pointer and make > it opaque. Use get_task_mm to get a temporary reference to the mm > when it's needed. Actually that patch is unnecessary. Process tear down (and calling the MMU release callback) is triggered when mm_struct->mm_users reaches zero. The mm_struct is freed up when mm_struct->mm_count becomes zero. So what you can do is grab a reference to the mm_struct with mmgrab() and still be notified by process tear down. Regards, Christian. > > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> > --- > drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++---- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++++++- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++++- > 3 files changed, 26 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > index 944abfa..61ce547 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c > @@ -24,8 +24,8 @@ > #include <linux/slab.h> > #include <linux/types.h> > #include <linux/sched/signal.h> > +#include <linux/sched/mm.h> > #include <linux/uaccess.h> > -#include <linux/mm.h> > #include <linux/mman.h> > #include <linux/memory.h> > #include "kfd_priv.h" > @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, > * running so the lookup function returns a locked process. > */ > struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); > + struct mm_struct *mm; > > if (!p) > return; /* Presumably process exited. */ > > + /* Take a safe reference to the mm_struct, which may otherwise > + * disappear even while the kfd_process is still referenced. > + */ > + mm = get_task_mm(p->lead_thread); > + if (!mm) { > + mutex_unlock(&p->mutex); > + return; /* Process is exiting */ > + } > + > memset(&memory_exception_data, 0, sizeof(memory_exception_data)); > > - down_read(&p->mm->mmap_sem); > - vma = find_vma(p->mm, address); > + down_read(&mm->mmap_sem); > + vma = find_vma(mm, address); > > memory_exception_data.gpu_id = dev->id; > memory_exception_data.va = address; > @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid, > } > } > > - up_read(&p->mm->mmap_sem); > + up_read(&mm->mmap_sem); > + mmput(mm); > > mutex_lock(&p->event_mutex); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 7d86ec9..1a483a7 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -494,7 +494,12 @@ struct kfd_process { > */ > struct hlist_node kfd_processes; > > - struct mm_struct *mm; > + /* > + * Opaque pointer to mm_struct. We don't hold a reference to > + * it so it should never be dereferenced from here. This is > + * only used for looking up processes by their mm. > + */ > + void *mm; > > struct mutex mutex; > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 3ccb3b5..21d27e5 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct rcu_head *rcu) > struct kfd_process *p; > > p = container_of(rcu, struct kfd_process, rcu); > - WARN_ON(atomic_read(&p->mm->mm_count) <= 0); > + /* > + * This cast should be safe here because we grabbed a > + * reference to the mm in kfd_process_notifier_release > + */ > + WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= 0); > > mmdrop(p->mm); >