On 2017-10-26 02:11 PM, Christian König wrote: > But now reading the patch there is something else which I stumbled over: >> -   WARN_ON(atomic_read(&p->mm->mm_count) <= 0); >> +   /* >> +    * This cast should be safe here because we grabbed a >> +    * reference to the mm in kfd_process_notifier_release >> +    */ >> +   WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= 0); >>        mmdrop(p->mm); > Well that isn't good coding style. You shouldn't obfuscate what > pointer it is by changing it to "void*", but rather set it to NULL as > soon as you know that it is stale. > > Additional to that it is certainly not job of the driver to warn on a > run over mm_count. Yeah. We don't have this in our current staging branch. The whole process teardown has changed quite a bit. I just fixed this up to make it work with current upstream. If you prefer, I could just remove the WARN_ON. Regards,  Felix > > Regards, > Christian. > >> >> Regards, >>   Felix >> >>> Regards, >>> Christian. >>> >>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> >>>> --- >>>>   drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 +++++++++++++++---- >>>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 7 ++++++- >>>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++++- >>>>   3 files changed, 26 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>> index 944abfa..61ce547 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>> @@ -24,8 +24,8 @@ >>>>   #include <linux/slab.h> >>>>   #include <linux/types.h> >>>>   #include <linux/sched/signal.h> >>>> +#include <linux/sched/mm.h> >>>>   #include <linux/uaccess.h> >>>> -#include <linux/mm.h> >>>>   #include <linux/mman.h> >>>>   #include <linux/memory.h> >>>>   #include "kfd_priv.h" >>>> @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev >>>> *dev, unsigned int pasid, >>>>        * running so the lookup function returns a locked process. >>>>        */ >>>>       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); >>>> +   struct mm_struct *mm; >>>>        if (!p) >>>>           return; /* Presumably process exited. */ >>>>   +   /* Take a safe reference to the mm_struct, which may otherwise >>>> +    * disappear even while the kfd_process is still referenced. >>>> +    */ >>>> +   mm = get_task_mm(p->lead_thread); >>>> +   if (!mm) { >>>> +       mutex_unlock(&p->mutex); >>>> +       return; /* Process is exiting */ >>>> +   } >>>> + >>>>       memset(&memory_exception_data, 0, >>>> sizeof(memory_exception_data)); >>>>   -   down_read(&p->mm->mmap_sem); >>>> -   vma = find_vma(p->mm, address); >>>> +   down_read(&mm->mmap_sem); >>>> +   vma = find_vma(mm, address); >>>>        memory_exception_data.gpu_id = dev->id; >>>>       memory_exception_data.va = address; >>>> @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, >>>> unsigned int pasid, >>>>           } >>>>       } >>>>   -   up_read(&p->mm->mmap_sem); >>>> +   up_read(&mm->mmap_sem); >>>> +   mmput(mm); >>>>        mutex_lock(&p->event_mutex); >>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> index 7d86ec9..1a483a7 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>> @@ -494,7 +494,12 @@ struct kfd_process { >>>>        */ >>>>       struct hlist_node kfd_processes; >>>>   -   struct mm_struct *mm; >>>> +   /* >>>> +    * Opaque pointer to mm_struct. We don't hold a reference to >>>> +    * it so it should never be dereferenced from here. This is >>>> +    * only used for looking up processes by their mm. >>>> +    */ >>>> +   void *mm; >>>>        struct mutex mutex; >>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> index 3ccb3b5..21d27e5 100644 >>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>> @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct >>>> rcu_head *rcu) >>>>       struct kfd_process *p; >>>>        p = container_of(rcu, struct kfd_process, rcu); >>>> -   WARN_ON(atomic_read(&p->mm->mm_count) <= 0); >>>> +   /* >>>> +    * This cast should be safe here because we grabbed a >>>> +    * reference to the mm in kfd_process_notifier_release >>>> +    */ >>>> +   WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= >>>> 0); >>>>        mmdrop(p->mm); >>>>   >>> >