On 2017-10-27 03:41 AM, Christian König wrote: > Am 27.10.2017 um 09:22 schrieb Christian König: >> Am 26.10.2017 um 20:54 schrieb Felix Kuehling: >>> On 2017-10-26 02:11 PM, Christian König wrote: >>>> But now reading the patch there is something else which I stumbled >>>> over: >>>>> - WARN_ON(atomic_read(&p->mm->mm_count) <= 0); >>>>> +   /* >>>>> +    * This cast should be safe here because we grabbed a >>>>> +    * reference to the mm in kfd_process_notifier_release >>>>> +    */ >>>>> +   WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) >>>>> <= 0); >>>>>         mmdrop(p->mm); >>>> Well that isn't good coding style. You shouldn't obfuscate what >>>> pointer it is by changing it to "void*", but rather set it to NULL as >>>> soon as you know that it is stale. >>>> >>>> Additional to that it is certainly not job of the driver to warn on a >>>> run over mm_count. >>> Yeah. We don't have this in our current staging branch. The whole >>> process teardown has changed quite a bit. I just fixed this up to make >>> it work with current upstream. >>> >>> If you prefer, I could just remove the WARN_ON. >> >> That sounds like a good idea to me, yes. > > Wait a second, now that I though once more about it what do you mean > with this comment: >> +   /* >> +    * Opaque pointer to mm_struct. We don't hold a reference to >> +    * it so it should never be dereferenced from here. This is >> +    * only used for looking up processes by their mm. >> +    */ > E.g. what means looking up the processes by their mm? > > Do you use a hashtable or something like this and then check if you > got the correct mm structure by comparing the pointers? > > If that is the case then you should certainly set p->mm to NULL after > mmdrop(p->mm). > > Otherwise it can happen that a new mm structure is allocated at the > same location as the old one and you run into problems because the > kernel driver accidentally uses the wrong kfd_process structure. The processes are in an SRCU-protected hashtable that's read by find_process_by_mm. Before this function (kfd_process_destroy_delayed) runs, the process is already removed from the hash table and synchronize_srcu is used to ensure there are no more readers active. That means the process being destroyed can't be found any more, so there is no need to reset p->mm to NULL at that point. Regards,  Felix > > Regards, > Christian. > >> >> Regards, >> Christian. >> >>> >>> Regards, >>>   Felix >>> >>>> Regards, >>>> Christian. >>>> >>>>> Regards, >>>>>    Felix >>>>> >>>>>> Regards, >>>>>> Christian. >>>>>> >>>>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com> >>>>>>> --- >>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_events.c | 19 >>>>>>> +++++++++++++++---- >>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h   | 7 ++++++- >>>>>>>    drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 +++++- >>>>>>>    3 files changed, 26 insertions(+), 6 deletions(-) >>>>>>> >>>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>>>>> index 944abfa..61ce547 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c >>>>>>> @@ -24,8 +24,8 @@ >>>>>>>    #include <linux/slab.h> >>>>>>>    #include <linux/types.h> >>>>>>>    #include <linux/sched/signal.h> >>>>>>> +#include <linux/sched/mm.h> >>>>>>>    #include <linux/uaccess.h> >>>>>>> -#include <linux/mm.h> >>>>>>>    #include <linux/mman.h> >>>>>>>    #include <linux/memory.h> >>>>>>>    #include "kfd_priv.h" >>>>>>> @@ -904,14 +904,24 @@ void kfd_signal_iommu_event(struct kfd_dev >>>>>>> *dev, unsigned int pasid, >>>>>>>         * running so the lookup function returns a locked process. >>>>>>>         */ >>>>>>>        struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); >>>>>>> +   struct mm_struct *mm; >>>>>>>         if (!p) >>>>>>>            return; /* Presumably process exited. */ >>>>>>>    +   /* Take a safe reference to the mm_struct, which may >>>>>>> otherwise >>>>>>> +    * disappear even while the kfd_process is still referenced. >>>>>>> +    */ >>>>>>> +   mm = get_task_mm(p->lead_thread); >>>>>>> +   if (!mm) { >>>>>>> +       mutex_unlock(&p->mutex); >>>>>>> +       return; /* Process is exiting */ >>>>>>> +   } >>>>>>> + >>>>>>>        memset(&memory_exception_data, 0, >>>>>>> sizeof(memory_exception_data)); >>>>>>>    -   down_read(&p->mm->mmap_sem); >>>>>>> -   vma = find_vma(p->mm, address); >>>>>>> +   down_read(&mm->mmap_sem); >>>>>>> +   vma = find_vma(mm, address); >>>>>>>         memory_exception_data.gpu_id = dev->id; >>>>>>>        memory_exception_data.va = address; >>>>>>> @@ -937,7 +947,8 @@ void kfd_signal_iommu_event(struct kfd_dev >>>>>>> *dev, >>>>>>> unsigned int pasid, >>>>>>>            } >>>>>>>        } >>>>>>>    -   up_read(&p->mm->mmap_sem); >>>>>>> +   up_read(&mm->mmap_sem); >>>>>>> +   mmput(mm); >>>>>>>         mutex_lock(&p->event_mutex); >>>>>>>    diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>>>> index 7d86ec9..1a483a7 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >>>>>>> @@ -494,7 +494,12 @@ struct kfd_process { >>>>>>>         */ >>>>>>>        struct hlist_node kfd_processes; >>>>>>>    -   struct mm_struct *mm; >>>>>>> +   /* >>>>>>> +    * Opaque pointer to mm_struct. We don't hold a reference to >>>>>>> +    * it so it should never be dereferenced from here. This is >>>>>>> +    * only used for looking up processes by their mm. >>>>>>> +    */ >>>>>>> +   void *mm; >>>>>>>         struct mutex mutex; >>>>>>>    diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>>>> index 3ccb3b5..21d27e5 100644 >>>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >>>>>>> @@ -200,7 +200,11 @@ static void kfd_process_destroy_delayed(struct >>>>>>> rcu_head *rcu) >>>>>>>        struct kfd_process *p; >>>>>>>         p = container_of(rcu, struct kfd_process, rcu); >>>>>>> -   WARN_ON(atomic_read(&p->mm->mm_count) <= 0); >>>>>>> +   /* >>>>>>> +    * This cast should be safe here because we grabbed a >>>>>>> +    * reference to the mm in kfd_process_notifier_release >>>>>>> +    */ >>>>>>> +   WARN_ON(atomic_read(&((struct mm_struct *)p->mm)->mm_count) <= >>>>>>> 0); >>>>>>>         mmdrop(p->mm); >>> _______________________________________________ >>> amd-gfx mailing list >>> amd-gfx at lists.freedesktop.org >>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >> >> >