On 2018-01-29 08:43 AM, Christian König wrote: > Hi Felix & Harish, > > maybe explain why I found that odd: dma_fence_add_callback() sets the > DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT flag before adding the callback. > > So the flag should always be set when there are callbacks. > Did I miss anything? I don't think we add any callbacks to our eviction fences. Regards,  Felix > > Regards, > Christian. > > Am 29.01.2018 um 00:55 schrieb Felix Kuehling: >> [+Harish, forgot to acknowledge him in the commit description, will fix >> that in v2] >> >> Harish, please see Christian's question below in amd_kfd_fence_signal. >> Did I understand this correctly? >> >> Regards, >>   Felix >> >> On 2018-01-28 06:42 PM, Felix Kuehling wrote: >>> On 2018-01-27 04:16 AM, Christian König wrote: >>>> Am 27.01.2018 um 02:09 schrieb Felix Kuehling: >>> [snip] >>>>> +struct amdgpu_amdkfd_fence *amdgpu_amdkfd_fence_create(u64 context, >>>>> +                              void *mm) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence = NULL; >>>>> + >>>>> +   fence = kzalloc(sizeof(*fence), GFP_KERNEL); >>>>> +   if (fence == NULL) >>>>> +       return NULL; >>>>> + >>>>> +   /* mm_struct mm is used as void pointer to identify the parent >>>>> +    * KFD process. Don't dereference it. Fence and any threads >>>>> using >>>>> +    * mm is guranteed to be released before process termination. >>>>> +    */ >>>>> +   fence->mm = mm; >>>> That won't work. Fences can live much longer than any process who >>>> created them. >>>> >>>> I've already found a fence in a BO still living hours after the >>>> process was killed and the pid long recycled. >>>> >>>> I suggest to make fence->mm a real mm_struct pointer with reference >>>> counting and then set it to NULL and drop the reference in >>>> enable_signaling. >>> I agree. But enable_signaling may be too early to drop the reference. >>> amd_kfd_fence_check_mm could still be called later from >>> amdgpu_ttm_bo_eviction_valuable, as long as the fence hasn't >>> signaled yet. >>> >>> The safe place is problably in amd_kfd_fence_release. >>> >>>>> +   get_task_comm(fence->timeline_name, current); >>>>> +   spin_lock_init(&fence->lock); >>>>> + >>>>> +   dma_fence_init(&fence->base, &amd_kfd_fence_ops, &fence->lock, >>>>> +          context, atomic_inc_return(&fence_seq)); >>>>> + >>>>> +   return fence; >>>>> +} >>>>> + >>>>> +struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct >>>>> dma_fence *f) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence; >>>>> + >>>>> +   if (!f) >>>>> +       return NULL; >>>>> + >>>>> +   fence = container_of(f, struct amdgpu_amdkfd_fence, base); >>>>> +   if (fence && f->ops == &amd_kfd_fence_ops) >>>>> +       return fence; >>>>> + >>>>> +   return NULL; >>>>> +} >>>>> + >>>>> +static const char *amd_kfd_fence_get_driver_name(struct dma_fence >>>>> *f) >>>>> +{ >>>>> +   return "amdgpu_amdkfd_fence"; >>>>> +} >>>>> + >>>>> +static const char *amd_kfd_fence_get_timeline_name(struct >>>>> dma_fence *f) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >>>>> + >>>>> +   return fence->timeline_name; >>>>> +} >>>>> + >>>>> +/** >>>>> + * amd_kfd_fence_enable_signaling - This gets called when TTM wants >>>>> to evict >>>>> + * a KFD BO and schedules a job to move the BO. >>>>> + * If fence is already signaled return true. >>>>> + * If fence is not signaled schedule a evict KFD process work item. >>>>> + */ >>>>> +static bool amd_kfd_fence_enable_signaling(struct dma_fence *f) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >>>>> + >>>>> +   if (!fence) >>>>> +       return false; >>>>> + >>>>> +   if (dma_fence_is_signaled(f)) >>>>> +       return true; >>>>> + >>>>> +   if (!kgd2kfd->schedule_evict_and_restore_process( >>>>> +               (struct mm_struct *)fence->mm, f)) >>>>> +       return true; >>>>> + >>>>> +   return false; >>>>> +} >>>>> + >>>>> +static int amd_kfd_fence_signal(struct dma_fence *f) >>>>> +{ >>>>> +   unsigned long flags; >>>>> +   int ret; >>>>> + >>>>> +   spin_lock_irqsave(f->lock, flags); >>>>> +   /* Set enabled bit so cb will called */ >>>>> +   set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &f->flags); >>>> Mhm, why is that necessary? >>> This only gets called from fence_release below. I think this is to >>> avoid >>> needlessly scheduling an eviction/restore cycle when an eviction fence >>> gets destroyed that hasn't been triggered before, probably during >>> process termination. >>> >>> Harish, do you remember any other reason for this? >>> >>>>> +   ret = dma_fence_signal_locked(f); >>>>> +   spin_unlock_irqrestore(f->lock, flags); >>>>> + >>>>> +   return ret; >>>>> +} >>>>> + >>>>> +/** >>>>> + * amd_kfd_fence_release - callback that fence can be freed >>>>> + * >>>>> + * @fence: fence >>>>> + * >>>>> + * This function is called when the reference count becomes zero. >>>>> + * It just RCU schedules freeing up the fence. >>>>> + */ >>>>> +static void amd_kfd_fence_release(struct dma_fence *f) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >>>>> +   /* Unconditionally signal the fence. The process is getting >>>>> +    * terminated. >>>>> +    */ >>>>> +   if (WARN_ON(!fence)) >>>>> +       return; /* Not an amdgpu_amdkfd_fence */ >>>>> + >>>>> +   amd_kfd_fence_signal(f); >>>>> +   kfree_rcu(f, rcu); >>>>> +} >>>>> + >>>>> +/** >>>>> + * amd_kfd_fence_check_mm - Check if @mm is same as that of the >>>>> fence @f >>>>> + * if same return TRUE else return FALSE. >>>>> + * >>>>> + * @f: [IN] fence >>>>> + * @mm: [IN] mm that needs to be verified >>>>> + */ >>>>> +bool amd_kfd_fence_check_mm(struct dma_fence *f, void *mm) >>>>> +{ >>>>> +   struct amdgpu_amdkfd_fence *fence = to_amdgpu_amdkfd_fence(f); >>>>> + >>>>> +   if (!fence) >>>>> +       return false; >>>>> +   else if (fence->mm == mm) >>>>> +       return true; >>>>> + >>>>> +   return false; >>>>> +} >>>>> + >>>>> +const struct dma_fence_ops amd_kfd_fence_ops = { >>>>> +   .get_driver_name = amd_kfd_fence_get_driver_name, >>>>> +   .get_timeline_name = amd_kfd_fence_get_timeline_name, >>>>> +   .enable_signaling = amd_kfd_fence_enable_signaling, >>>>> +   .signaled = NULL, >>>>> +   .wait = dma_fence_default_wait, >>>>> +   .release = amd_kfd_fence_release, >>>>> +}; >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >>>>> index 65d5a4e..ca00dd2 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h >>>>> @@ -36,8 +36,9 @@ >>>>>   #define AMDGPU_MAX_UVD_ENC_RINGS   2 >>>>>    /* some special values for the owner field */ >>>>> -#define AMDGPU_FENCE_OWNER_UNDEFINED   ((void*)0ul) >>>>> -#define AMDGPU_FENCE_OWNER_VM       ((void*)1ul) >>>>> +#define AMDGPU_FENCE_OWNER_UNDEFINED   ((void *)0ul) >>>>> +#define AMDGPU_FENCE_OWNER_VM       ((void *)1ul) >>>>> +#define AMDGPU_FENCE_OWNER_KFD       ((void *)2ul) >>>>>    #define AMDGPU_FENCE_FLAG_64BIT        (1 << 0) >>>>>   #define AMDGPU_FENCE_FLAG_INT          (1 << 1) >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >>>>> index df65c66..0cb31d9 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c >>>>> @@ -31,6 +31,7 @@ >>>>>   #include <drm/drmP.h> >>>>>   #include "amdgpu.h" >>>>>   #include "amdgpu_trace.h" >>>>> +#include "amdgpu_amdkfd.h" >>>>>    struct amdgpu_sync_entry { >>>>>       struct hlist_node   node; >>>>> @@ -86,10 +87,18 @@ static bool amdgpu_sync_same_dev(struct >>>>> amdgpu_device *adev, >>>>>   static void *amdgpu_sync_get_owner(struct dma_fence *f) >>>>>   { >>>>>       struct drm_sched_fence *s_fence = to_drm_sched_fence(f); >>>>> +   struct amdgpu_amdkfd_fence *kfd_fence; >>>>> + >>>>> +   if (!f) >>>>> +       return AMDGPU_FENCE_OWNER_UNDEFINED; >>>> When you add the extra NULL check here then please move the >>>> to_drm_sched_fence() after it as well. >>> Yeah, makes sense. >>> >>> Regards, >>>   Felix >>> >>>> Christian. >>>> >>>>>        if (s_fence) >>>>>           return s_fence->owner; >>>>>   +   kfd_fence = to_amdgpu_amdkfd_fence(f); >>>>> +   if (kfd_fence) >>>>> +       return AMDGPU_FENCE_OWNER_KFD; >>>>> + >>>>>       return AMDGPU_FENCE_OWNER_UNDEFINED; >>>>>   } >>>>>   @@ -204,11 +213,18 @@ int amdgpu_sync_resv(struct amdgpu_device >>>>> *adev, >>>>>       for (i = 0; i < flist->shared_count; ++i) { >>>>>           f = rcu_dereference_protected(flist->shared[i], >>>>>                             reservation_object_held(resv)); >>>>> +       /* We only want to trigger KFD eviction fences on >>>>> +        * evict or move jobs. Skip KFD fences otherwise. >>>>> +        */ >>>>> +       fence_owner = amdgpu_sync_get_owner(f); >>>>> +       if (fence_owner == AMDGPU_FENCE_OWNER_KFD && >>>>> +           owner != AMDGPU_FENCE_OWNER_UNDEFINED) >>>>> +           continue; >>>>> + >>>>>           if (amdgpu_sync_same_dev(adev, f)) { >>>>>               /* VM updates are only interesting >>>>>                * for other VM updates and moves. >>>>>                */ >>>>> -           fence_owner = amdgpu_sync_get_owner(f); >>>>>               if ((owner != AMDGPU_FENCE_OWNER_UNDEFINED) && >>>>>                   (fence_owner != AMDGPU_FENCE_OWNER_UNDEFINED) && >>>>>                   ((owner == AMDGPU_FENCE_OWNER_VM) != >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>>>> index e4bb435..c3f33d3 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>>>> @@ -46,6 +46,7 @@ >>>>>   #include "amdgpu.h" >>>>>   #include "amdgpu_object.h" >>>>>   #include "amdgpu_trace.h" >>>>> +#include "amdgpu_amdkfd.h" >>>>>   #include "bif/bif_4_1_d.h" >>>>>    #define DRM_FILE_PAGE_OFFSET (0x100000000ULL >> PAGE_SHIFT) >>>>> @@ -1170,6 +1171,23 @@ static bool >>>>> amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, >>>>>   { >>>>>       unsigned long num_pages = bo->mem.num_pages; >>>>>       struct drm_mm_node *node = bo->mem.mm_node; >>>>> +   struct reservation_object_list *flist; >>>>> +   struct dma_fence *f; >>>>> +   int i; >>>>> + >>>>> +   /* If bo is a KFD BO, check if the bo belongs to the current >>>>> process. >>>>> +    * If true, then return false as any KFD process needs all its >>>>> BOs to >>>>> +    * be resident to run successfully >>>>> +    */ >>>>> +   flist = reservation_object_get_list(bo->resv); >>>>> +   if (flist) { >>>>> +       for (i = 0; i < flist->shared_count; ++i) { >>>>> +           f = rcu_dereference_protected(flist->shared[i], >>>>> +               reservation_object_held(bo->resv)); >>>>> +           if (amd_kfd_fence_check_mm(f, current->mm)) >>>>> +               return false; >>>>> +       } >>>>> +   } >>>>>        switch (bo->mem.mem_type) { >>>>>       case TTM_PL_TT: >>>>> diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >>>>> b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >>>>> index 94eab54..9e35249 100644 >>>>> --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >>>>> +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h >>>>> @@ -30,6 +30,7 @@ >>>>>    #include <linux/types.h> >>>>>   #include <linux/bitmap.h> >>>>> +#include <linux/dma-fence.h> >>>>>    struct pci_dev; >>>>>   @@ -286,6 +287,9 @@ struct kfd2kgd_calls { >>>>>    * >>>>>    * @resume: Notifies amdkfd about a resume action done to a kgd >>>>> device >>>>>    * >>>>> + * @schedule_evict_and_restore_process: Schedules work queue that >>>>> will prepare >>>>> + * for safe eviction of KFD BOs that belong to the specified >>>>> process. >>>>> + * >>>>>    * This structure contains function callback pointers so the kgd >>>>> driver >>>>>    * will notify to the amdkfd about certain status changes. >>>>>    * >>>>> @@ -300,6 +304,8 @@ struct kgd2kfd_calls { >>>>>       void (*interrupt)(struct kfd_dev *kfd, const void >>>>> *ih_ring_entry); >>>>>       void (*suspend)(struct kfd_dev *kfd); >>>>>       int (*resume)(struct kfd_dev *kfd); >>>>> +   int (*schedule_evict_and_restore_process)(struct mm_struct *mm, >>>>> +           struct dma_fence *fence); >>>>>   }; >>>>>    int kgd2kfd_init(unsigned interface_version, >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx at lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/amd-gfx >