Re: [PATCH 5/5] drm/amdgpu: allow specifying a syncobj for VM map operations

Christian König <christian.koenig@xxxxxxx> · Mon, 4 Apr 2022 08:43:08 +0200

Hi Bas,

Am 04.04.22 um 03:32 schrieb Bas Nieuwenhuizen:
Hi Christian,

Couple of concerns here:

1) This is missing the option to wait on a syncobj before executing the VA op.

Uff, that was not really planned in any way.

Currently all VM operations are scheduled to run behind all CS 
operations and that is not easily changeable.

In other words we planned that disable the VM->CS implicit sync, but not 
CS->VM.

2) There are no mechanisms to disable implicit sync yet.

Specifying a sync_obj will do that.

3) in radv we typically signal multiple syncobj, so it would be nice
if we could have that capability here too. Technically we can use the
transfer ioctls, but it is kinda annoying.

That shouldn't be much of a problem.

I had https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FBNieuwenhuizen%2Flinux%2Fcommit%2Fd8a1cce0c4c5c87522ae8866faf4f67be7189ef0&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C6d4dec03a52b4ce510b208da15daf2fa%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637846327828400310%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=K6J%2FHb2NAL8NpoTkaqmpjq4xJS9ozpu2UJBZwSA8OIk%3D&amp;reserved=0
+ radv implementation before we ended up in the situation of waits not
being a doable thing.

Well I changed the TLB handling so that waits are doable now :)


For (1) we can emulate in userspace by waiting for all syncobj to
signal first, but I'm concerned that will result in GPU bubbles due to
CPU work.  To test this out I'm starting to hook up more implicit sync
disabling stuff (starting with the submissions themselves, WIP at
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithub.com%2FBNieuwenhuizen%2Flinux%2Ftree%2Fno-implicit-sync&amp;data=04%7C01%7Cchristian.koenig%40amd.com%7C6d4dec03a52b4ce510b208da15daf2fa%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637846327828400310%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000&amp;sdata=taf3fjRYd2OT9GR%2FgtBsCcoIOtOW0pYjdcsGAe%2BnJSw%3D&amp;reserved=0), which
is why you're seeing some random comments on your dma resv usage
series coming your way.

Which is rather welcomed. That patch set certainly needs more eyes on it.

Thanks,
Christian.


- Bas


On Thu, Mar 31, 2022 at 11:47 AM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
To improve synchronization of command submissions with page table updates RADV
wants to manually wait for the updates to be completed without affecting
parallel submissions.

Implement this by allowing to specify a drm_sync_obj handle and a timeline
point for the GEM_VA IOCTL.

Signed-off-by: Christian König <christian.koenig@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 79 ++++++++++++++++++++-----
  include/uapi/drm/amdgpu_drm.h           |  5 +-
  2 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 9cdfee67efeb..bf0092f629f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -33,6 +33,7 @@

  #include <drm/amdgpu_drm.h>
  #include <drm/drm_drv.h>
+#include <drm/drm_syncobj.h>
  #include <drm/drm_gem_ttm_helper.h>

  #include "amdgpu.h"
@@ -598,6 +599,7 @@ int amdgpu_gem_metadata_ioctl(struct drm_device *dev, void *data,
   * @vm: vm to update
   * @bo_va: bo_va to update
   * @operation: map, unmap or clear
+ * @last_update: optional pointer to a dma_fence for the last VM update
   *
   * Update the bo_va directly after setting its address. Errors are not
   * vital here, so they are not reported back to userspace.
@@ -605,20 +607,21 @@ int amdgpu_gem_metadata_ioctl(struct drm_device *dev, void *data,
  static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
                                     struct amdgpu_vm *vm,
                                     struct amdgpu_bo_va *bo_va,
-                                   uint32_t operation)
+                                   uint32_t operation,
+                                   struct dma_fence **last_update)
  {
         int r;

         if (!amdgpu_vm_ready(vm))
                 return;

-       r = amdgpu_vm_clear_freed(adev, vm, NULL);
+       r = amdgpu_vm_clear_freed(adev, vm, last_update);
         if (r)
                 goto error;

         if (operation == AMDGPU_VA_OP_MAP ||
             operation == AMDGPU_VA_OP_REPLACE) {
-               r = amdgpu_vm_bo_update(adev, bo_va, false, NULL);
+               r = amdgpu_vm_bo_update(adev, bo_va, false, last_update);
                 if (r)
                         goto error;
         }
@@ -671,6 +674,9 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
         struct drm_gem_object *gobj;
         struct amdgpu_device *adev = drm_to_adev(dev);
         struct amdgpu_fpriv *fpriv = filp->driver_priv;
+       struct dma_fence *fence = dma_fence_get_stub();
+       struct dma_fence_chain *chain = NULL;
+       struct drm_syncobj *syncobj = NULL;
         struct amdgpu_bo *abo;
         struct amdgpu_bo_va *bo_va;
         struct amdgpu_bo_list_entry vm_pd;
@@ -714,17 +720,9 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                 return -EINVAL;
         }

-       switch (args->operation) {
-       case AMDGPU_VA_OP_MAP:
-       case AMDGPU_VA_OP_UNMAP:
-       case AMDGPU_VA_OP_CLEAR:
-       case AMDGPU_VA_OP_REPLACE:
-               break;
-       default:
-               dev_dbg(dev->dev, "unsupported operation %d\n",
-                       args->operation);
-               return -EINVAL;
-       }
+       /* For debugging delay all VM updates till CS time */
+       if (!amdgpu_vm_debug)
+               args->flags |= AMDGPU_VM_DELAY_UPDATE;

         INIT_LIST_HEAD(&list);
         INIT_LIST_HEAD(&duplicates);
@@ -763,6 +761,30 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                 bo_va = NULL;
         }

+       if (args->syncobj) {
+               syncobj = drm_syncobj_find(filp, args->syncobj);
+               if (!syncobj) {
+                       r = -EINVAL;
+                       goto error_backoff;
+               }
+
+               if (args->timeline_point) {
+                       chain = dma_fence_chain_alloc();
+                       if (!chain) {
+                               r = -ENOMEM;
+                               goto error_put_syncobj;
+                       }
+               }
+
+               /*
+                * Update the VM once before to make sure there are no other
+                * pending updates.
+                */
+               if (!(args->flags & AMDGPU_VM_DELAY_UPDATE))
+                       amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
+                                               args->operation, NULL);
+       }
+
         switch (args->operation) {
         case AMDGPU_VA_OP_MAP:
                 va_flags = amdgpu_gem_va_map_flags(adev, args->flags);
@@ -786,17 +808,42 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
                                              va_flags);
                 break;
         default:
+               dev_dbg(dev->dev, "unsupported operation %d\n",
+                       args->operation);
+               r = -EINVAL;
                 break;
         }
-       if (!r && !(args->flags & AMDGPU_VM_DELAY_UPDATE) && !amdgpu_vm_debug)
+       if (r)
+               goto error_free_chain;
+
+       if (!(args->flags & AMDGPU_VM_DELAY_UPDATE))
                 amdgpu_gem_va_update_vm(adev, &fpriv->vm, bo_va,
-                                       args->operation);
+                                       args->operation, syncobj ?
+                                       &fence : NULL);
+
+       if (syncobj) {
+               if (chain) {
+                       drm_syncobj_add_point(syncobj, chain, fence,
+                                             args->timeline_point);
+                       chain = NULL;
+               } else {
+                       drm_syncobj_replace_fence(syncobj, fence);
+               }
+       }
+
+error_free_chain:
+       dma_fence_chain_free(chain);
+
+error_put_syncobj:
+       if (syncobj)
+               drm_syncobj_put(syncobj);

  error_backoff:
         ttm_eu_backoff_reservation(&ticket, &list);

  error_unref:
         drm_gem_object_put(gobj);
+       dma_fence_put(fence);
         return r;
  }

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 1d65c1fbc4ec..f84b5f2c817c 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -533,7 +533,8 @@ struct drm_amdgpu_gem_op {
  struct drm_amdgpu_gem_va {
         /** GEM object handle */
         __u32 handle;
-       __u32 _pad;
+       /** Optional DRM Syncobj to signal when operation completes */
+       __u32 syncobj;
         /** AMDGPU_VA_OP_* */
         __u32 operation;
         /** AMDGPU_VM_PAGE_* */
@@ -544,6 +545,8 @@ struct drm_amdgpu_gem_va {
         __u64 offset_in_bo;
         /** Specify mapping size. Must be correctly aligned. */
         __u64 map_size;
+       /** Optional Syncobj timeline point to signal */
+       __u64 timeline_point;
  };

  #define AMDGPU_HW_IP_GFX          0
--
2.25.1