On 07/11/2018 11:05 AM, Christian König wrote: > Am 11.07.2018 um 16:51 schrieb Andrey Grodzovsky: >> This change is to support MESA performace optimization. >> Modify CS IOCTL to allow its input as command buffer and an array of >> buffer handles to create a temporay bo list and then destroy it >> when IOCTL completes. >> This saves on calling for BO_LIST create and destry IOCTLs in MESA >> and by this improves performance. >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> >> --- >>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        | 12 ++++++ >>  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 62 >> ++++++++++++++++++----------- >>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 52 >> +++++++++++++++++++++--- >>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    | 3 +- >>  include/uapi/drm/amdgpu_drm.h              | 1 + >>  5 files changed, 101 insertions(+), 29 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 8eaba0f..e082eba 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -732,6 +732,16 @@ void amdgpu_bo_list_get_list(struct >> amdgpu_bo_list *list, >>                   struct list_head *validated); >>  void amdgpu_bo_list_put(struct amdgpu_bo_list *list); >>  void amdgpu_bo_list_free(struct amdgpu_bo_list *list); >> +int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, >> +                     struct drm_amdgpu_bo_list_entry **info_param); >> + >> +int amdgpu_bo_list_create(struct amdgpu_device *adev, >> +                struct drm_file *filp, >> +                struct drm_amdgpu_bo_list_entry *info, >> +                unsigned num_entries, >> +                int *id); >> + >> +void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id); >>   /* >>   * GFX stuff >> @@ -1048,6 +1058,8 @@ struct amdgpu_cs_parser { >>       unsigned num_post_dep_syncobjs; >>      struct drm_syncobj **post_dep_syncobjs; >> + >> +   bool destroy_bo_list; > > I think you don't need this. IIRC the bo_list structure was reference > counted. > > So all you need to do is to make sure that the temporary bo_list you > create has a reference count of 1 and so is destroyed when the CS > IOCTL calls amdgpu_bo_list_put() on it. > > That would simplify the patch quite a bit. amdgpu_bo_list_destroy is essential because it removes the list from idr struct bo_list_handles, amdgpu_bo_list_put doesn't do it. Regarding the refcount, it's 2 because it's 1 on list create in amdgpu_cs_bo_handles_chunk->amdgpu_bo_list_create and then it's incremented to 2 in amdgpu_cs_parser_bos->amdgpu_bo_list_get. So by calling amdgpu_bo_list_put and then amdgpu_bo_list_destroy the count properly drops down to 0. Andrey > > Apart from that looks really good to me, especially that you only need > a new chunk ID for the UAPI is quite nice. > > Thanks, > Christian. > >>  }; >>   #define AMDGPU_PREAMBLE_IB_PRESENT         (1 << 0) /* bit set >> means command submit involves a preamble IB */ >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c >> index 92be7f6..9acdacc 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c >> @@ -55,7 +55,7 @@ static void amdgpu_bo_list_release_rcu(struct kref >> *ref) >>      kfree_rcu(list, rhead); >>  } >>  -static int amdgpu_bo_list_create(struct amdgpu_device *adev, >> +int amdgpu_bo_list_create(struct amdgpu_device *adev, >>                   struct drm_file *filp, >>                   struct drm_amdgpu_bo_list_entry *info, >>                   unsigned num_entries, >> @@ -91,7 +91,7 @@ static int amdgpu_bo_list_create(struct >> amdgpu_device *adev, >>      return 0; >>  } >>  -static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int >> id) >> +void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id) >>  { >>      struct amdgpu_bo_list *list; >>  @@ -263,49 +263,64 @@ void amdgpu_bo_list_free(struct >> amdgpu_bo_list *list) >>      kfree(list); >>  } >>  -int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, >> -               struct drm_file *filp) >> +int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, >> +                     struct drm_amdgpu_bo_list_entry **info_param) >>  { >> -   const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry); >> - >> -   struct amdgpu_device *adev = dev->dev_private; >> -   struct amdgpu_fpriv *fpriv = filp->driver_priv; >> -   union drm_amdgpu_bo_list *args = data; >> -   uint32_t handle = args->in.list_handle; >> -   const void __user *uptr = u64_to_user_ptr(args->in.bo_info_ptr); >> - >>      struct drm_amdgpu_bo_list_entry *info; >> -   struct amdgpu_bo_list *list; >> - >>      int r; >> +   const void __user *uptr = u64_to_user_ptr(in->bo_info_ptr); >> +   const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry); >>  -   info = kvmalloc_array(args->in.bo_number, >> +   info = kvmalloc_array(in->bo_number, >>                   sizeof(struct drm_amdgpu_bo_list_entry), GFP_KERNEL); >>      if (!info) >>          return -ENOMEM; >>       /* copy the handle array from userspace to a kernel buffer */ >>      r = -EFAULT; >> -   if (likely(info_size == args->in.bo_info_size)) { >> -       unsigned long bytes = args->in.bo_number * >> -           args->in.bo_info_size; >> +   if (likely(info_size == in->bo_info_size)) { >> +       unsigned long bytes = in->bo_number * >> +           in->bo_info_size; >>           if (copy_from_user(info, uptr, bytes)) >>              goto error_free; >>       } else { >> -       unsigned long bytes = min(args->in.bo_info_size, info_size); >> +       unsigned long bytes = min(in->bo_info_size, info_size); >>          unsigned i; >>  -       memset(info, 0, args->in.bo_number * info_size); >> -       for (i = 0; i < args->in.bo_number; ++i) { >> +       memset(info, 0, in->bo_number * info_size); >> +       for (i = 0; i < in->bo_number; ++i) { >>              if (copy_from_user(&info[i], uptr, bytes)) >>                  goto error_free; >>  -           uptr += args->in.bo_info_size; >> +           uptr += in->bo_info_size; >>          } >>      } >>  +   *info_param = info; >> +   return 0; >> + >> +error_free: >> +   kvfree(info); >> +   return r; >> +} >> + >> +int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, >> +               struct drm_file *filp) >> +{ >> +   struct amdgpu_device *adev = dev->dev_private; >> +   struct amdgpu_fpriv *fpriv = filp->driver_priv; >> +   union drm_amdgpu_bo_list *args = data; >> +   uint32_t handle = args->in.list_handle; >> +   struct drm_amdgpu_bo_list_entry *info = NULL; >> +   struct amdgpu_bo_list *list; >> +   int r; >> + >> +   r = amdgpu_bo_create_list_entry_array(&args->in, &info); >> +   if (r) >> +       goto error_free; >> + >>      switch (args->in.operation) { >>      case AMDGPU_BO_LIST_OP_CREATE: >>          r = amdgpu_bo_list_create(adev, filp, info, >> args->in.bo_number, >> @@ -345,6 +360,7 @@ int amdgpu_bo_list_ioctl(struct drm_device *dev, >> void *data, >>      return 0; >>   error_free: >> -   kvfree(info); >> +   if (info) >> +       kvfree(info); >>      return r; >>  } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index 9881a1e..1b17f6b 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -66,11 +66,35 @@ static int amdgpu_cs_user_fence_chunk(struct >> amdgpu_cs_parser *p, >>      return 0; >>  } >>  -static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void >> *data) >> +static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p, >> +                     struct drm_amdgpu_bo_list_in *data, >> +                     int *id) >> +{ >> +   int r; >> +   struct drm_amdgpu_bo_list_entry *info = NULL; >> + >> +   r = amdgpu_bo_create_list_entry_array(data, &info); >> +   if (r) >> +       return r; >> + >> +   r = amdgpu_bo_list_create(p->adev, p->filp, info, >> data->bo_number, id); >> +   if (r) >> +       goto error_free; >> + >> +   kvfree(info); >> +   return 0; >> + >> +error_free: >> +   if (info) >> +       kvfree(info); >> + >> +   return r; >> +} >> + >> +static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union >> drm_amdgpu_cs *cs) >>  { >>      struct amdgpu_fpriv *fpriv = p->filp->driver_priv; >>      struct amdgpu_vm *vm = &fpriv->vm; >> -   union drm_amdgpu_cs *cs = data; >>      uint64_t *chunk_array_user; >>      uint64_t *chunk_array; >>      unsigned size, num_ibs = 0; >> @@ -164,6 +188,20 @@ static int amdgpu_cs_parser_init(struct >> amdgpu_cs_parser *p, void *data) >>               break; >>  +       case AMDGPU_CHUNK_ID_BO_HANDLES: >> +           size = sizeof(struct drm_amdgpu_bo_list_in); >> +           if (p->chunks[i].length_dw * sizeof(uint32_t) < size) { >> +               ret = -EINVAL; >> +               goto free_partial_kdata; >> +           } >> + >> +           ret = amdgpu_cs_bo_handles_chunk(p, p->chunks[i].kdata, >> &cs->in.bo_list_handle); >> +           p->destroy_bo_list = true; >> +           if (ret) >> +               goto free_partial_kdata; >> + >> +           break; >> + >>          case AMDGPU_CHUNK_ID_DEPENDENCIES: >>          case AMDGPU_CHUNK_ID_SYNCOBJ_IN: >>          case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: >> @@ -747,7 +785,7 @@ static int amdgpu_cs_sync_rings(struct >> amdgpu_cs_parser *p) >>   * used by parsing context. >>   **/ >>  static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, >> int error, >> -                 bool backoff) >> +                 bool backoff, int id) >>  { >>      unsigned i; >>  @@ -765,9 +803,13 @@ static void amdgpu_cs_parser_fini(struct >> amdgpu_cs_parser *parser, int error, >>          mutex_unlock(&parser->ctx->lock); >>          amdgpu_ctx_put(parser->ctx); >>      } >> -   if (parser->bo_list) >> +   if (parser->bo_list) { >>          amdgpu_bo_list_put(parser->bo_list); >>  +       if (parser->destroy_bo_list) >> + amdgpu_bo_list_destroy(parser->filp->driver_priv, id); >> +   } >> + >>      for (i = 0; i < parser->nchunks; i++) >>          kvfree(parser->chunks[i].kdata); >>      kfree(parser->chunks); >> @@ -1277,7 +1319,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, >> void *data, struct drm_file *filp) >>      r = amdgpu_cs_submit(&parser, cs); >>   out: >> -   amdgpu_cs_parser_fini(&parser, r, reserved_buffers); >> +   amdgpu_cs_parser_fini(&parser, r, reserved_buffers, >> cs->in.bo_list_handle); >>      return r; >>  } >>  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> index 06aede1..529500c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c >> @@ -69,9 +69,10 @@ >>   * - 3.24.0 - Add high priority compute support for gfx9 >>   * - 3.25.0 - Add support for sensor query info (stable pstate >> sclk/mclk). >>   * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. >> + * - 3.27.0 - Add new chunk to to AMDGPU_CS to enable BO_LIST creation. >>   */ >>  #define KMS_DRIVER_MAJOR   3 >> -#define KMS_DRIVER_MINOR   26 >> +#define KMS_DRIVER_MINOR   27 >>  #define KMS_DRIVER_PATCHLEVEL   0 >>   int amdgpu_vram_limit = 0; >> diff --git a/include/uapi/drm/amdgpu_drm.h >> b/include/uapi/drm/amdgpu_drm.h >> index 09741ba..94444ee 100644 >> --- a/include/uapi/drm/amdgpu_drm.h >> +++ b/include/uapi/drm/amdgpu_drm.h >> @@ -519,6 +519,7 @@ struct drm_amdgpu_gem_va { >>  #define AMDGPU_CHUNK_ID_DEPENDENCIES   0x03 >>  #define AMDGPU_CHUNK_ID_SYNCOBJ_IN     0x04 >>  #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT    0x05 >> +#define AMDGPU_CHUNK_ID_BO_HANDLES     0x06 >>   struct drm_amdgpu_cs_chunk { >>      __u32       chunk_id; >