Am 11.07.2018 um 22:57 schrieb Andrey Grodzovsky: > This change is to support MESA performance optimization. > Modify CS IOCTL to allow its input as command buffer and an array of > buffer handles to create a temporary bo list and then destroy it > when IOCTL completes. > This saves on calling for BO_LIST create and destroy IOCTLs in MESA > and by this improves performance. > > v2: Avoid inserting the temp list into idr struct. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 11 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 86 ++++++++++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 51 +++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +- > include/uapi/drm/amdgpu_drm.h | 1 + > 5 files changed, 114 insertions(+), 38 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 8eaba0f..9b472b2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -732,6 +732,17 @@ void amdgpu_bo_list_get_list(struct amdgpu_bo_list *list, > struct list_head *validated); > void amdgpu_bo_list_put(struct amdgpu_bo_list *list); > void amdgpu_bo_list_free(struct amdgpu_bo_list *list); > +int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, > + struct drm_amdgpu_bo_list_entry **info_param); > + > +int amdgpu_bo_list_create(struct amdgpu_device *adev, > + struct drm_file *filp, > + struct drm_amdgpu_bo_list_entry *info, > + unsigned num_entries, > + int *id, > + struct amdgpu_bo_list **list); > + > +void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id); > > /* > * GFX stuff > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c > index 92be7f6..14c7c59 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c > @@ -55,11 +55,12 @@ static void 
amdgpu_bo_list_release_rcu(struct kref *ref) > kfree_rcu(list, rhead); > } > > -static int amdgpu_bo_list_create(struct amdgpu_device *adev, > +int amdgpu_bo_list_create(struct amdgpu_device *adev, > struct drm_file *filp, > struct drm_amdgpu_bo_list_entry *info, > unsigned num_entries, > - int *id) > + int *id, > + struct amdgpu_bo_list **list_out) > { > int r; > struct amdgpu_fpriv *fpriv = filp->driver_priv; > @@ -78,20 +79,25 @@ static int amdgpu_bo_list_create(struct amdgpu_device *adev, > return r; > } > > + if (id) { > /* idr alloc should be called only after initialization of bo list. */ > - mutex_lock(&fpriv->bo_list_lock); > - r = idr_alloc(&fpriv->bo_list_handles, list, 1, 0, GFP_KERNEL); > - mutex_unlock(&fpriv->bo_list_lock); > - if (r < 0) { > - amdgpu_bo_list_free(list); > - return r; > + mutex_lock(&fpriv->bo_list_lock); > + r = idr_alloc(&fpriv->bo_list_handles, list, 1, 0, GFP_KERNEL); > + mutex_unlock(&fpriv->bo_list_lock); > + if (r < 0) { > + amdgpu_bo_list_free(list); > + return r; > + } > + *id = r; Can we move that into the caller or a separate function? Would be cleaner to separate the functionality of creating a bo_list structure from inserting it into the IDR. 
> } > - *id = r; > + > + if (list_out) > + *list_out = list; > > return 0; > } > > -static void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id) > +void amdgpu_bo_list_destroy(struct amdgpu_fpriv *fpriv, int id) > { > struct amdgpu_bo_list *list; > > @@ -263,53 +269,68 @@ void amdgpu_bo_list_free(struct amdgpu_bo_list *list) > kfree(list); > } > > -int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, > - struct drm_file *filp) > +int amdgpu_bo_create_list_entry_array(struct drm_amdgpu_bo_list_in *in, > + struct drm_amdgpu_bo_list_entry **info_param) > { > - const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry); > - > - struct amdgpu_device *adev = dev->dev_private; > - struct amdgpu_fpriv *fpriv = filp->driver_priv; > - union drm_amdgpu_bo_list *args = data; > - uint32_t handle = args->in.list_handle; > - const void __user *uptr = u64_to_user_ptr(args->in.bo_info_ptr); > - > struct drm_amdgpu_bo_list_entry *info; > - struct amdgpu_bo_list *list; > - > int r; > + const void __user *uptr = u64_to_user_ptr(in->bo_info_ptr); > + const uint32_t info_size = sizeof(struct drm_amdgpu_bo_list_entry); Reverse xmas tree order please, e.g. at least "int r;" should be last. I personally also put consts at the beginning of the declarations. Same problem at a few other occasions. > > - info = kvmalloc_array(args->in.bo_number, > + info = kvmalloc_array(in->bo_number, > sizeof(struct drm_amdgpu_bo_list_entry), GFP_KERNEL); We could use info_size here, would make the line a bit shorter. 
> if (!info) > return -ENOMEM; > > /* copy the handle array from userspace to a kernel buffer */ > r = -EFAULT; > - if (likely(info_size == args->in.bo_info_size)) { > - unsigned long bytes = args->in.bo_number * > - args->in.bo_info_size; > + if (likely(info_size == in->bo_info_size)) { > + unsigned long bytes = in->bo_number * > + in->bo_info_size; > > if (copy_from_user(info, uptr, bytes)) > goto error_free; > > } else { > - unsigned long bytes = min(args->in.bo_info_size, info_size); > + unsigned long bytes = min(in->bo_info_size, info_size); > unsigned i; > > - memset(info, 0, args->in.bo_number * info_size); > - for (i = 0; i < args->in.bo_number; ++i) { > + memset(info, 0, in->bo_number * info_size); > + for (i = 0; i < in->bo_number; ++i) { > if (copy_from_user(&info[i], uptr, bytes)) > goto error_free; > > - uptr += args->in.bo_info_size; > + uptr += in->bo_info_size; > } > } > > + *info_param = info; > + return 0; > + > +error_free: > + kvfree(info); > + return r; > +} > + > +int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp) > +{ > + struct amdgpu_device *adev = dev->dev_private; > + struct amdgpu_fpriv *fpriv = filp->driver_priv; > + union drm_amdgpu_bo_list *args = data; > + uint32_t handle = args->in.list_handle; > + struct drm_amdgpu_bo_list_entry *info = NULL; > + struct amdgpu_bo_list *list; > + int r; > + > + r = amdgpu_bo_create_list_entry_array(&args->in, &info); > + if (r) > + goto error_free; > + > switch (args->in.operation) { > case AMDGPU_BO_LIST_OP_CREATE: > r = amdgpu_bo_list_create(adev, filp, info, args->in.bo_number, > - &handle); > + &handle, NULL); > if (r) > goto error_free; > break; > @@ -345,6 +366,7 @@ int amdgpu_bo_list_ioctl(struct drm_device *dev, void *data, > return 0; > > error_free: > - kvfree(info); > + if (info) > + kvfree(info); > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index 9881a1e..30026b8 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -66,11 +66,34 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, > return 0; > } > > -static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) > +static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p, > + struct drm_amdgpu_bo_list_in *data) > +{ > + int r; > + struct drm_amdgpu_bo_list_entry *info = NULL; > + > + r = amdgpu_bo_create_list_entry_array(data, &info); > + if (r) > + return r; > + > + r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number, NULL, &p->bo_list); > + if (r) > + goto error_free; > + > + kvfree(info); > + return 0; > + > +error_free: > + if (info) > + kvfree(info); > + > + return r; > +} > + > +static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs *cs) > { > struct amdgpu_fpriv *fpriv = p->filp->driver_priv; > struct amdgpu_vm *vm = &fpriv->vm; > - union drm_amdgpu_cs *cs = data; > uint64_t *chunk_array_user; > uint64_t *chunk_array; > unsigned size, num_ibs = 0; > @@ -164,6 +187,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data) > > break; > > + case AMDGPU_CHUNK_ID_BO_HANDLES: > + size = sizeof(struct drm_amdgpu_bo_list_in); > + if (p->chunks[i].length_dw * sizeof(uint32_t) < size) { > + ret = -EINVAL; > + goto free_partial_kdata; > + } > + > + ret = amdgpu_cs_bo_handles_chunk(p, p->chunks[i].kdata); > + if (ret) > + goto free_partial_kdata; > + > + break; > + > case AMDGPU_CHUNK_ID_DEPENDENCIES: > case AMDGPU_CHUNK_ID_SYNCOBJ_IN: > case AMDGPU_CHUNK_ID_SYNCOBJ_OUT: > @@ -534,7 +570,12 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, > > INIT_LIST_HEAD(&p->validated); > > - p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); > + /* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */ > + if (!p->bo_list) > + p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle); > + else > + 
mutex_lock(&p->bo_list->lock); > + > if (p->bo_list) { > amdgpu_bo_list_get_list(p->bo_list, &p->validated); > if (p->bo_list->first_userptr != p->bo_list->num_entries) > @@ -747,7 +788,7 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > * used by parsing context. > **/ > static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, > - bool backoff) > + bool backoff, int id) > { > unsigned i; > > @@ -1277,7 +1318,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) > r = amdgpu_cs_submit(&parser, cs); > > out: > - amdgpu_cs_parser_fini(&parser, r, reserved_buffers); > + amdgpu_cs_parser_fini(&parser, r, reserved_buffers, cs->in.bo_list_handle); As David noted as well, those changes are unnecessary now. Regards, Christian. > return r; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 06aede1..529500c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -69,9 +69,10 @@ > * - 3.24.0 - Add high priority compute support for gfx9 > * - 3.25.0 - Add support for sensor query info (stable pstate sclk/mclk). > * - 3.26.0 - GFX9: Process AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE. > + * - 3.27.0 - Add new chunk to AMDGPU_CS to enable BO_LIST creation. > */ > #define KMS_DRIVER_MAJOR 3 > -#define KMS_DRIVER_MINOR 26 > +#define KMS_DRIVER_MINOR 27 > #define KMS_DRIVER_PATCHLEVEL 0 > > int amdgpu_vram_limit = 0; > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index 09741ba..94444ee 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -519,6 +519,7 @@ struct drm_amdgpu_gem_va { > #define AMDGPU_CHUNK_ID_DEPENDENCIES 0x03 > #define AMDGPU_CHUNK_ID_SYNCOBJ_IN 0x04 > #define AMDGPU_CHUNK_ID_SYNCOBJ_OUT 0x05 > +#define AMDGPU_CHUNK_ID_BO_HANDLES 0x06 > > struct drm_amdgpu_cs_chunk { > __u32 chunk_id;