Re: [PATCH 1/8] drm/amdgpu: UAPI for user queue management

Alex Deucher <alexdeucher@xxxxxxxxx> · Tue, 7 Feb 2023 09:20:02 -0500

On Tue, Feb 7, 2023 at 9:19 AM Christian König <christian.koenig@xxxxxxx> wrote:
>
> Am 07.02.23 um 15:17 schrieb Alex Deucher:
> > On Tue, Feb 7, 2023 at 9:11 AM Christian König
> > <ckoenig.leichtzumerken@xxxxxxxxx> wrote:
> >> Am 07.02.23 um 15:07 schrieb Alex Deucher:
> >>> On Tue, Feb 7, 2023 at 2:38 AM Shashank Sharma <shashank.sharma@xxxxxxx> wrote:
> >>>> On 07/02/2023 08:03, Christian König wrote:
> >>>>> Am 06.02.23 um 22:03 schrieb Alex Deucher:
> >>>>>> On Mon, Feb 6, 2023 at 12:01 PM Christian König
> >>>>>> <christian.koenig@xxxxxxx> wrote:
> >>>>>>> Am 06.02.23 um 17:56 schrieb Alex Deucher:
> >>>>>>>> On Fri, Feb 3, 2023 at 5:26 PM Shashank Sharma
> >>>>>>>> <shashank.sharma@xxxxxxx> wrote:
> >>>>>>>>> Hey Alex,
> >>>>>>>>>
> >>>>>>>>> On 03/02/2023 23:07, Alex Deucher wrote:
> >>>>>>>>>> On Fri, Feb 3, 2023 at 4:54 PM Shashank Sharma
> >>>>>>>>>> <shashank.sharma@xxxxxxx> wrote:
> >>>>>>>>>>> From: Alex Deucher <alexander.deucher@xxxxxxx>
> >>>>>>>>>>>
> >>>>>>>>>>> This patch introduces new UAPI/IOCTL for usermode graphics
> >>>>>>>>>>> queue. The userspace app will fill this structure and request
> >>>>>>>>>>> the graphics driver to add a graphics work queue for it. The
> >>>>>>>>>>> output of this UAPI is a queue id.
> >>>>>>>>>>>
> >>>>>>>>>>> This UAPI maps the queue into GPU, so the graphics app can start
> >>>>>>>>>>> submitting work to the queue as soon as the call returns.
> >>>>>>>>>>>
> >>>>>>>>>>> Cc: Alex Deucher <alexander.deucher@xxxxxxx>
> >>>>>>>>>>> Cc: Christian Koenig <christian.koenig@xxxxxxx>
> >>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
> >>>>>>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
> >>>>>>>>>>> ---
> >>>>>>>>>>>       include/uapi/drm/amdgpu_drm.h | 53
> >>>>>>>>>>> +++++++++++++++++++++++++++++++++++
> >>>>>>>>>>>       1 file changed, 53 insertions(+)
> >>>>>>>>>>>
> >>>>>>>>>>> diff --git a/include/uapi/drm/amdgpu_drm.h
> >>>>>>>>>>> b/include/uapi/drm/amdgpu_drm.h
> >>>>>>>>>>> index 4038abe8505a..6c5235d107b3 100644
> >>>>>>>>>>> --- a/include/uapi/drm/amdgpu_drm.h
> >>>>>>>>>>> +++ b/include/uapi/drm/amdgpu_drm.h
> >>>>>>>>>>> @@ -54,6 +54,7 @@ extern "C" {
> >>>>>>>>>>>       #define DRM_AMDGPU_VM                  0x13
> >>>>>>>>>>>       #define DRM_AMDGPU_FENCE_TO_HANDLE     0x14
> >>>>>>>>>>>       #define DRM_AMDGPU_SCHED               0x15
> >>>>>>>>>>> +#define DRM_AMDGPU_USERQ               0x16
> >>>>>>>>>>>
> >>>>>>>>>>>       #define DRM_IOCTL_AMDGPU_GEM_CREATE
> >>>>>>>>>>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union
> >>>>>>>>>>> drm_amdgpu_gem_create)
> >>>>>>>>>>>       #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE
> >>>>>>>>>>> + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
> >>>>>>>>>>> @@ -71,6 +72,7 @@ extern "C" {
> >>>>>>>>>>>       #define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE +
> >>>>>>>>>>> DRM_AMDGPU_VM, union drm_amdgpu_vm)
> >>>>>>>>>>>       #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE
> >>>>>>>>>>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union
> >>>>>>>>>>> drm_amdgpu_fence_to_handle)
> >>>>>>>>>>>       #define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE +
> >>>>>>>>>>> DRM_AMDGPU_SCHED, union drm_amdgpu_sched)
> >>>>>>>>>>> +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOW(DRM_COMMAND_BASE +
> >>>>>>>>>>> DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
> >>>>>>>>>>>
> >>>>>>>>>>>       /**
> >>>>>>>>>>>        * DOC: memory domains
> >>>>>>>>>>> @@ -302,6 +304,57 @@ union drm_amdgpu_ctx {
> >>>>>>>>>>>              union drm_amdgpu_ctx_out out;
> >>>>>>>>>>>       };
> >>>>>>>>>>>
> >>>>>>>>>>> +/* user queue IOCTL */
> >>>>>>>>>>> +#define AMDGPU_USERQ_OP_CREATE 1
> >>>>>>>>>>> +#define AMDGPU_USERQ_OP_FREE   2
> >>>>>>>>>>> +
> >>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_SECURE  (1 << 0)
> >>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_AQL     (1 << 1)
> >>>>>>>>>>> +
> >>>>>>>>>>> +struct drm_amdgpu_userq_mqd {
> >>>>>>>>>>> +       /** Flags: AMDGPU_USERQ_MQD_FLAGS_* */
> >>>>>>>>>>> +       __u32   flags;
> >>>>>>>>>>> +       /** IP type: AMDGPU_HW_IP_* */
> >>>>>>>>>>> +       __u32   ip_type;
> >>>>>>>>>>> +       /** GEM object handle */
> >>>>>>>>>>> +       __u32   doorbell_handle;
> >>>>>>>>>>> +       /** Doorbell offset in dwords */
> >>>>>>>>>>> +       __u32   doorbell_offset;
> >>>>>>>>>> Since doorbells are 64 bit, maybe this offset should be in qwords.
> >>>>>>>>> Can you please help to cross check this information ? All the
> >>>>>>>>> existing
> >>>>>>>>> kernel doorbell calculations are keeping doorbells size as
> >>>>>>>>> sizeof(u32)
> >>>>>>>> Doorbells on pre-vega hardware are 32 bits so that is where that comes
> >>>>>>>> from, but from vega onward most doorbells are 64 bit.  I think some
> >>>>>>>> versions of VCN may still use 32 bit doorbells.  Internally in the
> >>>>>>>> kernel driver we just use two slots for newer hardware, but for the
> >>>>>>>> UAPI, I think we can just stick with 64 bit slots to avoid confusion.
> >>>>>>>> Even if an engine only uses a 32 bit one, I don't know that there is
> >>>>>>>> much value to trying to support variable doorbell sizes.
> >>>>>>> I think we can stick with using __u32 because this is *not* the size of
> >>>>>>> the doorbell entries.
> >>>>>>>
> >>>>>>> Instead this is the offset into the BO where to find the doorbell for
> >>>>>>> this queue (which then in turn is 64bits wide).
> >>>>>>>
> >>>>>>> Since we will probably never have more than 4GiB doorbells we should be
> >>>>>>> pretty safe to use 32bits here.
> >>>>>> Yes, the offset would still be 32 bits, but the units would be
> >>>>>> qwords.  E.g.,
> >>>>>>
> >>>>>> +       /** Doorbell offset in qwords */
> >>>>>> +       __u32   doorbell_offset;
> >>>>>>
> >>>>>> That way you couldn't accidentally specify an overlapping doorbell.
> >>>>> Ah, so you only wanted to fix the comment. That was absolutely not
> >>>>> clear from the discussion.
> >>>> If I understand this correctly, the offset of the doorbell in the BO is
> >>>> still 32-bit, but its width is 64 bits (8 bytes). Am I getting
> >>>> that right ?
> >>> Right.  Each doorbell is 64 bits (8 bytes) so this value would
> >>> basically be an index into the doorbell bo.  Having it be a 64 bit
> >>> index rather than a 32 bit index would avoid the possibility of users
> >>> specifying overlapping doorbells.  E.g.,
> >>> offset in bytes
> >>> 0 - doorbell
> >>> 4 - doorbell
> >>> Would be incorrect, while
> >>> offset in bytes
> >>> 0 - doorbell
> >>> 8 - doorbell
> >>> Would be correct.
> >>>
> >>> I.e., u64 doorbell_page[512] vs u32 doorbell_page[1024]
> >> Well I usually prefer just straight byte offsets, but I think the main
> >> question is what does the underlying hw/fw use?
> >>
> >> If that's a dword index we should probably stick with that in the UAPI
> >> as well. If it's in qword then stick to that, if it's in bytes then use
> >> that.
> > The MQD takes a dword offset from the start of the BAR, but the
> > doorbell is 64 bits wide so we have to be careful that we check for
> > overlapping doorbells.
>
> Well then let's just add an "if (doorbell_idx & 0x1) return -EINVAL;" to
> the kernel instead.
>
> That's far less confusing than having dword in the MQD and qword in the
> UAPI.

Yes, agreed.

Alex

>
> Christian.
>
> >
> > Alex
> >
> >> Otherwise we will just confuse people when we convert between the
> >> different API levels.
> >>
> >> Christian.
> >>
> >>> Alex
> >>>
> >>>> - Shashank
> >>>>
> >>>>> Christian.
> >>>>>
> >>>>>> Alex
> >>>>>>
> >>>>>>> Christian.
> >>>>>>>
> >>>>>>>> Alex
> >>>>>>>>
> >>>>>>>>>>> +       /** GPU virtual address of the queue */
> >>>>>>>>>>> +       __u64   queue_va;
> >>>>>>>>>>> +       /** Size of the queue in bytes */
> >>>>>>>>>>> +       __u64   queue_size;
> >>>>>>>>>>> +       /** GPU virtual address of the rptr */
> >>>>>>>>>>> +       __u64   rptr_va;
> >>>>>>>>>>> +       /** GPU virtual address of the wptr */
> >>>>>>>>>>> +       __u64   wptr_va;
> >>>>>>>>>>> +};
> >>>>>>>>>>> +
> >>>>>>>>>>> +struct drm_amdgpu_userq_in {
> >>>>>>>>>>> +       /** AMDGPU_USERQ_OP_* */
> >>>>>>>>>>> +       __u32   op;
> >>>>>>>>>>> +       /** Flags */
> >>>>>>>>>>> +       __u32   flags;
> >>>>>>>>>>> +       /** Queue handle to associate the queue free call with,
> >>>>>>>>>>> +        * unused for queue create calls */
> >>>>>>>>>>> +       __u32   queue_id;
> >>>>>>>>>>> +       __u32   pad;
> >>>>>>>>>>> +       /** Queue descriptor */
> >>>>>>>>>>> +       struct drm_amdgpu_userq_mqd mqd;
> >>>>>>>>>>> +};
> >>>>>>>>>>> +
> >>>>>>>>>>> +struct drm_amdgpu_userq_out {
> >>>>>>>>>>> +       /** Queue handle */
> >>>>>>>>>>> +       __u32   q_id;
> >>>>>>>>>> Maybe this should be queue_id to match the input.
> >>>>>>>>> Agree.
> >>>>>>>>>
> >>>>>>>>> - Shashank
> >>>>>>>>>
> >>>>>>>>>> Alex
> >>>>>>>>>>
> >>>>>>>>>>> +       /** Flags */
> >>>>>>>>>>> +       __u32   flags;
> >>>>>>>>>>> +};
> >>>>>>>>>>> +
> >>>>>>>>>>> +union drm_amdgpu_userq {
> >>>>>>>>>>> +       struct drm_amdgpu_userq_in in;
> >>>>>>>>>>> +       struct drm_amdgpu_userq_out out;
> >>>>>>>>>>> +};
> >>>>>>>>>>> +
> >>>>>>>>>>>       /* vm ioctl */
> >>>>>>>>>>>       #define AMDGPU_VM_OP_RESERVE_VMID      1
> >>>>>>>>>>>       #define AMDGPU_VM_OP_UNRESERVE_VMID    2
> >>>>>>>>>>> --
> >>>>>>>>>>> 2.34.1
> >>>>>>>>>>>
>