On Tue, Feb 7, 2023 at 9:19 AM Christian König <christian.koenig@xxxxxxx> wrote: > > Am 07.02.23 um 15:17 schrieb Alex Deucher: > > On Tue, Feb 7, 2023 at 9:11 AM Christian König > > <ckoenig.leichtzumerken@xxxxxxxxx> wrote: > >> Am 07.02.23 um 15:07 schrieb Alex Deucher: > >>> On Tue, Feb 7, 2023 at 2:38 AM Shashank Sharma <shashank.sharma@xxxxxxx> wrote: > >>>> On 07/02/2023 08:03, Christian König wrote: > >>>>> Am 06.02.23 um 22:03 schrieb Alex Deucher: > >>>>>> On Mon, Feb 6, 2023 at 12:01 PM Christian König > >>>>>> <christian.koenig@xxxxxxx> wrote: > >>>>>>> Am 06.02.23 um 17:56 schrieb Alex Deucher: > >>>>>>>> On Fri, Feb 3, 2023 at 5:26 PM Shashank Sharma > >>>>>>>> <shashank.sharma@xxxxxxx> wrote: > >>>>>>>>> Hey Alex, > >>>>>>>>> > >>>>>>>>> On 03/02/2023 23:07, Alex Deucher wrote: > >>>>>>>>>> On Fri, Feb 3, 2023 at 4:54 PM Shashank Sharma > >>>>>>>>>> <shashank.sharma@xxxxxxx> wrote: > >>>>>>>>>>> From: Alex Deucher <alexander.deucher@xxxxxxx> > >>>>>>>>>>> > >>>>>>>>>>> This patch introduces new UAPI/IOCTL for usermode graphics > >>>>>>>>>>> queue. The userspace app will fill this structure and request > >>>>>>>>>>> the graphics driver to add a graphics work queue for it. The > >>>>>>>>>>> output of this UAPI is a queue id. > >>>>>>>>>>> > >>>>>>>>>>> This UAPI maps the queue into GPU, so the graphics app can start > >>>>>>>>>>> submitting work to the queue as soon as the call returns. 
> >>>>>>>>>>> > >>>>>>>>>>> Cc: Alex Deucher <alexander.deucher@xxxxxxx> > >>>>>>>>>>> Cc: Christian Koenig <christian.koenig@xxxxxxx> > >>>>>>>>>>> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> > >>>>>>>>>>> Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx> > >>>>>>>>>>> --- > >>>>>>>>>>> include/uapi/drm/amdgpu_drm.h | 53 > >>>>>>>>>>> +++++++++++++++++++++++++++++++++++ > >>>>>>>>>>> 1 file changed, 53 insertions(+) > >>>>>>>>>>> > >>>>>>>>>>> diff --git a/include/uapi/drm/amdgpu_drm.h > >>>>>>>>>>> b/include/uapi/drm/amdgpu_drm.h > >>>>>>>>>>> index 4038abe8505a..6c5235d107b3 100644 > >>>>>>>>>>> --- a/include/uapi/drm/amdgpu_drm.h > >>>>>>>>>>> +++ b/include/uapi/drm/amdgpu_drm.h > >>>>>>>>>>> @@ -54,6 +54,7 @@ extern "C" { > >>>>>>>>>>> #define DRM_AMDGPU_VM 0x13 > >>>>>>>>>>> #define DRM_AMDGPU_FENCE_TO_HANDLE 0x14 > >>>>>>>>>>> #define DRM_AMDGPU_SCHED 0x15 > >>>>>>>>>>> +#define DRM_AMDGPU_USERQ 0x16 > >>>>>>>>>>> > >>>>>>>>>>> #define DRM_IOCTL_AMDGPU_GEM_CREATE > >>>>>>>>>>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_GEM_CREATE, union > >>>>>>>>>>> drm_amdgpu_gem_create) > >>>>>>>>>>> #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE > >>>>>>>>>>> + DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) > >>>>>>>>>>> @@ -71,6 +72,7 @@ extern "C" { > >>>>>>>>>>> #define DRM_IOCTL_AMDGPU_VM DRM_IOWR(DRM_COMMAND_BASE + > >>>>>>>>>>> DRM_AMDGPU_VM, union drm_amdgpu_vm) > >>>>>>>>>>> #define DRM_IOCTL_AMDGPU_FENCE_TO_HANDLE > >>>>>>>>>>> DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDGPU_FENCE_TO_HANDLE, union > >>>>>>>>>>> drm_amdgpu_fence_to_handle) > >>>>>>>>>>> #define DRM_IOCTL_AMDGPU_SCHED DRM_IOW(DRM_COMMAND_BASE + > >>>>>>>>>>> DRM_AMDGPU_SCHED, union drm_amdgpu_sched) > >>>>>>>>>>> +#define DRM_IOCTL_AMDGPU_USERQ DRM_IOW(DRM_COMMAND_BASE + > >>>>>>>>>>> DRM_AMDGPU_USERQ, union drm_amdgpu_userq) > >>>>>>>>>>> > >>>>>>>>>>> /** > >>>>>>>>>>> * DOC: memory domains > >>>>>>>>>>> @@ -302,6 +304,57 @@ union drm_amdgpu_ctx { > >>>>>>>>>>> union 
drm_amdgpu_ctx_out out; > >>>>>>>>>>> }; > >>>>>>>>>>> > >>>>>>>>>>> +/* user queue IOCTL */ > >>>>>>>>>>> +#define AMDGPU_USERQ_OP_CREATE 1 > >>>>>>>>>>> +#define AMDGPU_USERQ_OP_FREE 2 > >>>>>>>>>>> + > >>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_SECURE (1 << 0) > >>>>>>>>>>> +#define AMDGPU_USERQ_MQD_FLAGS_AQL (1 << 1) > >>>>>>>>>>> + > >>>>>>>>>>> +struct drm_amdgpu_userq_mqd { > >>>>>>>>>>> + /** Flags: AMDGPU_USERQ_MQD_FLAGS_* */ > >>>>>>>>>>> + __u32 flags; > >>>>>>>>>>> + /** IP type: AMDGPU_HW_IP_* */ > >>>>>>>>>>> + __u32 ip_type; > >>>>>>>>>>> + /** GEM object handle */ > >>>>>>>>>>> + __u32 doorbell_handle; > >>>>>>>>>>> + /** Doorbell offset in dwords */ > >>>>>>>>>>> + __u32 doorbell_offset; > >>>>>>>>>> Since doorbells are 64 bit, maybe this offset should be in qwords. > >>>>>>>>> Can you please help to cross check this information ? All the > >>>>>>>>> existing > >>>>>>>>> kernel doorbell calculations are keeping doorbells size as > >>>>>>>>> sizeof(u32) > >>>>>>>> Doorbells on pre-vega hardware are 32 bits so that is where that comes > >>>>>>>> from, but from vega onward most doorbells are 64 bit. I think some > >>>>>>>> versions of VCN may still use 32 bit doorbells. Internally in the > >>>>>>>> kernel driver we just use two slots for newer hardware, but for the > >>>>>>>> UAPI, I think we can just stick with 64 bit slots to avoid confusion. > >>>>>>>> Even if an engine only uses a 32 bit one, I don't know that there is > >>>>>>>> much value to trying to support variable doorbell sizes. > >>>>>>> I think we can stick with using __u32 because this is *not* the size of > >>>>>>> the doorbell entries. > >>>>>>> > >>>>>>> Instead this is the offset into the BO where to find the doorbell for > >>>>>>> this queue (which then in turn is 64bits wide). > >>>>>>> > >>>>>>> Since we will probably never have more than 4GiB doorbells we should be > >>>>>>> pretty safe to use 32bits here. 
> >>>>>> Yes, the offset would still be 32 bits, but the units would be > >>>>>> qwords. E.g., > >>>>>> > >>>>>> + /** Doorbell offset in qwords */ > >>>>>> + __u32 doorbell_offset; > >>>>>> > >>>>>> That way you couldn't accidentally specify an overlapping doorbell. > >>>>> Ah, so you only wanted to fix the comment. That was absolutely not > >>>>> clear from the discussion. > >>>> If I understand this correctly, the offset of the doorbell in the BO is > >>>> still 32-bit, but its width (size in bytes) is 64 bits. Am I getting > >>>> that right ? > >>> Right. Each doorbell is 64 bits (8 bytes) so this value would > >>> basically be an index into the doorbell bo. Having it be a 64 bit > >>> index rather than a 32 bit index would avoid the possibility of users > >>> specifying overlapping doorbells. E.g., > >>> offset in bytes > >>> 0 - doorbell > >>> 4 - doorbell > >>> Would be incorrect, while > >>> offset in bytes > >>> 0 - doorbell > >>> 8 - doorbell > >>> Would be correct. > >>> > >>> I.e., u64 doorbell_page[512] vs u32 doorbell_page[1024] > >> Well I usually prefer just straight byte offsets, but I think the main > >> question is what does the underlying hw/fw use? > >> > >> If that's a dword index we should probably stick with that in the UAPI > >> as well. If it's in qword then stick to that, if it's in bytes then use > >> that. > > The MQD takes a dword offset from the start of the BAR, but the > > doorbell is 64 bits wide so we have to be careful that we check for > > overlapping doorbells. > > Well then let's just add an "if (doorbell_idx & 0x1) return -EINVAL;" to > the kernel instead. > > That's far less confusing than having dword in the MQD and qword in the > UAPI. Yes, agreed. Alex > > Christian. > > > > > Alex > > > >> Otherwise we will just confuse people when we convert between the > >> different API levels. > >> > >> Christian. > >> > >>> Alex > >>> > >>>> - Shashank > >>>> > >>>>> Christian. > >>>>> > >>>>>> Alex > >>>>>> > >>>>>>> Christian. 
> >>>>>>> > >>>>>>>> Alex > >>>>>>>> > >>>>>>>>>>> + /** GPU virtual address of the queue */ > >>>>>>>>>>> + __u64 queue_va; > >>>>>>>>>>> + /** Size of the queue in bytes */ > >>>>>>>>>>> + __u64 queue_size; > >>>>>>>>>>> + /** GPU virtual address of the rptr */ > >>>>>>>>>>> + __u64 rptr_va; > >>>>>>>>>>> + /** GPU virtual address of the wptr */ > >>>>>>>>>>> + __u64 wptr_va; > >>>>>>>>>>> +}; > >>>>>>>>>>> + > >>>>>>>>>>> +struct drm_amdgpu_userq_in { > >>>>>>>>>>> + /** AMDGPU_USERQ_OP_* */ > >>>>>>>>>>> + __u32 op; > >>>>>>>>>>> + /** Flags */ > >>>>>>>>>>> + __u32 flags; > >>>>>>>>>>> + /** Queue handle to associate the queue free call with, > >>>>>>>>>>> + * unused for queue create calls */ > >>>>>>>>>>> + __u32 queue_id; > >>>>>>>>>>> + __u32 pad; > >>>>>>>>>>> + /** Queue descriptor */ > >>>>>>>>>>> + struct drm_amdgpu_userq_mqd mqd; > >>>>>>>>>>> +}; > >>>>>>>>>>> + > >>>>>>>>>>> +struct drm_amdgpu_userq_out { > >>>>>>>>>>> + /** Queue handle */ > >>>>>>>>>>> + __u32 q_id; > >>>>>>>>>> Maybe this should be queue_id to match the input. > >>>>>>>>> Agree. > >>>>>>>>> > >>>>>>>>> - Shashank > >>>>>>>>> > >>>>>>>>>> Alex > >>>>>>>>>> > >>>>>>>>>>> + /** Flags */ > >>>>>>>>>>> + __u32 flags; > >>>>>>>>>>> +}; > >>>>>>>>>>> + > >>>>>>>>>>> +union drm_amdgpu_userq { > >>>>>>>>>>> + struct drm_amdgpu_userq_in in; > >>>>>>>>>>> + struct drm_amdgpu_userq_out out; > >>>>>>>>>>> +}; > >>>>>>>>>>> + > >>>>>>>>>>> /* vm ioctl */ > >>>>>>>>>>> #define AMDGPU_VM_OP_RESERVE_VMID 1 > >>>>>>>>>>> #define AMDGPU_VM_OP_UNRESERVE_VMID 2 > >>>>>>>>>>> -- > >>>>>>>>>>> 2.34.1 > >>>>>>>>>>> >