If it applies cleanly, feel free to drop it in. I'll drop those patches from drm-next since they are already in drm-misc.
Alex
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> on behalf of Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
Sent: Thursday, February 24, 2022 11:24 AM
To: Chen, JingWen <JingWen.Chen2@xxxxxxx>; Christian König <ckoenig.leichtzumerken@xxxxxxxxx>; dri-devel@xxxxxxxxxxxxxxxxxxxxx <dri-devel@xxxxxxxxxxxxxxxxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Liu, Monk <Monk.Liu@xxxxxxx>; Chen, Horace <Horace.Chen@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; daniel@xxxxxxxx <daniel@xxxxxxxx>
Subject: Re: [RFC v4 02/11] drm/amdgpu: Move scheduler init to after XGMI is ready

No, because the whole patch set, including this patch, already landed in
drm-misc-next and will reach amd-staging-drm-next on the next upstream
rebase, I guess.

Andrey

On 2022-02-24 01:47, JingWen Chen wrote:
> Hi Andrey,
>
> Will you port this patch into amd-staging-drm-next?
>
> on 2/10/22 2:06 AM, Andrey Grodzovsky wrote:
>> All comments are fixed and the code is pushed. Thanks to everyone
>> who helped with the review.
>>
>> Andrey
>>
>> On 2022-02-09 02:53, Christian König wrote:
>>> Am 09.02.22 um 01:23 schrieb Andrey Grodzovsky:
>>>> Before we initialize schedulers we must know which reset
>>>> domain we are in - for a single device there is a single
>>>> domain per device and so a single wq per device. For XGMI
>>>> the reset domain spans the entire XGMI hive and so the
>>>> reset wq is per hive.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>>> One more comment below, with that fixed Reviewed-by: Christian König <christian.koenig@xxxxxxx>.
>>>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 45 ++++++++++++++++++++++
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 34 ++--------------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |  2 +
>>>>   3 files changed, 51 insertions(+), 30 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 9704b0e1fd82..00123b0013d3 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -2287,6 +2287,47 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
>>>>       return r;
>>>>   }
>>>>
>>>> +static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>>>> +{
>>>> +    long timeout;
>>>> +    int r, i;
>>>> +
>>>> +    for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>>> +        struct amdgpu_ring *ring = adev->rings[i];
>>>> +
>>>> +        /* No need to setup the GPU scheduler for rings that don't need it */
>>>> +        if (!ring || ring->no_scheduler)
>>>> +            continue;
>>>> +
>>>> +        switch (ring->funcs->type) {
>>>> +        case AMDGPU_RING_TYPE_GFX:
>>>> +            timeout = adev->gfx_timeout;
>>>> +            break;
>>>> +        case AMDGPU_RING_TYPE_COMPUTE:
>>>> +            timeout = adev->compute_timeout;
>>>> +            break;
>>>> +        case AMDGPU_RING_TYPE_SDMA:
>>>> +            timeout = adev->sdma_timeout;
>>>> +            break;
>>>> +        default:
>>>> +            timeout = adev->video_timeout;
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>>> +                           ring->num_hw_submission, amdgpu_job_hang_limit,
>>>> +                           timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
>>>> +        if (r) {
>>>> +            DRM_ERROR("Failed to create scheduler on ring %s.\n",
>>>> +                      ring->name);
>>>> +            return r;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +
>>>>   /**
>>>>    * amdgpu_device_ip_init - run init for hardware IPs
>>>>    *
>>>> @@ -2419,6 +2460,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>>>>           }
>>>>       }
>>>>
>>>> +    r = amdgpu_device_init_schedulers(adev);
>>>> +    if (r)
>>>> +        goto init_failed;
>>>> +
>>>>       /* Don't init kfd if whole hive need to be reset during init */
>>>>       if (!adev->gmc.xgmi.pending_reset)
>>>>           amdgpu_amdkfd_device_init(adev);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> index 45977a72b5dd..fa302540c69a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> @@ -457,8 +457,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>>>                                     atomic_t *sched_score)
>>>>   {
>>>>       struct amdgpu_device *adev = ring->adev;
>>>> -    long timeout;
>>>> -    int r;
>>>>
>>>>       if (!adev)
>>>>           return -EINVAL;
>>>>
>>>> @@ -478,36 +476,12 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
>>>>       spin_lock_init(&ring->fence_drv.lock);
>>>>       ring->fence_drv.fences = kcalloc(num_hw_submission * 2, sizeof(void *),
>>>>                                        GFP_KERNEL);
>>>> -    if (!ring->fence_drv.fences)
>>>> -        return -ENOMEM;
>>>>
>>>> -    /* No need to setup the GPU scheduler for rings that don't need it */
>>>> -    if (ring->no_scheduler)
>>>> -        return 0;
>>>> +    ring->num_hw_submission = num_hw_submission;
>>>> +    ring->sched_score = sched_score;
>>> Let's move this into the caller and then use ring->num_hw_submission in the fence code as well.
>>>
>>> The maximum number of jobs on the ring is not really fence specific.
>>>
>>> Regards,
>>> Christian.
>>>
>>>> -    switch (ring->funcs->type) {
>>>> -    case AMDGPU_RING_TYPE_GFX:
>>>> -        timeout = adev->gfx_timeout;
>>>> -        break;
>>>> -    case AMDGPU_RING_TYPE_COMPUTE:
>>>> -        timeout = adev->compute_timeout;
>>>> -        break;
>>>> -    case AMDGPU_RING_TYPE_SDMA:
>>>> -        timeout = adev->sdma_timeout;
>>>> -        break;
>>>> -    default:
>>>> -        timeout = adev->video_timeout;
>>>> -        break;
>>>> -    }
>>>> -
>>>> -    r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
>>>> -                       num_hw_submission, amdgpu_job_hang_limit,
>>>> -                       timeout, NULL, sched_score, ring->name);
>>>> -    if (r) {
>>>> -        DRM_ERROR("Failed to create scheduler on ring %s.\n",
>>>> -                  ring->name);
>>>> -        return r;
>>>> -    }
>>>> +    if (!ring->fence_drv.fences)
>>>> +        return -ENOMEM;
>>>>
>>>>       return 0;
>>>>   }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index fae7d185ad0d..7f20ce73a243 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -251,6 +251,8 @@ struct amdgpu_ring {
>>>>       bool                has_compute_vm_bug;
>>>>       bool                no_scheduler;
>>>>       int                 hw_prio;
>>>> +    unsigned            num_hw_submission;
>>>> +    atomic_t            *sched_score;
>>>>   };
>>>>
>>>>   #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
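
A note on the reset-domain design described in the commit message: the
workqueue handed to drm_sched_init() is where the scheduler queues its
timeout/reset work, so giving every scheduler in a hive the same
single-threaded queue is what serializes resets across the hive. A minimal
sketch of the idea - the struct layout and the alloc_ordered_workqueue()
call below are assumptions for illustration, not taken from this patch:

    /* One reset domain per device normally; one per hive for XGMI. */
    struct amdgpu_reset_domain {
            struct workqueue_struct *wq;
    };

    /* An ordered workqueue runs at most one work item at a time, so all
     * timeout/reset handlers queued on it - from every scheduler in the
     * domain - execute strictly one after another. */
    domain->wq = alloc_ordered_workqueue("amdgpu-reset-domain", 0);

Each ring's scheduler then passes this shared queue as the timeout
workqueue argument of drm_sched_init(), as the hunk in
amdgpu_device_init_schedulers() above does with adev->reset_domain.wq.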
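Concretely, Christian's comment asks for the two new ring fields to be
filled in by the caller of amdgpu_fence_driver_init_ring() rather than
inside it. A rough sketch of that follow-up, assuming the caller is
amdgpu_ring_init() - the function signature is abbreviated for
illustration and the real parameters may differ:

    int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
                         unsigned int num_hw_submission,
                         atomic_t *sched_score /* ... */)
    {
            int r;

            /* The maximum number of in-flight jobs is a property of the
             * ring, not of the fence driver, so record it here. */
            ring->num_hw_submission = num_hw_submission;
            ring->sched_score = sched_score;

            r = amdgpu_fence_driver_init_ring(ring);
            if (r)
                    return r;

            /* ... rest of ring setup ... */
            return 0;
    }

The fence code would then size its array from the ring, e.g.
kcalloc(ring->num_hw_submission * 2, sizeof(void *), GFP_KERNEL), and
amdgpu_fence_driver_init_ring() could drop the num_hw_submission and
sched_score parameters entirely.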