[PATCH 7/7] drm/amdgpu: retry init if it fails due to exclusive mode timeout (v2)

alexdeucher@xxxxxxxxx (Alex Deucher) · Wed, 25 Oct 2017 22:13:04 -0400

On Wed, Oct 25, 2017 at 9:05 PM, Ding, Pixel <Pixel.Ding at amd.com> wrote:
> The host rejects frequent exclusive mode requests; otherwise an attacking VF could cause problems. Sleep for a while here so that the host will accept the next request.
>

Please add a comment about that.  With that added:
Acked-by: Alex Deucher <alexander.deucher at amd.com>

> —
> Sincerely Yours,
> Pixel
>
>
>
>
>
>
>
> On 26/10/2017, 6:44 AM, "Alex Deucher" <alexdeucher at gmail.com> wrote:
>
>>On Tue, Oct 24, 2017 at 10:19 PM, Pixel Ding <Pixel.Ding at amd.com> wrote:
>>> From: pding <Pixel.Ding at amd.com>
>>>
>>> In practice, exclusive mode has a real-time limit, e.g. it must be
>>> finished within 300ms. A timeout is easily observed when running many
>>> VFs/VMs on a single host under heavy CPU workload.
>>>
>>> If we find the init fails due to exclusive mode timeout, try it again.
>>>
>>> v2:
>>>  - rewrite the condition for readable value.
>>>
>>> Signed-off-by: pding <Pixel.Ding at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++++
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 14 ++++++++++++--
>>>  2 files changed, 22 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 2fdd73b..14fe2bc 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2301,6 +2301,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>
>>>         r = amdgpu_init(adev);
>>>         if (r) {
>>> +               /* failed in exclusive mode due to timeout */
>>> +               if (amdgpu_sriov_vf(adev) &&
>>> +                   !amdgpu_sriov_runtime(adev) &&
>>> +                   amdgpu_virt_mmio_blocked(adev) &&
>>> +                   !amdgpu_virt_wait_reset(adev)) {
>>> +                       dev_err(adev->dev, "VF exclusive mode timeout\n");
>>> +                       r = -EAGAIN;
>>> +                       goto failed;
>>> +               }
>>>                 dev_err(adev->dev, "amdgpu_init failed\n");
>>>                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
>>>                 amdgpu_fini(adev);
>>> @@ -2388,6 +2397,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>>>         amdgpu_vf_error_trans_all(adev);
>>>         if (runtime)
>>>                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
>>> +
>>>         return r;
>>>  }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 4a9f749..f4ee407 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
>>>  int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>  {
>>>         struct amdgpu_device *adev;
>>> -       int r, acpi_status;
>>> +       int r, acpi_status, retry = 0;
>>>
>>>  #ifdef CONFIG_DRM_AMDGPU_SI
>>>         if (!amdgpu_si_support) {
>>> @@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>                 }
>>>         }
>>>  #endif
>>> +retry_init:
>>>
>>>         adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
>>>         if (adev == NULL) {
>>> @@ -144,7 +145,16 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
>>>          * VRAM allocation
>>>          */
>>>         r = amdgpu_device_init(adev, dev, dev->pdev, flags);
>>> -       if (r) {
>>> +       if (++retry >= 3 && r == -EAGAIN) {
>>
>>Might be better to check for -EAGAIN first.
>>
>>> +               adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
>>> +               adev->virt.ops = NULL;
>>> +               amdgpu_device_fini(adev);
>>> +               kfree(adev);
>>> +               dev->dev_private = NULL;
>>> +               msleep(5000);
>>
>>
>>Why do we need the sleep here?
>>
>>> +               dev_err(&dev->pdev->dev, "retry init %d\n", retry);
>>> +               goto retry_init;
>>> +       } else if (r) {
>>>                 dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
>>>                 goto out;
>>>         }
>>> --
>>> 2.9.5
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx at lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx