Re: [PATCH 06/10] drm/amdgpu: Refactor XGMI reset on init handling

Alex Deucher <alexdeucher@xxxxxxxxx> · Mon, 9 Sep 2024 17:35:54 -0400

On Mon, Sep 2, 2024 at 3:34 AM Lijo Lazar <lijo.lazar@xxxxxxx> wrote:
>
> Use XGMI hive information to reset XGMI devices on initialization
> rather than relying on the mgpu structure. The mgpu structure may
> contain other devices as well.
>
> Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    |  6 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 72 ++++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  2 +
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 14 +++--
>  drivers/gpu/drm/amd/amdgpu/soc15.c         |  5 ++
>  6 files changed, 90 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 468c4f590183..9f33de7ab656 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -155,7 +155,8 @@ struct amdgpu_init_level amdgpu_init_minimal = {
>         .level = AMDGPU_INIT_LEVEL_MINIMAL,
>         .hwini_ip_block_mask =
>                 BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
> -               BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH)
> +               BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
> +               BIT(AMD_IP_BLOCK_TYPE_PSP)
>  };
>
>  static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
> @@ -2832,6 +2833,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
>   */
>  static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>  {
> +       bool init_badpage;
>         int i, r;
>
>         r = amdgpu_ras_init(adev);
> @@ -2945,7 +2947,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
>          * Note: theoretically, this should be called before all vram allocations
>          * to protect retired page from abusing
>          */
> -       r = amdgpu_ras_recovery_init(adev, true);
> +       init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL);
> +       r = amdgpu_ras_recovery_init(adev, init_badpage);
>         if (r)
>                 goto init_failed;
>
> @@ -4501,8 +4504,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
>
>         if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL)
> -               queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
> -                                  msecs_to_jiffies(AMDGPU_RESUME_MS));
> +               amdgpu_xgmi_reset_on_init(adev);
>
>         amdgpu_device_check_iommu_direct_map(adev);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 65c891b6b999..2c29f4c34e64 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3216,12 +3216,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
>         max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control);
>         amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
>
> -       /* Todo: During test the SMU might fail to read the eeprom through I2C
> -        * when the GPU is pending on XGMI reset during probe time
> -        * (Mostly after second bus reset), skip it now
> -        */
> -       if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL)
> -               return 0;
>         if (init_bp_info) {
>                 ret = amdgpu_ras_init_badpage_info(adev);
>                 if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index a7a892512cb9..6a473a4262f5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -860,8 +860,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>         if (!adev->gmc.xgmi.supported)
>                 return 0;
>
> -       if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
> -           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>                 ret = psp_xgmi_initialize(&adev->psp, false, true);
>                 if (ret) {
>                         dev_err(adev->dev,
> @@ -907,8 +906,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>
>         task_barrier_add_task(&hive->tb);
>
> -       if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
> -           amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
> +       if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
>                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
>                         /* update node list for other device in the hive */
>                         if (tmp_adev != adev) {
> @@ -985,7 +983,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>                 }
>         }
>
> -       if (!ret && (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL))
> +       if (!ret)
>                 ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
>
>  exit_unlock:
> @@ -1500,3 +1498,67 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev)
>
>         return 0;
>  }
> +
> +static void amdgpu_xgmi_roi_handler(struct work_struct *work)

Use consistent naming for "roi" (reset on init), depending on what you
decide in the earlier patches.  Other than that:
Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> +{
> +       struct amdgpu_hive_info *hive =
> +               container_of(work, struct amdgpu_hive_info, roi_work);
> +       struct amdgpu_reset_context reset_context;
> +       struct amdgpu_device *tmp_adev;
> +       struct list_head device_list;
> +       int r;
> +
> +       mutex_lock(&hive->hive_lock);
> +
> +       INIT_LIST_HEAD(&device_list);
> +       list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
> +               list_add_tail(&tmp_adev->reset_list, &device_list);
> +
> +       tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
> +                                   reset_list);
> +       amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
> +
> +       reset_context.method = AMD_RESET_METHOD_ON_INIT;
> +       reset_context.reset_req_dev = tmp_adev;
> +       reset_context.hive = hive;
> +       reset_context.reset_device_list = &device_list;
> +       set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +       set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
> +
> +       amdgpu_reset_xgmi_rst_on_init(&reset_context);
> +       mutex_unlock(&hive->hive_lock);
> +       amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
> +
> +       list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +               r = amdgpu_ras_init_badpage_info(tmp_adev);
> +               if (r && r != -EHWPOISON)
> +                       dev_err(tmp_adev->dev,
> +                               "error during bad page data initializtion");
> +       }
> +}
> +
> +static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive)
> +{
> +       INIT_WORK(&hive->roi_work, amdgpu_xgmi_roi_handler);
> +       amdgpu_reset_domain_schedule(hive->reset_domain, &hive->roi_work);
> +}
> +
> +int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev)
> +{
> +       struct amdgpu_hive_info *hive;
> +       int r, num_devs;
> +
> +       hive = amdgpu_get_xgmi_hive(adev);
> +       if (!hive)
> +               return -EINVAL;
> +
> +       mutex_lock(&hive->hive_lock);
> +       num_devs = atomic_read(&hive->number_devices);
> +       if (num_devs == adev->gmc.xgmi.num_physical_nodes)
> +               amdgpu_xgmi_schedule_reset_on_init(hive);
> +
> +       mutex_unlock(&hive->hive_lock);
> +       amdgpu_put_xgmi_hive(hive);
> +
> +       return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index a3bfc16de6d4..902c2f928653 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -45,6 +45,7 @@ struct amdgpu_hive_info {
>         struct amdgpu_reset_domain *reset_domain;
>         atomic_t ras_recovery;
>         struct ras_event_manager event_mgr;
> +       struct work_struct roi_work;
>  };
>
>  struct amdgpu_pcs_ras_field {
> @@ -75,5 +76,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
>                 adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
>  }
>  int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev);
> +int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev);
>
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ac0dfe572..bc30bc3b7851 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -2413,11 +2413,17 @@ static int gmc_v9_0_hw_fini(void *handle)
>         if (adev->mmhub.funcs->update_power_gating)
>                 adev->mmhub.funcs->update_power_gating(adev, false);
>
> -       amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
> +       /*
> +        * For minimal init, late_init is not called, hence VM fault/RAS irqs
> +        * are not enabled.
> +        */
> +       if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) {
> +               amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
>
> -       if (adev->gmc.ecc_irq.funcs &&
> -               amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> -               amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +               if (adev->gmc.ecc_irq.funcs &&
> +                   amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
> +                       amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
> +       }
>
>         return 0;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index 8d16dacdc172..7901b3fbc127 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -1295,7 +1295,12 @@ static int soc15_common_hw_fini(void *handle)
>         if (amdgpu_sriov_vf(adev))
>                 xgpu_ai_mailbox_put_irq(adev);
>
> +       /*
> +        * For minimal init, late_init is not called, hence RAS irqs are not
> +        * enabled.
> +        */
>         if ((!amdgpu_sriov_vf(adev)) &&
> +           (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) &&
>             adev->nbio.ras_if &&
>             amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
>                 if (adev->nbio.ras &&
> --
> 2.25.1
>