On Mon, Sep 2, 2024 at 3:34 AM Lijo Lazar <lijo.lazar@xxxxxxx> wrote: > > Use XGMI hive information to rely on resetting XGMI devices on > initialization rather than using mgpu structure. mgpu structure may have > other devices as well. > > Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 -- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 72 ++++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 2 + > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 14 +++-- > drivers/gpu/drm/amd/amdgpu/soc15.c | 5 ++ > 6 files changed, 90 insertions(+), 19 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 468c4f590183..9f33de7ab656 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -155,7 +155,8 @@ struct amdgpu_init_level amdgpu_init_minimal = { > .level = AMDGPU_INIT_LEVEL_MINIMAL, > .hwini_ip_block_mask = > BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) | > - BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) > + BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) | > + BIT(AMD_IP_BLOCK_TYPE_PSP) > }; > > static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev, > @@ -2832,6 +2833,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) > */ > static int amdgpu_device_ip_init(struct amdgpu_device *adev) > { > + bool init_badpage; > int i, r; > > r = amdgpu_ras_init(adev); > @@ -2945,7 +2947,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) > * Note: theoretically, this should be called before all vram allocations > * to protect retired page from abusing > */ > - r = amdgpu_ras_recovery_init(adev, true); > + init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL); > + r = amdgpu_ras_recovery_init(adev, init_badpage); > if (r) > goto init_failed; > > @@ 
-4501,8 +4504,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, > vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); > > if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) > - queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, > - msecs_to_jiffies(AMDGPU_RESUME_MS)); > + amdgpu_xgmi_reset_on_init(adev); > > amdgpu_device_check_iommu_direct_map(adev); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 65c891b6b999..2c29f4c34e64 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3216,12 +3216,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) > max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); > amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); > > - /* Todo: During test the SMU might fail to read the eeprom through I2C > - * when the GPU is pending on XGMI reset during probe time > - * (Mostly after second bus reset), skip it now > - */ > - if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL) > - return 0; > if (init_bp_info) { > ret = amdgpu_ras_init_badpage_info(adev); > if (ret) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > index a7a892512cb9..6a473a4262f5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c > @@ -860,8 +860,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > if (!adev->gmc.xgmi.supported) > return 0; > > - if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) && > - amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { > + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { > ret = psp_xgmi_initialize(&adev->psp, false, true); > if (ret) { > dev_err(adev->dev, > @@ -907,8 +906,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > > task_barrier_add_task(&hive->tb); > > 
- if ((adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) && > - amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { > + if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { > list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { > /* update node list for other device in the hive */ > if (tmp_adev != adev) { > @@ -985,7 +983,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) > } > } > > - if (!ret && (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL)) > + if (!ret) > ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive); > > exit_unlock: > @@ -1500,3 +1498,67 @@ int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev) > > return 0; > } > + > +static void amdgpu_xgmi_roi_handler(struct work_struct *work)

Use consistent naming for roi, depending on what you decide in the earlier patches.

Other than that:
Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> +{ > + struct amdgpu_hive_info *hive = > + container_of(work, struct amdgpu_hive_info, roi_work); > + struct amdgpu_reset_context reset_context; > + struct amdgpu_device *tmp_adev; > + struct list_head device_list; > + int r; > + > + mutex_lock(&hive->hive_lock); > + > + INIT_LIST_HEAD(&device_list); > + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) > + list_add_tail(&tmp_adev->reset_list, &device_list); > + > + tmp_adev = list_first_entry(&device_list, struct amdgpu_device, > + reset_list); > + amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); > + > + reset_context.method = AMD_RESET_METHOD_ON_INIT; > + reset_context.reset_req_dev = tmp_adev; > + reset_context.hive = hive; > + reset_context.reset_device_list = &device_list; > + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); > + set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); > + > + amdgpu_reset_xgmi_rst_on_init(&reset_context); > + mutex_unlock(&hive->hive_lock); > + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); > + > + list_for_each_entry(tmp_adev, &hive->device_list, 
gmc.xgmi.head) { > + r = amdgpu_ras_init_badpage_info(tmp_adev); > + if (r && r != -EHWPOISON) > + dev_err(tmp_adev->dev, > + "error during bad page data initializtion"); > + } > +} > + > +static void amdgpu_xgmi_schedule_reset_on_init(struct amdgpu_hive_info *hive) > +{ > + INIT_WORK(&hive->roi_work, amdgpu_xgmi_roi_handler); > + amdgpu_reset_domain_schedule(hive->reset_domain, &hive->roi_work); > +} > + > +int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev) > +{ > + struct amdgpu_hive_info *hive; > + int r, num_devs; > + > + hive = amdgpu_get_xgmi_hive(adev); > + if (!hive) > + return -EINVAL; > + > + mutex_lock(&hive->hive_lock); > + num_devs = atomic_read(&hive->number_devices); > + if (num_devs == adev->gmc.xgmi.num_physical_nodes) > + amdgpu_xgmi_schedule_reset_on_init(hive); > + > + mutex_unlock(&hive->hive_lock); > + amdgpu_put_xgmi_hive(hive); > + > + return r; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > index a3bfc16de6d4..902c2f928653 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h > @@ -45,6 +45,7 @@ struct amdgpu_hive_info { > struct amdgpu_reset_domain *reset_domain; > atomic_t ras_recovery; > struct ras_event_manager event_mgr; > + struct work_struct roi_work; > }; > > struct amdgpu_pcs_ras_field { > @@ -75,5 +76,6 @@ static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev, > adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id); > } > int amdgpu_xgmi_ras_sw_init(struct amdgpu_device *adev); > +int amdgpu_xgmi_reset_on_init(struct amdgpu_device *adev); > > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index c76ac0dfe572..bc30bc3b7851 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -2413,11 +2413,17 @@ static int gmc_v9_0_hw_fini(void *handle) > if (adev->mmhub.funcs->update_power_gating) > 
adev->mmhub.funcs->update_power_gating(adev, false); > > - amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); > + /* > + * For minimal init, late_init is not called, hence VM fault/RAS irqs > + * are not enabled. > + */ > + if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) { > + amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0); > > - if (adev->gmc.ecc_irq.funcs && > - amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) > - amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + if (adev->gmc.ecc_irq.funcs && > + amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) > + amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0); > + } > > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c > index 8d16dacdc172..7901b3fbc127 100644 > --- a/drivers/gpu/drm/amd/amdgpu/soc15.c > +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c > @@ -1295,7 +1295,12 @@ static int soc15_common_hw_fini(void *handle) > if (amdgpu_sriov_vf(adev)) > xgpu_ai_mailbox_put_irq(adev); > > + /* > + * For minimal init, late_init is not called, hence RAS irqs are not > + * enabled. > + */ > if ((!amdgpu_sriov_vf(adev)) && > + (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL) && > adev->nbio.ras_if && > amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) { > if (adev->nbio.ras && > -- > 2.25.1 >