Vega20 arbitrates pstate at hive level and not device level. Last peer to remote buffer unmap could drop P-State while another process is still remote buffer mapped. With this fix, P-States still needs to be disabled for now as SMU bug was discovered on synchronous P2P transfers. This should be fixed in the next FW update. Signed-off-by: Jonathan Kim <Jonathan.Kim@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 - drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 16 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 -- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 66 ++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h | 6 +++ 5 files changed, 43 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 4e1d4cfe7a9f..94dff899248d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -982,8 +982,6 @@ struct amdgpu_device { uint64_t unique_id; uint64_t df_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS]; - /* device pstate */ - int pstate; /* enable runtime pm on the device */ bool runpm; bool in_runpm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index accbb34ea670..95560eea61c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2135,11 +2135,8 @@ struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev, if (bo && amdgpu_xgmi_same_hive(adev, amdgpu_ttm_adev(bo->tbo.bdev)) && (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM)) { bo_va->is_xgmi = true; - mutex_lock(&adev->vm_manager.lock_pstate); /* Power up XGMI if it can be potentially used */ - if (++adev->vm_manager.xgmi_map_counter == 1) - amdgpu_xgmi_set_pstate(adev, 1); - mutex_unlock(&adev->vm_manager.lock_pstate); + amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MAX_VEGA20); } return bo_va; @@ -2562,12 +2559,8 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev, dma_fence_put(bo_va->last_pt_update); - if (bo && bo_va->is_xgmi) { - mutex_lock(&adev->vm_manager.lock_pstate); - if (--adev->vm_manager.xgmi_map_counter == 0) - amdgpu_xgmi_set_pstate(adev, 0); - mutex_unlock(&adev->vm_manager.lock_pstate); - } + if (bo && bo_va->is_xgmi) + amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MIN); kfree(bo_va); } @@ -3177,9 +3170,6 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) idr_init(&adev->vm_manager.pasid_idr); spin_lock_init(&adev->vm_manager.pasid_lock); - - adev->vm_manager.xgmi_map_counter = 0; - mutex_init(&adev->vm_manager.lock_pstate); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ea771d84bf2b..c8e68d7890bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -357,10 +357,6 @@ struct amdgpu_vm_manager { */ struct idr pasid_idr; spinlock_t pasid_lock; - - /* counter of mapped memory through xgmi */ - uint32_t xgmi_map_counter; - struct mutex lock_pstate; }; #define amdgpu_vm_copy_pte(adev, ib, pe, src, count) ((adev)->vm_manager.vm_pte_funcs->copy_pte((ib), (pe), (src), (count))) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 8c3215505e78..52f45b9fe271 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -373,7 +373,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo if (lock) mutex_lock(&tmp->hive_lock); - tmp->pstate = -1; + tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; + tmp->high_gpu = NULL; + /* + * hive pstate on boot is high in vega20 so we have to go to low + * pstate on after boot. + */ + tmp->map_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE; mutex_unlock(&xgmi_mutex); return tmp; @@ -383,50 +389,49 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) { int ret = 0; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); - struct amdgpu_device *tmp_adev; - bool update_hive_pstate = true; - bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20; + struct amdgpu_device *tar_adev = hive->high_gpu ? + hive->high_gpu : adev; + bool map = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; + bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN; - if (!hive) + if (!hive || adev->asic_type == CHIP_VEGA20) return 0; mutex_lock(&hive->hive_lock); - if (hive->pstate == pstate) { - adev->pstate = is_high_pstate ? pstate : adev->pstate; + if (map) + hive->map_count++; + else + hive->map_count--; + + /* + * Vega20 only needs single peer to request pstate high for the hive to + * go high but all peers must request pstate low for the hive to go low + */ + if (hive->pstate == pstate || (!map && hive->map_count && !init_low)) goto out; - } - dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate); + dev_dbg(tar_adev->dev, "Set xgmi pstate %d.\n", pstate); - ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate); + ret = amdgpu_dpm_set_xgmi_pstate(tar_adev, pstate); if (ret) { - dev_err(adev->dev, + dev_err(tar_adev->dev, "XGMI: Set pstate failure on device %llx, hive %llx, ret %d", - adev->gmc.xgmi.node_id, - adev->gmc.xgmi.hive_id, ret); + tar_adev->gmc.xgmi.node_id, + tar_adev->gmc.xgmi.hive_id, ret); goto out; } - /* Update device pstate */ - adev->pstate = pstate; - - /* - * Update the hive pstate only all devices of the hive - * are in the same pstate - */ - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { - if (tmp_adev->pstate != adev->pstate) { - update_hive_pstate = false; - break; - } - } - if (update_hive_pstate || is_high_pstate) + if (init_low) + hive->pstate = hive->map_count ? + hive->pstate : AMDGPU_XGMI_PSTATE_MIN; + else { hive->pstate = pstate; - + hive->high_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ? + adev : NULL; + } out: mutex_unlock(&hive->hive_lock); - return ret; } @@ -507,9 +512,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) goto exit; } - /* Set default device pstate */ - adev->pstate = -1; - top_info = &adev->psp.xgmi_context.top_info; list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index d5a63904ec33..b5c4acf2316d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -25,6 +25,10 @@ #include <drm/task_barrier.h> #include "amdgpu_psp.h" +#define AMDGPU_XGMI_PSTATE_UNKNOWN -1 +#define AMDGPU_XGMI_PSTATE_MIN 0 +#define AMDGPU_XGMI_PSTATE_MAX_VEGA20 1 + struct amdgpu_hive_info { uint64_t hive_id; struct list_head device_list; @@ -34,6 +38,8 @@ struct amdgpu_hive_info { struct device_attribute dev_attr; struct amdgpu_device *adev; int pstate; /*0 -- low , 1 -- high , -1 unknown*/ + int map_count; + struct amdgpu_device *high_gpu; struct task_barrier tb; }; -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx