Wait for the system to be ready in order to fix the discovery initialization failure. This failure usually occurs when the dGPU is removed and then rescanned via the command line. It is caused by the following two errors: [1] the VRAM size is read as 0 [2] the discovery binary has a wrong signature Signed-off-by: Ma Jun <Jun.Ma2@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index 07c5fca06178..ac6b2ae6414c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -276,7 +276,12 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev, msleep(1); } } - vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20; + for (i = 0; i < 100; i++) { + vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20; + if (vram_size) + break; + usleep_range(1000, 1100); + } if (vram_size) { uint64_t pos = vram_size - DISCOVERY_TMR_OFFSET; @@ -371,6 +376,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev) { struct table_info *info; struct binary_header *bhdr; + int error_count = 0; uint16_t offset; uint16_t size; uint16_t checksum; @@ -380,7 +386,7 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev) adev->mman.discovery_bin = kzalloc(adev->mman.discovery_tmr_size, GFP_KERNEL); if (!adev->mman.discovery_bin) return -ENOMEM; - +retry: /* Read from file if it is the preferred option */ if (amdgpu_discovery == 2) { dev_info(adev->dev, "use ip discovery information from file"); @@ -401,6 +407,10 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev) /* check the ip discovery binary signature */ if (!amdgpu_discovery_verify_binary_signature(adev->mman.discovery_bin)) { + if (error_count++ < 1) { + msleep(100); + goto retry; + } dev_err(adev->dev, "get invalid ip discovery binary signature\n"); r = -EINVAL; @@ -515,7 +525,6 @@ static int 
amdgpu_discovery_init(struct amdgpu_device *adev) if (0 && offset) { struct mall_info_header *mhdr = (struct mall_info_header *)(adev->mman.discovery_bin + offset); - if (le32_to_cpu(mhdr->table_id) != MALL_INFO_TABLE_ID) { dev_err(adev->dev, "invalid ip discovery mall table id\n"); r = -EINVAL; -- 2.34.1