Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 1/20/2022 12:57 PM, Chai, Thomas wrote:

-----Original Message-----
From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
Sent: Thursday, January 20, 2022 1:49 PM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Clements, John <John.Clements@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init



On 1/20/2022 8:48 AM, yipechai wrote:
Move xgmi ras initialization from .late_init to .early_init, which let
xgmi ras can be initialized only once.

Signed-off-by: yipechai <YiPeng.Chai@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++++++++++-----
   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +++++
   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +++++
   4 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 3483a82f5734..788c0257832d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
   	} while (fault->timestamp < tmp);
   }
+int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		adev->gmc.xgmi.ras = &xgmi_ras;
+		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
+	}
+
+	return 0;
+}
+
   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
   {
   	int r;
@@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
   			return r;
   	}
- if (!adev->gmc.xgmi.connected_to_cpu) {
-		adev->gmc.xgmi.ras = &xgmi_ras;
-		amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras->ras_block);
-	}
-
   	if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
   		r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
   		if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 0001631cfedb..ac4c0e50b45c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
   			      uint16_t pasid, uint64_t timestamp);
   void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
   				     uint16_t pasid);
+int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); diff
--git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 4f8d356f8432..7a6ad5d467b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct
amdgpu_device *adev)
static int gmc_v10_0_early_init(void *handle)
   {
+	int r;
   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
gmc_v10_0_set_mmhub_funcs(adev);
@@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
   	adev->gmc.private_aperture_end =
   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
+ r = amdgpu_gmc_ras_early_init(adev);
+	if (r)
+		return r;
+

At this point it's unknown if RAS is applicable for the SKU. I think this failure check shouldn't be there (here and below one).

amdgpu_gmc_ras_early_init is return 0 always, that way also this check is not needed.

[Thomas]  Just like calling amdgpu_gmc_ras_late_init,  checking the return status may make the code extensible.
	   In amdgpu_gmc_ras_early_init,  the xgmi ras initialization may always return 0, but it may add functions that need to check the return status in future.


At this point, it's unknown

1) If the device is part of XGMI hive or not.
2) If the device supports RAS.

For such devices, it doesn't make any sense to fail here based on this function.

Thanks,
Lijo

   	return 0;
   }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index c76ffd1a70cd..3cdd3d459d51 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct
amdgpu_device *adev)
static int gmc_v9_0_early_init(void *handle)
   {
+	int r;
   	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
/* ARCT and VEGA20 don't have XGMI defined in their IP discovery
tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
   	adev->gmc.private_aperture_end =
   		adev->gmc.private_aperture_start + (4ULL << 30) - 1;
+ r = amdgpu_gmc_ras_early_init(adev);
+	if (r)
+		return r;
+
   	return 0;
   }



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux