> -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Monday, March 6, 2023 10:32 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; > Yang, Stanley <Stanley.Yang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; > Chai, Thomas <YiPeng.Chai@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH 08/11] drm/amdgpu: Rework mca ras sw_init > > To align with other IP blocks > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 21 ++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 72 > +++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h | 9 ++-- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 15 +++--- > drivers/gpu/drm/amd/amdgpu/mca_v3_0.c | 44 ++------------- > drivers/gpu/drm/amd/amdgpu/mca_v3_0.h | 4 +- > 6 files changed, 111 insertions(+), 54 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > index 087a75374610..524e2c9b3012 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c > @@ -477,6 +477,27 @@ int amdgpu_gmc_ras_sw_init(struct amdgpu_device > *adev) > } > } > > + /* mca.x ras block */ > + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA)) > { > + r = amdgpu_mca_mp0_ras_sw_init(adev); > + if (r) { > + dev_err(adev->dev, "Failed to initialize mca.mp0 ras > block!\n"); > + return r; > + } > + > + r = amdgpu_mca_mp1_ras_sw_init(adev); > + if (r) { > + dev_err(adev->dev, "Failed to initialize mca.mp1 ras > block!\n"); > + return r; > + } > + > + r = amdgpu_mca_mpio_ras_sw_init(adev); > + if (r) { > + dev_err(adev->dev, "Failed to initialize mca.mpio ras > block!\n"); > + return r; > + } > + } > + > /* xgmi ras block */ > if (amdgpu_ras_is_supported(adev, > AMDGPU_RAS_BLOCK__XGMI_WAFL)) { > adev->gmc.xgmi.ras = &xgmi_ras; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > 
index 51c2a82e2fa4..0b545bdcd636 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c > @@ -70,3 +70,75 @@ void amdgpu_mca_query_ras_error_count(struct > amdgpu_device *adev, > > amdgpu_mca_reset_error_count(adev, mc_status_addr); } > + > +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev) { > + int err; > + struct amdgpu_mca_ras_block *ras; > + > + if (!adev->mca.mp0.ras) > + return 0; > + > + ras = adev->mca.mp0.ras; > + > + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); > + if (err) { > + dev_err(adev->dev, "Failed to register mca.mp0 ras > block!\n"); > + return err; > + } > + > + strcpy(ras->ras_block.ras_comm.name, "mca.mp0"); > + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA; > + ras->ras_block.ras_comm.type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > + adev->mca.mp0.ras_if = &ras->ras_block.ras_comm; > + > + return 0; > +} > + > +int amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev) { > + int err; > + struct amdgpu_mca_ras_block *ras; > + > + if (!adev->mca.mp1.ras) > + return 0; > + > + ras = adev->mca.mp1.ras; > + > + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); > + if (err) { > + dev_err(adev->dev, "Failed to register mca.mp1 ras block!\n"); > + return err; > + } > + > + strcpy(ras->ras_block.ras_comm.name, "mca.mp1"); > + ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA; > + ras->ras_block.ras_comm.type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > + adev->mca.mp1.ras_if = &ras->ras_block.ras_comm; > + > + return 0; > +} > + > +int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) { > + int err; > + struct amdgpu_mca_ras_block *ras; > + > + if (!adev->mca.mpio.ras) > + return 0; > + > + ras = adev->mca.mpio.ras; > + > + err = amdgpu_ras_register_ras_block(adev, &ras->ras_block); > + if (err) { > + dev_err(adev->dev, "Failed to register mca.mpio ras block!\n"); > + return err; > + } > + > + strcpy(ras->ras_block.ras_comm.name, "mca.mpio"); > + 
ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__MCA; > + ras->ras_block.ras_comm.type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE; > + adev->mca.mpio.ras_if = &ras->ras_block.ras_comm; > + > + return 0; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > index 7ce16d16e34b..997a073e2409 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h > @@ -30,12 +30,7 @@ struct amdgpu_mca_ras { > struct amdgpu_mca_ras_block *ras; > }; > > -struct amdgpu_mca_funcs { > - void (*init)(struct amdgpu_device *adev); > -}; > - > struct amdgpu_mca { > - const struct amdgpu_mca_funcs *funcs; > struct amdgpu_mca_ras mp0; > struct amdgpu_mca_ras mp1; > struct amdgpu_mca_ras mpio; > @@ -55,5 +50,7 @@ void amdgpu_mca_reset_error_count(struct > amdgpu_device *adev, void amdgpu_mca_query_ras_error_count(struct > amdgpu_device *adev, > uint64_t mc_status_addr, > void *ras_error_status); > - > +int amdgpu_mca_mp0_ras_sw_init(struct amdgpu_device *adev); int > +amdgpu_mca_mp1_ras_sw_init(struct amdgpu_device *adev); int > +amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev); > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 9a333f9744bf..67c2a5186b8a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -1363,13 +1363,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct > amdgpu_device *adev) > adev->hdp.ras = &hdp_v4_0_ras; > } > > -static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev) > +static void gmc_v9_0_set_mca_ras_funcs(struct amdgpu_device *adev) > { > + struct amdgpu_mca *mca = &adev->mca; > + > /* is UMC the right IP to check for MCA? Maybe DF? 
*/ > switch (adev->ip_versions[UMC_HWIP][0]) { > case IP_VERSION(6, 7, 0): > - if (!adev->gmc.xgmi.connected_to_cpu) > - adev->mca.funcs = &mca_v3_0_funcs; > + if (!adev->gmc.xgmi.connected_to_cpu) { [Stanley]: Can we use if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MCA)) instead of if (!adev->gmc.xgmi.connected_to_cpu), to keep the IP RAS support check uniform across IP blocks? Regards, Stanley > + mca->mp0.ras = &mca_v3_0_mp0_ras; > + mca->mp1.ras = &mca_v3_0_mp1_ras; > + mca->mpio.ras = &mca_v3_0_mpio_ras; > + } > break; > default: > break; > @@ -1398,7 +1403,7 @@ static int gmc_v9_0_early_init(void *handle) > gmc_v9_0_set_mmhub_ras_funcs(adev); > gmc_v9_0_set_gfxhub_funcs(adev); > gmc_v9_0_set_hdp_ras_funcs(adev); > - gmc_v9_0_set_mca_funcs(adev); > + gmc_v9_0_set_mca_ras_funcs(adev); > > adev->gmc.shared_aperture_start = 0x2000000000000000ULL; > adev->gmc.shared_aperture_end = > @@ -1611,8 +1616,6 @@ static int gmc_v9_0_sw_init(void *handle) > adev->gfxhub.funcs->init(adev); > > adev->mmhub.funcs->init(adev); > - if (adev->mca.funcs) > - adev->mca.funcs->init(adev); > > spin_lock_init(&adev->gmc.invalidate_lock); > > diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c > b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c > index d4bd7d1d2649..6dae4a2e2767 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c > @@ -51,19 +51,13 @@ static int mca_v3_0_ras_block_match(struct > amdgpu_ras_block_object *block_obj, > return -EINVAL; > } > > -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = { > +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp0_hw_ops = { > .query_ras_error_count = mca_v3_0_mp0_query_ras_error_count, > .query_ras_error_address = NULL, > }; > > struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = { > .ras_block = { > - .ras_comm = { > - .block = AMDGPU_RAS_BLOCK__MCA, > - .sub_block_index = > AMDGPU_RAS_MCA_BLOCK__MP0, > - .type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, > - .name = "mp0", > - }, > .hw_ops = 
&mca_v3_0_mp0_hw_ops, > .ras_block_match = mca_v3_0_ras_block_match, > }, > @@ -77,19 +71,13 @@ static void > mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev, > ras_error_status); > } > > -const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = { > +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mp1_hw_ops = { > .query_ras_error_count = mca_v3_0_mp1_query_ras_error_count, > .query_ras_error_address = NULL, > }; > > struct amdgpu_mca_ras_block mca_v3_0_mp1_ras = { > .ras_block = { > - .ras_comm = { > - .block = AMDGPU_RAS_BLOCK__MCA, > - .sub_block_index = > AMDGPU_RAS_MCA_BLOCK__MP1, > - .type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, > - .name = "mp1", > - }, > .hw_ops = &mca_v3_0_mp1_hw_ops, > .ras_block_match = mca_v3_0_ras_block_match, > }, > @@ -103,40 +91,14 @@ static void > mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev, > ras_error_status); > } > > -const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = { > +static const struct amdgpu_ras_block_hw_ops mca_v3_0_mpio_hw_ops = { > .query_ras_error_count = mca_v3_0_mpio_query_ras_error_count, > .query_ras_error_address = NULL, > }; > > struct amdgpu_mca_ras_block mca_v3_0_mpio_ras = { > .ras_block = { > - .ras_comm = { > - .block = AMDGPU_RAS_BLOCK__MCA, > - .sub_block_index = > AMDGPU_RAS_MCA_BLOCK__MPIO, > - .type = > AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE, > - .name = "mpio", > - }, > .hw_ops = &mca_v3_0_mpio_hw_ops, > .ras_block_match = mca_v3_0_ras_block_match, > }, > }; > - > - > -static void mca_v3_0_init(struct amdgpu_device *adev) -{ > - struct amdgpu_mca *mca = &adev->mca; > - > - mca->mp0.ras = &mca_v3_0_mp0_ras; > - mca->mp1.ras = &mca_v3_0_mp1_ras; > - mca->mpio.ras = &mca_v3_0_mpio_ras; > - amdgpu_ras_register_ras_block(adev, &mca->mp0.ras->ras_block); > - amdgpu_ras_register_ras_block(adev, &mca->mp1.ras->ras_block); > - amdgpu_ras_register_ras_block(adev, &mca->mpio.ras->ras_block); > - mca->mp0.ras_if = &mca->mp0.ras->ras_block.ras_comm; > - 
mca->mp1.ras_if = &mca->mp1.ras->ras_block.ras_comm; > - mca->mpio.ras_if = &mca->mpio.ras->ras_block.ras_comm; > -} > - > -const struct amdgpu_mca_funcs mca_v3_0_funcs = { > - .init = mca_v3_0_init, > -}; > \ No newline at end of file > diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h > b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h > index b899b86194c2..d3eaef0d7f2d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h > +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.h > @@ -21,6 +21,8 @@ > #ifndef __MCA_V3_0_H__ > #define __MCA_V3_0_H__ > > -extern const struct amdgpu_mca_funcs mca_v3_0_funcs; > +extern struct amdgpu_mca_ras_block mca_v3_0_mp0_ras; extern struct > +amdgpu_mca_ras_block mca_v3_0_mp1_ras; extern struct > +amdgpu_mca_ras_block mca_v3_0_mpio_ras; > > #endif > -- > 2.17.1