[Public]

Thanks for the review, Tao. Updated the position for unlocking.

Fix race condition failure during UMC UE injection.

Signed-off-by: Candice Li <candice.li@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd63..53b957a5b9a65c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1935,9 +1935,11 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	if (!con || !con->eh_data)
 		return 0;
 
+	mutex_lock(&con->recovery_lock);
 	control = &con->eeprom_control;
 	data = con->eh_data;
 	save_count = data->count - control->ras_num_recs;
+	mutex_unlock(&con->recovery_lock);
 	/* only new entries are saved */
 	if (save_count > 0) {
 		if (amdgpu_ras_eeprom_append(control,
-- 
2.17.1

Thanks,
Candice

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Tuesday, November 16, 2021 4:27 PM
To: Li, Candice <Candice.Li@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Clements, John <John.Clements@xxxxxxx>
Subject: RE: [PATCH] drm/amdgpu: Add recovery_lock to save bad pages function

[AMD Official Use Only]

> -----Original Message-----
> From: Li, Candice <Candice.Li@xxxxxxx>
> Sent: Tuesday, November 16, 2021 4:02 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Clements, John <John.Clements@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>
> Subject: [PATCH] drm/amdgpu: Add recovery_lock to save bad pages function
>
> Fix race condition failure during UMC UE injection.
>
> Signed-off-by: Candice Li <candice.li@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 08133de21fdd63..711b5fb26d47d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1931,10 +1931,12 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
>  	struct ras_err_handler_data *data;
>  	struct amdgpu_ras_eeprom_control *control;
>  	int save_count;
> +	int ret = 0;
>
>  	if (!con || !con->eh_data)
>  		return 0;
>
> +	mutex_lock(&con->recovery_lock);
>  	control = &con->eeprom_control;
>  	data = con->eh_data;
>  	save_count = data->count - control->ras_num_recs;

[Tao] Since recovery_lock is dedicated to protecting eh_data, can we unlock it here?

> @@ -1944,13 +1946,16 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
>  					     &data->bps[control->ras_num_recs],
>  					     save_count)) {
>  			dev_err(adev->dev, "Failed to save EEPROM table data!");
> -			return -EIO;
> +			ret = -EIO;
> +			goto out;
>  		}
>
>  		dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count);
>  	}
>
> -	return 0;
> +out:
> +	mutex_unlock(&con->recovery_lock);
> +	return ret;
>  }
>
>  /*
> --
> 2.17.1