> -----Original Message----- > From: Chen, Guchun <Guchun.Chen@xxxxxxx> > Sent: 2019年9月2日 10:13 > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; > Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Li, Dennis > <Dennis.Li@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table > record structure > > > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao > Zhou > Sent: Friday, August 30, 2019 8:25 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Grodzovsky, Andrey > <Andrey.Grodzovsky@xxxxxxx>; Chen, Guchun <Guchun.Chen@xxxxxxx>; > Li, Dennis <Dennis.Li@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table > record structure > > change bps type from retired page to eeprom table record, prepare for > saving error records to eeprom > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++------- > -- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++-- > 2 files changed, 43 insertions(+), 27 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 2ca3997d4b3a..24663ec41248 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct > amdgpu_device *adev, > > for (; i < data->count; i++) { > (*bps)[i] = (struct ras_badpage){ > - .bp = data->bps[i].bp, > + .bp = data->bps[i].retired_page, > .size = AMDGPU_GPU_PAGE_SIZE, > .flags = 0, > }; > > if (data->last_reserved <= i) > (*bps)[i].flags = 1; > - else if (data->bps[i].bo == NULL) > + else if (data->bps_bo[i] == NULL) > (*bps)[i].flags = 2; > } > > @@ -1288,30 +1288,40 @@ static int > amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, { > unsigned int old_space = data->count + data->space_left; > unsigned int new_space = old_space + pages; > - unsigned int align_space = ALIGN(new_space, 1024); > - void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); > - > - if (!tmp) > + unsigned int align_space = ALIGN(new_space, 512); > [Guchun]Any special reason to change alignment from 512 to 1024? [Tao] The old "data->bps" is 16 byte and new " struct eeprom_table_record bps" is 31 bytes on 64bit system, I'd like to lower the pressure on memory system. The value can be adjusted according to feedback in the future. > > + void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); > + struct amdgpu_bo **bps_bo = > + kmalloc(align_space * sizeof(*data->bps_bo), > GFP_KERNEL); > + > + if (!bps || !bps_bo) { > + kfree(bps); > + kfree(bps_bo); > return -ENOMEM; > + } > > if (data->bps) { > - memcpy(tmp, data->bps, > + memcpy(bps, data->bps, > data->count * sizeof(*data->bps)); > kfree(data->bps); > } > + if (data->bps_bo) { > + memcpy(bps_bo, data->bps_bo, > + data->count * sizeof(*data->bps_bo)); > + kfree(data->bps_bo); > + } > > - data->bps = tmp; > + data->bps = bps; > + data->bps_bo = bps_bo; > data->space_left += align_space - old_space; > return 0; > } > > /* it deal with vram only. */ > int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > - unsigned long *bps, int pages) > + struct eeprom_table_record *bps, int pages) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data *data; > - int i = pages; > int ret = 0; > > if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10 > +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > goto out; > } > > - while (i--) > - data->bps[data->count++].bp = bps[i]; > - > + memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); > + data->count += pages; > data->space_left -= pages; > + > out: > mutex_unlock(&con->recovery_lock); > > @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct > amdgpu_device *adev) > goto out; > /* reserve vram at driver post stage. */ > for (i = data->last_reserved; i < data->count; i++) { > - bp = data->bps[i].bp; > + bp = data->bps[i].retired_page; > > if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, > PAGE_SIZE, &bo)) > DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", > bp); > > - data->bps[i].bo = bo; > + data->bps_bo[i] = bo; > data->last_reserved = i + 1; > } > out: > @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct > amdgpu_device *adev) > goto out; > > for (i = data->last_reserved - 1; i >= 0; i--) { > - bo = data->bps[i].bo; > + bo = data->bps_bo[i]; > > amdgpu_ras_release_vram(adev, &bo); > > - data->bps[i].bo = bo; > + data->bps_bo[i] = bo; > data->last_reserved = i; > } > out: > @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct > amdgpu_device *adev) > return 0; > } > > +/* > + * read error record array in eeprom and reserve enough space for > + * storing new bad pages > + */ > static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { > - /* TODO > - * read the array to eeprom when SMU disabled. > - */ > - return 0; > + struct eeprom_table_record *bps = NULL; > + int ret; > + > + ret = amdgpu_ras_add_bad_pages(adev, bps, > + adev->umc.max_ras_err_cnt_per_query); > + > + return ret; > } > > static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 66b71525446e..b6bac873c588 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -351,11 +351,10 @@ struct ras_err_data { }; > > struct ras_err_handler_data { > - /* point to bad pages array */ > - struct { > - unsigned long bp; > - struct amdgpu_bo *bo; > - } *bps; > + /* point to bad page records array */ > + struct eeprom_table_record *bps; > + /* point to reserved bo array */ > + struct amdgpu_bo **bps_bo; > /* the count of entries */ > int count; > /* the space can place new entries */ > @@ -492,7 +491,7 @@ unsigned long > amdgpu_ras_query_error_count(struct amdgpu_device *adev, > > /* error handling functions */ > int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, > - unsigned long *bps, int pages); > + struct eeprom_table_record *bps, int pages); > > int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); > > -- > 2.17.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx