Am 16.02.22 um 11:43 schrieb Lazar, Lijo:
On 2/16/2022 4:04 PM, Somalapuram, Amaranath wrote:On 2/16/2022 3:45 PM, Lazar, Lijo wrote:On 2/16/2022 3:19 PM, Somalapuram Amaranath wrote:List of register populated for dump collection during the GPU reset. Signed-off-by: Somalapuram Amaranath <Amaranath.Somalapuram@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 ++drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 95 +++++++++++++++++++++2 files changed, 100 insertions(+)diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.hindex b85b67a88a3d..57965316873b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1097,6 +1097,11 @@ struct amdgpu_device { struct amdgpu_reset_control *reset_cntl; uint32_t ip_versions[HW_ID_MAX][HWIP_MAX_INSTANCE]; + + /* reset dump register */ + uint32_t *reset_dump_reg_list; + int n_regs; + struct mutex reset_dump_mutex; };static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.cindex 164d6a9e9fbb..faf985c7cb93 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1609,6 +1609,98 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL, DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL, amdgpu_debugfs_sclk_set, "%llu\n"); +static ssize_t amdgpu_reset_dump_register_list_read(struct file *f, + char __user *buf, size_t size, loff_t *pos) +{+ struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;+ char reg_offset[11]; + int i, r, len = 0; + + if (*pos) + return 0; + + if (adev->n_regs == 0) + return 0; + + for (i = 0; i < adev->n_regs; i++) { + sprintf(reg_offset, "0x%x ", adev->reset_dump_reg_list[i]); + r = copy_to_user(buf + len, reg_offset, strlen(reg_offset)); + + if (r) + return -EFAULT; + + len += strlen(reg_offset); + } + + r = copy_to_user(buf + len, "\n", 1); + + if (r) + return -EFAULT; + + len++; + *pos += len; + + return len; +} + +static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, + const char __user *buf, size_t size, loff_t *pos) +{+ struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;+ char *reg_offset, *reg, reg_temp[11]; + static int alloc_count;This being static what happens when it is called on a second device?
Good point. I've totally missed that.
Thanks, LijoI tried to avoid adding to adev. It wont work for multiple devices.Hmm.This is not friendly for single device also. Some one could just parse a text file of reg offsets and dosudo echo offset > fileThis will overwrite whatever is there. Instead you may define a syntax likesudo echo 0x000 > file => Clears all sudo echo offset > file => Append to the existing set. Taking all offsets in one go may not be needed.
Yes, completely agree. Now that you mention it the code here is severely broken in quite a number of ways, but see below.
Thanks, Lijo+ int ret, i = 0, len = 0; + + do { + reg_offset = reg_temp; + memset(reg_offset, 0, 11);+ ret = copy_from_user(reg_offset, buf + len, min(11, ((int)size-len)));+ + if (ret) + goto failed; + + reg = strsep(®_offset, " "); + + if (alloc_count <= i) {
The alloc_count variable and this check is completely unnecessary.
+ adev->reset_dump_reg_list = krealloc_array( + adev->reset_dump_reg_list, 1,
The memory management knows internally how much memory is allocated and when the buffer needs to be re-allocated for grow.
So all you need to do is is "array = krealloc_array(array, i, sizeof(*array), GFP_KERNEL);"
Regards, Christian.
+ sizeof(uint32_t), GFP_KERNEL); + alloc_count++; + } + + ret = kstrtouint(reg, 16, &adev->reset_dump_reg_list[i]); + + if (ret) + goto failed; + + len += strlen(reg) + 1; + i++; + + } while (len < size); + + adev->n_regs = i; + + return size; + +failed: + mutex_lock(&adev->reset_dump_mutex); + kfree(adev->reset_dump_reg_list); + adev->reset_dump_reg_list = NULL; + alloc_count = 0; + adev->n_regs = 0; + mutex_unlock(&adev->reset_dump_mutex); + return -EFAULT; +} + + ++static const struct file_operations amdgpu_reset_dump_register_list = {+ .owner = THIS_MODULE, + .read = amdgpu_reset_dump_register_list_read, + .write = amdgpu_reset_dump_register_list_write, + .llseek = default_llseek +}; + int amdgpu_debugfs_init(struct amdgpu_device *adev) { struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;@@ -1618,6 +1710,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)if (!debugfs_initialized()) return 0; + mutex_init(&adev->reset_dump_mutex); ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev, &fops_ib_preempt); if (IS_ERR(ent)) {@@ -1672,6 +1765,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)&amdgpu_debugfs_test_ib_fops); debugfs_create_file("amdgpu_vm_info", 0444, root, adev, &amdgpu_debugfs_vm_info_fops);+ debugfs_create_file("amdgpu_reset_dump_register_list", 0644, root, adev,+ &amdgpu_reset_dump_register_list); adev->debugfs_vbios_blob.data = adev->bios; adev->debugfs_vbios_blob.size = adev->bios_size;