On 6/22/20 7:21 AM, Greg KH wrote:
On Mon, Jun 22, 2020 at 11:51:24AM +0200, Daniel Vetter wrote:On Sun, Jun 21, 2020 at 02:03:05AM -0400, Andrey Grodzovsky wrote:Track sysfs files in a list so they all can be removed during pci remove since otherwise their removal after that causes crash because parent folder was already removed during pci remove.Huh? That should not happen, do you have a backtrace of that crash?
2 examples in the attached trace. Andrey
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>Uh I thought sysfs just gets yanked completely. Please check with Greg KH whether hand-rolling all this really is the right solution here ... Feels very wrong. I thought this was all supposed to work by adding attributes before publishing the sysfs node, and then letting sysfs clean up everything. Not by cleaning up manually yourself.Yes, that is supposed to be the correct thing to do.Adding Greg for an authoritative answer. -Daniel--- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 13 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c | 7 +++++- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 35 ++++++++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 12 ++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 8 ++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 17 ++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 13 ++++++++++- drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 10 +++++--- 8 files changed, 99 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 604a681..ba3775f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -726,6 +726,15 @@ struct amd_powerplay {#define AMDGPU_RESET_MAGIC_NUM 64#define AMDGPU_MAX_DF_PERFMONS 4 + +struct amdgpu_sysfs_list_node { + struct list_head head; + struct device_attribute *attr; +};You know we have lists of attributes already, called attribute groups, if you really wanted to do something like this. But, I don't think so. Either way, don't hand-roll your own stuff that the driver core has provided for you for a decade or more, that's just foolish :)+ +#define AMDGPU_DEVICE_ATTR_LIST_NODE(_attr) \ + struct amdgpu_sysfs_list_node dev_attr_handle_##_attr = {.attr = &dev_attr_##_attr} + struct amdgpu_device { struct device *dev; struct drm_device *ddev; @@ -992,6 +1001,10 @@ struct amdgpu_device { char product_number[16]; char product_name[32]; char serial[16]; + + struct list_head sysfs_files_list; + struct mutex sysfs_files_list_lock; + };static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c index fdd52d8..c1549ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c @@ -1950,8 +1950,10 @@ static ssize_t amdgpu_atombios_get_vbios_version(struct device *dev, return snprintf(buf, PAGE_SIZE, "%s\n", ctx->vbios_version); }+static DEVICE_ATTR(vbios_version, 0444, amdgpu_atombios_get_vbios_version, NULL); +static AMDGPU_DEVICE_ATTR_LIST_NODE(vbios_version);/*** amdgpu_atombios_fini - free the driver info and callbacks for atombios @@ -1972,7 +1974,6 @@ void amdgpu_atombios_fini(struct amdgpu_device *adev) adev->mode_info.atom_context = NULL; kfree(adev->mode_info.atom_card_info); adev->mode_info.atom_card_info = NULL; - device_remove_file(adev->dev, &dev_attr_vbios_version); }/**@@ -2038,6 +2039,10 @@ int amdgpu_atombios_init(struct amdgpu_device *adev) return ret; }+ mutex_lock(&adev->sysfs_files_list_lock);+ list_add_tail(&dev_attr_handle_vbios_version.head, &adev->sysfs_files_list); + mutex_unlock(&adev->sysfs_files_list_lock); + return 0; }diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.cindex e7b9065..3173046 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2928,6 +2928,12 @@ static const struct attribute *amdgpu_dev_attributes[] = { NULL };+static AMDGPU_DEVICE_ATTR_LIST_NODE(product_name);+static AMDGPU_DEVICE_ATTR_LIST_NODE(product_number); +static AMDGPU_DEVICE_ATTR_LIST_NODE(serial_number); +static AMDGPU_DEVICE_ATTR_LIST_NODE(pcie_replay_count); + + /** * amdgpu_device_init - initialize the driver * @@ -3029,6 +3035,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock);+ INIT_LIST_HEAD(&adev->sysfs_files_list);+ mutex_init(&adev->sysfs_files_list_lock); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -3281,6 +3290,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) { dev_err(adev->dev, "Could not create amdgpu device attr\n"); return r; + } else { + mutex_lock(&adev->sysfs_files_list_lock); + list_add_tail(&dev_attr_handle_product_name.head, &adev->sysfs_files_list); + list_add_tail(&dev_attr_handle_product_number.head, &adev->sysfs_files_list); + list_add_tail(&dev_attr_handle_serial_number.head, &adev->sysfs_files_list); + list_add_tail(&dev_attr_handle_pcie_replay_count.head, &adev->sysfs_files_list); + mutex_unlock(&adev->sysfs_files_list_lock); }if (IS_ENABLED(CONFIG_PERF_EVENTS))@@ -3298,6 +3314,16 @@ int amdgpu_device_init(struct amdgpu_device *adev, return r; }+static void amdgpu_sysfs_remove_files(struct amdgpu_device *adev)+{ + struct amdgpu_sysfs_list_node *node; + + mutex_lock(&adev->sysfs_files_list_lock); + list_for_each_entry(node, &adev->sysfs_files_list, head) + device_remove_file(adev->dev, node->attr); + mutex_unlock(&adev->sysfs_files_list_lock); +} + /** * amdgpu_device_fini - tear down the driver * @@ -3332,6 +3358,11 @@ void amdgpu_device_fini_early(struct amdgpu_device *adev) amdgpu_fbdev_fini(adev);amdgpu_irq_fini_early(adev);+ + amdgpu_sysfs_remove_files(adev); + + if (adev->ucode_sysfs_en) + amdgpu_ucode_sysfs_fini(adev); }void amdgpu_device_fini_late(struct amdgpu_device *adev)@@ -3366,10 +3397,6 @@ void amdgpu_device_fini_late(struct amdgpu_device *adev) adev->rmmio = NULL; amdgpu_device_doorbell_fini(adev);- if (adev->ucode_sysfs_en)- amdgpu_ucode_sysfs_fini(adev); - - sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c index 6271044..e7b6c4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c @@ -76,6 +76,9 @@ static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO, static DEVICE_ATTR(mem_info_gtt_used, S_IRUGO, amdgpu_mem_info_gtt_used_show, NULL);+static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_gtt_total);+static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_gtt_used); + /** * amdgpu_gtt_mgr_init - init GTT manager and DRM MM * @@ -114,6 +117,11 @@ static int amdgpu_gtt_mgr_init(struct ttm_mem_type_manager *man, return ret; }+ mutex_lock(&adev->sysfs_files_list_lock);+ list_add_tail(&dev_attr_handle_mem_info_gtt_total.head, &adev->sysfs_files_list); + list_add_tail(&dev_attr_handle_mem_info_gtt_used.head, &adev->sysfs_files_list); + mutex_unlock(&adev->sysfs_files_list_lock); + return 0; }@@ -127,7 +135,6 @@ static int amdgpu_gtt_mgr_init(struct ttm_mem_type_manager *man,*/ static int amdgpu_gtt_mgr_fini(struct ttm_mem_type_manager *man) { - struct amdgpu_device *adev = amdgpu_ttm_adev(man->bdev); struct amdgpu_gtt_mgr *mgr = man->priv; spin_lock(&mgr->lock); drm_mm_takedown(&mgr->mm); @@ -135,9 +142,6 @@ static int amdgpu_gtt_mgr_fini(struct ttm_mem_type_manager *man) kfree(mgr); man->priv = NULL;- device_remove_file(adev->dev, &dev_attr_mem_info_gtt_total);- device_remove_file(adev->dev, &dev_attr_mem_info_gtt_used); - return 0; }diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.cindex ddb4af0c..554fec0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2216,6 +2216,8 @@ static DEVICE_ATTR(usbc_pd_fw, S_IRUGO | S_IWUSR, psp_usbc_pd_fw_sysfs_read, psp_usbc_pd_fw_sysfs_write);+static AMDGPU_DEVICE_ATTR_LIST_NODE(usbc_pd_fw);+const struct amd_ip_funcs psp_ip_funcs = {@@ -2242,13 +2244,17 @@ static int psp_sysfs_init(struct amdgpu_device *adev)if (ret)DRM_ERROR("Failed to create USBC PD FW control file!"); + else { + mutex_lock(&adev->sysfs_files_list_lock); + list_add_tail(&dev_attr_handle_usbc_pd_fw.head, &adev->sysfs_files_list); + mutex_unlock(&adev->sysfs_files_list_lock); + }return ret;}static void psp_sysfs_fini(struct amdgpu_device *adev){ - device_remove_file(adev->dev, &dev_attr_usbc_pd_fw); }const struct amdgpu_ip_block_version psp_v3_1_ip_block =diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 7723937..39c400c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -148,6 +148,12 @@ static DEVICE_ATTR(mem_info_vis_vram_used, S_IRUGO, static DEVICE_ATTR(mem_info_vram_vendor, S_IRUGO, amdgpu_mem_info_vram_vendor, NULL);+static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_vram_total);+static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_vis_vram_total); +static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_vram_used); +static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_vis_vram_used); +static AMDGPU_DEVICE_ATTR_LIST_NODE(mem_info_vram_vendor);Converting all of these individual attributes to an attribute group would be a nice thing to do anyway. Makes your logic much simpler and less error-prone. But again, the driver core should do all of the device file removal stuff automatically for you when your PCI device is removed from the system _UNLESS_ you are doing crazy things like creating child devices or messing with raw kobjects or other horrible things that I haven't read the code to see if you are, but hopefully not :) thanks, greg k-h
[ 925.738225 < 0.188086>] BUG: kernel NULL pointer dereference, address: 0000000000000090 [ 925.738232 < 0.000007>] #PF: supervisor read access in kernel mode [ 925.738236 < 0.000004>] #PF: error_code(0x0000) - not-present page [ 925.738240 < 0.000004>] PGD 0 P4D 0 [ 925.738245 < 0.000005>] Oops: 0000 [#1] SMP PTI [ 925.738249 < 0.000004>] CPU: 7 PID: 2547 Comm: amdgpu_test Tainted: G W OE 5.5.0-rc7-dev-kfd+ #50 [ 925.738256 < 0.000007>] Hardware name: System manufacturer System Product Name/RAMPAGE IV FORMULA, BIOS 4804 12/30/2013 [ 925.738266 < 0.000010>] RIP: 0010:kernfs_find_ns+0x18/0x110 [ 925.738270 < 0.000004>] Code: a6 cf ff 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 41 57 41 56 49 89 f6 41 55 41 54 49 89 fd 55 53 49 89 d4 <0f> b7 af 90 00 00 00 8b 05 8f ee 6b 01 48 8b 5f 68 66 83 e5 20 41 [ 925.738282 < 0.000012>] RSP: 0018:ffffad6d0118fb00 EFLAGS: 00010246 [ 925.738287 < 0.000005>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 2098a12076864b7e [ 925.738292 < 0.000005>] RDX: 0000000000000000 RSI: ffffffffb6606b31 RDI: 0000000000000000 [ 925.738297 < 0.000005>] RBP: ffffffffb6606b31 R08: ffffffffb5379d10 R09: 0000000000000000 [ 925.738302 < 0.000005>] R10: ffffad6d0118fb38 R11: ffff9a75f64820a8 R12: 0000000000000000 [ 925.738307 < 0.000005>] R13: 0000000000000000 R14: ffffffffb6606b31 R15: ffff9a7612b06130 [ 925.738313 < 0.000006>] FS: 00007f3eca4e8700(0000) GS:ffff9a763dbc0000(0000) knlGS:0000000000000000 [ 925.738319 < 0.000006>] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 925.738323 < 0.000004>] CR2: 0000000000000090 CR3: 0000000035e5a005 CR4: 00000000000606e0 [ 925.738329 < 0.000006>] Call Trace: [ 925.738334 < 0.000005>] kernfs_find_and_get_ns+0x2e/0x50 [ 925.738339 < 0.000005>] sysfs_remove_group+0x25/0x80 [ 925.738344 < 0.000005>] sysfs_remove_groups+0x29/0x40 [ 925.738350 < 0.000006>] free_msi_irqs+0xf5/0x190 [ 925.738354 < 0.000004>] pci_disable_msi+0xe9/0x120 [ 925.738406 < 0.000052>] amdgpu_irq_fini+0xe3/0xf0 [amdgpu] [ 925.738453 < 0.000047>] tonga_ih_sw_fini+0xe/0x30 [amdgpu] [ 925.738490 < 0.000037>] amdgpu_device_fini_late+0x14b/0x440 [amdgpu] [ 925.738529 < 0.000039>] amdgpu_driver_release_kms+0x16/0x40 [amdgpu] [ 925.738548 < 0.000019>] drm_dev_put+0x5b/0x80 [drm] [ 925.738558 < 0.000010>] drm_release+0xc6/0xd0 [drm] [ 925.738563 < 0.000005>] __fput+0xc6/0x260 [ 925.738568 < 0.000005>] task_work_run+0x79/0xb0 [ 925.738573 < 0.000005>] do_exit+0x3d0/0xc60 [ 925.738578 < 0.000005>] do_group_exit+0x47/0xb0 [ 925.738583 < 0.000005>] get_signal+0x18b/0xc30 [ 925.738589 < 0.000006>] do_signal+0x36/0x6a0 [ 925.738593 < 0.000004>] ? force_sig_info_to_task+0xbc/0xd0 [ 925.738597 < 0.000004>] ? signal_wake_up_state+0x15/0x30 [ 925.738603 < 0.000006>] exit_to_usermode_loop+0x6f/0xc0 [ 925.738608 < 0.000005>] prepare_exit_to_usermode+0xc7/0x110 [ 925.738613 < 0.000005>] ret_from_intr+0x25/0x35 [ 925.738617 < 0.000004>] RIP: 0033:0x417369 [ 925.738621 < 0.000004>] Code: Bad RIP value. [ 925.738625 < 0.000004>] RSP: 002b:00007ffdd6bf0900 EFLAGS: 00010246 [ 925.738629 < 0.000004>] RAX: 00007f3eca509000 RBX: 000000000000001e RCX: 00007f3ec95ba260 [ 925.738634 < 0.000005>] RDX: 00007f3ec9889790 RSI: 000000000000000a RDI: 0000000000000000 [ 925.738639 < 0.000005>] RBP: 00007ffdd6bf0990 R08: 00007f3ec9889780 R09: 00007f3eca4e8700 [ 925.738645 < 0.000006>] R10: 000000000000035c R11: 0000000000000246 R12: 00000000021c6170 [ 925.738650 < 0.000005>] R13: 00007ffdd6bf0c00 R14: 0000000000000000 R15: 0000000000000000 [ 40.880899 < 0.000004>] BUG: kernel NULL pointer dereference, address: 0000000000000090 [ 40.880906 < 0.000007>] #PF: supervisor read access in kernel mode [ 40.880910 < 0.000004>] #PF: error_code(0x0000) - not-present page [ 40.880915 < 0.000005>] PGD 0 P4D 0 [ 40.880920 < 0.000005>] Oops: 0000 [#1] SMP PTI [ 40.880924 < 0.000004>] CPU: 1 PID: 2526 Comm: amdgpu_test Tainted: G W OE 5.5.0-rc7-dev-kfd+ #50 [ 40.880932 < 0.000008>] Hardware name: System manufacturer System Product Name/RAMPAGE IV FORMULA, BIOS 4804 12/30/2013 [ 40.880941 < 0.000009>] RIP: 0010:kernfs_find_ns+0x18/0x110 [ 40.880945 < 0.000004>] Code: a6 cf ff 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 41 57 41 56 49 89 f6 41 55 41 54 49 89 fd 55 53 49 89 d4 <0f> b7 af 90 00 00 00 8b 05 8f ee 6b 01 48 8b 5f 68 66 83 e5 20 41 [ 40.880957 < 0.000012>] RSP: 0018:ffffaf3380467ba8 EFLAGS: 00010246 [ 40.880963 < 0.000006>] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 2098a12076864b7e [ 40.880968 < 0.000005>] RDX: 0000000000000000 RSI: ffffffffc0678cfc RDI: 0000000000000000 [ 40.880973 < 0.000005>] RBP: ffffffffc0678cfc R08: ffffffffaa379d10 R09: 0000000000000000 [ 40.880979 < 0.000006>] R10: ffffaf3380467be0 R11: ffff93547615d128 R12: 0000000000000000 [ 40.880984 < 0.000005>] R13: 0000000000000000 R14: ffffffffc0678cfc R15: ffff93549be86130 [ 40.880990 < 0.000006>] FS: 00007fd9ecb10700(0000) GS:ffff9354bd840000(0000) knlGS:0000000000000000 [ 40.880996 < 0.000006>] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 40.881001 < 0.000005>] CR2: 0000000000000090 CR3: 0000000072866001 CR4: 00000000000606e0 [ 40.881006 < 0.000005>] Call Trace: [ 40.881011 < 0.000005>] kernfs_find_and_get_ns+0x2e/0x50 [ 40.881016 < 0.000005>] sysfs_remove_group+0x25/0x80 [ 40.881055 < 0.000039>] amdgpu_device_fini_late+0x3eb/0x440 [amdgpu] [ 40.881095 < 0.000040>] amdgpu_driver_release_kms+0x16/0x40 [amdgpu] [ 40.881109 < 0.000014>] drm_dev_put+0x5b/0x80 [drm] [ 40.881119 < 0.000010>] drm_release+0xc6/0xd0 [drm] [ 40.881124 < 0.000005>] __fput+0xc6/0x260 [ 40.881129 < 0.000005>] task_work_run+0x79/0xb0 [ 40.881134 < 0.000005>] do_exit+0x3d0/0xc60 [ 40.881138 < 0.000004>] do_group_exit+0x47/0xb0 [ 40.881143 < 0.000005>] get_signal+0x18b/0xc30 [ 40.881149 < 0.000006>] do_signal+0x36/0x6a0 [ 40.881153 < 0.000004>] ? force_sig_info_to_task+0xbc/0xd0 [ 40.881158 < 0.000005>] ? signal_wake_up_state+0x15/0x30 [ 40.881164 < 0.000006>] exit_to_usermode_loop+0x6f/0xc0 [ 40.881170 < 0.000006>] prepare_exit_to_usermode+0xc7/0x110 [ 40.881176 < 0.000006>] ret_from_intr+0x25/0x35 [ 40.881181 < 0.000005>] RIP: 0033:0x417369 [ 40.881185 < 0.000004>] Code: Bad RIP value. [ 40.881188 < 0.000003>] RSP: 002b:00007ffd6a742f90 EFLAGS: 00010246 [ 40.881193 < 0.000005>] RAX: 00007fd9ecb31000 RBX: 000000000000001e RCX: 00007fd9ebbe2260 [ 40.881199 < 0.000006>] RDX: 00007fd9ebeb1790 RSI: 000000000000000a RDI: 0000000000000000 [ 40.881204 < 0.000005>] RBP: 00007ffd6a743020 R08: 00007fd9ebeb1780 R09: 00007fd9ecb10700 [ 40.881210 < 0.000006>] R10: 000000000000035c R11: 0000000000000246 R12: 00000000023e0170 [ 40.881215 < 0.000005>] R13: 00007ffd6a743290 R14: 0000000000000000 R15: 0000000000000000
_______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx