Hi Thomas, > Can you please apply the patch below on top of Linus tree and retest? > > Please send me the outputs I asked you to provide last time in any case > (success or fail). The issue still occurs even if I applied your patch to linux 4.14.0-rc4. --- [ ...] INFO: task setroubleshootd:4972 blocked for more than 120 seconds. [ ...] Not tainted 4.14.0-rc4.thomas.with.irqdebug+ #6 [ ...] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ ...] setroubleshootd D 0 4972 1 0x00000080 [ ...] Call Trace: [ ...] __schedule+0x28d/0x890 [ ...] ? release_pages+0x16f/0x3f0 [ ...] schedule+0x36/0x80 [ ...] io_schedule+0x16/0x40 [ ...] wait_on_page_bit+0x107/0x150 [ ...] ? page_cache_tree_insert+0xb0/0xb0 [ ...] truncate_inode_pages_range+0x3dd/0x7d0 [ ...] ? schedule_hrtimeout_range_clock+0xad/0x140 [ ...] ? remove_wait_queue+0x59/0x60 [ ...] ? down_write+0x12/0x40 [ ...] ? unmap_mapping_range+0x75/0x130 [ ...] truncate_pagecache+0x47/0x60 [ ...] truncate_setsize+0x32/0x40 [ ...] xfs_setattr_size+0x100/0x300 [xfs] [ ...] xfs_vn_setattr_size+0x40/0x90 [xfs] [ ...] xfs_vn_setattr+0x87/0xa0 [xfs] [ ...] notify_change+0x266/0x440 [ ...] do_truncate+0x75/0xc0 [ ...] path_openat+0xaba/0x13b0 [ ...] ? mem_cgroup_commit_charge+0x31/0x130 [ ...] do_filp_open+0x91/0x100 [ ...] ? __alloc_fd+0x46/0x170 [ ...] do_sys_open+0x124/0x210 [ ...] SyS_open+0x1e/0x20 [ ...] do_syscall_64+0x67/0x1b0 [ ...] entry_SYSCALL64_slow_path+0x25/0x25 [ ...] RIP: 0033:0x7f275e2365bd [ ...] RSP: 002b:00007ffe29337da0 EFLAGS: 00000293 ORIG_RAX: 0000000000000002 [ ...] RAX: ffffffffffffffda RBX: 00000000040aea00 RCX: 00007f275e2365bd [ ...] RDX: 00000000000001b6 RSI: 0000000000000241 RDI: 00000000040ae840 [ ...] RBP: 00007ffe29337e00 R08: 00000000040aea06 R09: 0000000000000240 [ ...] R10: 0000000000000024 R11: 0000000000000293 R12: 00000000040eb660 [ ...] R13: 0000000000000004 R14: 00000000040ae840 R15: 000000000186a0a0 [ ...] sd 0:2:0:0: [sda] tag#0 task abort called for scmd(ffff9b4bf2306160) [ ...] sd 0:2:0:0: [sda] tag#0 CDB: Write(10) 2a 00 0b 3a 82 a0 00 00 20 00 [ ...] sd 0:2:0:0: task abort: FAILED scmd(ffff9b4bf2306160) [ ...] sd 0:2:0:0: target reset called for scmd(ffff9b4bf2306160) [ ...] sd 0:2:0:0: [sda] tag#0 megasas: target reset FAILED!! [ ...] sd 0:2:0:0: [sda] tag#0 Controller reset is requested due to IO timeout [ ...] SCSI command pointer: (ffff9b4bf2306160) SCSI host state: 5 SCSI --- I could not prepare the same environment I reported. So I reproduced the issue on the following megasas environment. --- IRQ affinity_list IRQ_TYPE 34 0-1 IR-PCI-MSI 1048576-edge megasas 35 2-3 IR-PCI-MSI 1048577-edge megasas 36 4-5 IR-PCI-MSI 1048578-edge megasas 37 6-7 IR-PCI-MSI 1048579-edge megasas 38 8-9 IR-PCI-MSI 1048580-edge megasas 39 10-11 IR-PCI-MSI 1048581-edge megasas 40 12-13 IR-PCI-MSI 1048582-edge megasas 41 14-15 IR-PCI-MSI 1048583-edge megasas 42 16-17 IR-PCI-MSI 1048584-edge megasas 43 18-19 IR-PCI-MSI 1048585-edge megasas 44 20-21 IR-PCI-MSI 1048586-edge megasas 45 22-23 IR-PCI-MSI 1048587-edge megasas 46 24-25 IR-PCI-MSI 1048588-edge megasas 47 26-27 IR-PCI-MSI 1048589-edge megasas 48 28-29 IR-PCI-MSI 1048590-edge megasas 49 30-31 IR-PCI-MSI 1048591-edge megasas 50 32-33 IR-PCI-MSI 1048592-edge megasas 51 34-35 IR-PCI-MSI 1048593-edge megasas 52 36-37 IR-PCI-MSI 1048594-edge megasas 53 38-39 IR-PCI-MSI 1048595-edge megasas 54 40-41 IR-PCI-MSI 1048596-edge megasas 55 42-43 IR-PCI-MSI 1048597-edge megasas 56 44-45 IR-PCI-MSI 1048598-edge megasas 57 46-47 IR-PCI-MSI 1048599-edge megasas 58 48-49 IR-PCI-MSI 1048600-edge megasas 59 50-51 IR-PCI-MSI 1048601-edge megasas 60 52-53 IR-PCI-MSI 1048602-edge megasas 61 54-55 IR-PCI-MSI 1048603-edge megasas 62 56-57 IR-PCI-MSI 1048604-edge megasas 63 58-59 IR-PCI-MSI 1048605-edge megasas 64 60-61 IR-PCI-MSI 1048606-edge megasas 65 62-63 IR-PCI-MSI 1048607-edge megasas 66 64-65 IR-PCI-MSI 1048608-edge megasas 67 66-67 IR-PCI-MSI 1048609-edge megasas 68 68-69 IR-PCI-MSI 1048610-edge megasas 69 70-71 IR-PCI-MSI 1048611-edge megasas 70 72-73 IR-PCI-MSI 1048612-edge megasas 71 74-75 IR-PCI-MSI 1048613-edge megasas 72 76-77 IR-PCI-MSI 1048614-edge megasas 73 78-79 IR-PCI-MSI 1048615-edge megasas 74 80-81 IR-PCI-MSI 1048616-edge megasas 75 82-83 IR-PCI-MSI 1048617-edge megasas 76 84-85 IR-PCI-MSI 1048618-edge megasas 77 86-87 IR-PCI-MSI 1048619-edge megasas 78 88-89 IR-PCI-MSI 1048620-edge megasas 79 90-91 IR-PCI-MSI 1048621-edge megasas 80 92-93 IR-PCI-MSI 1048622-edge megasas 81 94-95 IR-PCI-MSI 1048623-edge megasas 82 96-97 IR-PCI-MSI 1048624-edge megasas 83 98-99 IR-PCI-MSI 1048625-edge megasas 84 100-101 IR-PCI-MSI 1048626-edge megasas 85 102-103 IR-PCI-MSI 1048627-edge megasas 86 104-105 IR-PCI-MSI 1048628-edge megasas 87 106-107 IR-PCI-MSI 1048629-edge megasas 88 108-109 IR-PCI-MSI 1048630-edge megasas 89 110-111 IR-PCI-MSI 1048631-edge megasas 90 112-113 IR-PCI-MSI 1048632-edge megasas 91 114-115 IR-PCI-MSI 1048633-edge megasas 92 116-117 IR-PCI-MSI 1048634-edge megasas 93 118-119 IR-PCI-MSI 1048635-edge megasas 94 120-121 IR-PCI-MSI 1048636-edge megasas 95 122-123 IR-PCI-MSI 1048637-edge megasas 96 124-125 IR-PCI-MSI 1048638-edge megasas 97 126-127 IR-PCI-MSI 1048639-edge megasas 98 128-129 IR-PCI-MSI 1048640-edge megasas 99 130-131 IR-PCI-MSI 1048641-edge megasas 100 132-133 IR-PCI-MSI 1048642-edge megasas 101 134-135 IR-PCI-MSI 1048643-edge megasas 102 136-137 IR-PCI-MSI 1048644-edge megasas 103 138-139 IR-PCI-MSI 1048645-edge megasas 104 140-141 IR-PCI-MSI 1048646-edge megasas 105 142-143 IR-PCI-MSI 1048647-edge megasas 106 144-145 IR-PCI-MSI 1048648-edge megasas 107 146-147 IR-PCI-MSI 1048649-edge megasas 108 148-149 IR-PCI-MSI 1048650-edge megasas 109 150-151 IR-PCI-MSI 1048651-edge megasas 110 152-153 IR-PCI-MSI 1048652-edge megasas 111 154-155 IR-PCI-MSI 1048653-edge megasas 112 156-157 IR-PCI-MSI 1048654-edge megasas 113 158-159 IR-PCI-MSI 1048655-edge megasas 114 160-161 IR-PCI-MSI 1048656-edge megasas 115 162-163 IR-PCI-MSI 1048657-edge megasas 116 164-165 IR-PCI-MSI 1048658-edge megasas 117 166-167 IR-PCI-MSI 1048659-edge megasas 118 168-169 IR-PCI-MSI 1048660-edge megasas 119 170-171 IR-PCI-MSI 1048661-edge megasas 120 172-173 IR-PCI-MSI 1048662-edge megasas 121 174-175 IR-PCI-MSI 1048663-edge megasas 122 176-177 IR-PCI-MSI 1048664-edge megasas 123 178-179 IR-PCI-MSI 1048665-edge megasas 124 180-181 IR-PCI-MSI 1048666-edge megasas 125 182-183 IR-PCI-MSI 1048667-edge megasas 126 184-185 IR-PCI-MSI 1048668-edge megasas 127 186-187 IR-PCI-MSI 1048669-edge megasas 128 188-189 IR-PCI-MSI 1048670-edge megasas 129 190-191 IR-PCI-MSI 1048671-edge megasas --- Here are trace log that I offlined CPU 186-191 in descending order. When I offlined CPU 186, the issue occurred. --- # tracer: nop # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | systemd-1 [000] d... 0.427765: irq_do_set_affinity: irq: 24 ret 0 mask: 0-23 eff: 0 systemd-1 [029] d... 16.745803: irq_do_set_affinity: irq: 9 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.850146: irq_do_set_affinity: irq: 25 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.856549: irq_do_set_affinity: irq: 26 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.862920: irq_do_set_affinity: irq: 27 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.869300: irq_do_set_affinity: irq: 28 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.875685: irq_do_set_affinity: irq: 29 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 21.897267: irq_do_set_affinity: irq: 30 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 23.983226: irq_do_set_affinity: irq: 31 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 23.998459: irq_do_set_affinity: irq: 32 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 26.095152: irq_do_set_affinity: irq: 33 ret 2 mask: 0-23 eff: 0-5 kworker/0:3-1458 [000] d... 28.497033: irq_do_set_affinity: irq: 16 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 28.715688: irq_do_set_affinity: irq: 8 ret 2 mask: 0-23 eff: 0-5 systemd-1 [120] d... 29.163740: irq_do_set_affinity: irq: 4 ret 2 mask: 0-23 eff: 0-5 kworker/0:1-134 [000] d... 30.625367: irq_do_set_affinity: irq: 34 ret 2 mask: 0-1 eff: 0-1 kworker/0:1-134 [000] d... 30.625400: irq_do_set_affinity: irq: 35 ret 2 mask: 2-3 eff: 2-3 kworker/0:1-134 [000] d... 30.625442: irq_do_set_affinity: irq: 36 ret 2 mask: 4-5 eff: 4-5 kworker/0:1-134 [000] d... 30.625474: irq_do_set_affinity: irq: 37 ret 2 mask: 6-7 eff: 6-7 kworker/0:1-134 [000] d... 30.625513: irq_do_set_affinity: irq: 38 ret 2 mask: 8-9 eff: 8-9 kworker/0:1-134 [000] d... 30.625549: irq_do_set_affinity: irq: 39 ret 2 mask: 10-11 eff: 10-11 kworker/0:1-134 [000] d... 30.625585: irq_do_set_affinity: irq: 40 ret 2 mask: 12-13 eff: 12-13 kworker/0:1-134 [000] d... 30.625621: irq_do_set_affinity: irq: 41 ret 2 mask: 14-15 eff: 14-15 kworker/0:1-134 [000] d... 30.625656: irq_do_set_affinity: irq: 42 ret 2 mask: 16-17 eff: 16-17 kworker/0:1-134 [000] d... 30.625692: irq_do_set_affinity: irq: 43 ret 2 mask: 18-19 eff: 18-19 kworker/0:1-134 [000] d... 30.625732: irq_do_set_affinity: irq: 44 ret 2 mask: 20-21 eff: 20-21 kworker/0:1-134 [000] d... 30.625768: irq_do_set_affinity: irq: 45 ret 2 mask: 22-23 eff: 22-23 kworker/0:1-134 [000] d... 30.625801: irq_do_set_affinity: irq: 46 ret 2 mask: 24-25 eff: 24-25 kworker/0:1-134 [000] d... 30.625818: irq_do_set_affinity: irq: 47 ret 2 mask: 26-27 eff: 26-27 kworker/0:1-134 [000] d... 30.625843: irq_do_set_affinity: irq: 48 ret 2 mask: 28-29 eff: 28-29 kworker/0:1-134 [000] d... 30.625869: irq_do_set_affinity: irq: 49 ret 2 mask: 30-31 eff: 30-31 kworker/0:1-134 [000] d... 30.625897: irq_do_set_affinity: irq: 50 ret 2 mask: 32-33 eff: 32-33 kworker/0:1-134 [000] d... 30.625922: irq_do_set_affinity: irq: 51 ret 2 mask: 34-35 eff: 34-35 kworker/0:1-134 [000] d... 30.625947: irq_do_set_affinity: irq: 52 ret 2 mask: 36-37 eff: 36-37 kworker/0:1-134 [000] d... 30.625969: irq_do_set_affinity: irq: 53 ret 2 mask: 38-39 eff: 38-39 kworker/0:1-134 [000] d... 30.625992: irq_do_set_affinity: irq: 54 ret 2 mask: 40-41 eff: 40-41 kworker/0:1-134 [000] d... 30.626012: irq_do_set_affinity: irq: 55 ret 2 mask: 42-43 eff: 42-43 kworker/0:1-134 [000] d... 30.626032: irq_do_set_affinity: irq: 56 ret 2 mask: 44-45 eff: 44-45 kworker/0:1-134 [000] d... 30.626052: irq_do_set_affinity: irq: 57 ret 2 mask: 46-47 eff: 46-47 kworker/0:1-134 [000] d... 30.626088: irq_do_set_affinity: irq: 58 ret 2 mask: 48-49 eff: 48-49 kworker/0:1-134 [000] d... 30.626105: irq_do_set_affinity: irq: 59 ret 2 mask: 50-51 eff: 50-51 kworker/0:1-134 [000] d... 30.626118: irq_do_set_affinity: irq: 60 ret 2 mask: 52-53 eff: 52-53 kworker/0:1-134 [000] d... 30.626157: irq_do_set_affinity: irq: 61 ret 2 mask: 54-55 eff: 54-55 kworker/0:1-134 [000] d... 30.626185: irq_do_set_affinity: irq: 62 ret 2 mask: 56-57 eff: 56-57 kworker/0:1-134 [000] d... 30.626217: irq_do_set_affinity: irq: 63 ret 2 mask: 58-59 eff: 58-59 kworker/0:1-134 [000] d... 30.626243: irq_do_set_affinity: irq: 64 ret 2 mask: 60-61 eff: 60-61 kworker/0:1-134 [000] d... 30.626269: irq_do_set_affinity: irq: 65 ret 2 mask: 62-63 eff: 62-63 kworker/0:1-134 [000] d... 30.626299: irq_do_set_affinity: irq: 66 ret 2 mask: 64-65 eff: 64-65 kworker/0:1-134 [000] d... 30.626322: irq_do_set_affinity: irq: 67 ret 2 mask: 66-67 eff: 66-67 kworker/0:1-134 [000] d... 30.626346: irq_do_set_affinity: irq: 68 ret 2 mask: 68-69 eff: 68-69 kworker/0:1-134 [000] d... 30.626368: irq_do_set_affinity: irq: 69 ret 2 mask: 70-71 eff: 70-71 kworker/0:1-134 [000] d... 30.626390: irq_do_set_affinity: irq: 70 ret 2 mask: 72-73 eff: 72-73 kworker/0:1-134 [000] d... 30.626405: irq_do_set_affinity: irq: 71 ret 2 mask: 74-75 eff: 74-75 kworker/0:1-134 [000] d... 30.626417: irq_do_set_affinity: irq: 72 ret 2 mask: 76-77 eff: 76-77 kworker/0:1-134 [000] d... 30.626455: irq_do_set_affinity: irq: 73 ret 2 mask: 78-79 eff: 78-79 kworker/0:1-134 [000] d... 30.626483: irq_do_set_affinity: irq: 74 ret 2 mask: 80-81 eff: 80-81 kworker/0:1-134 [000] d... 30.626510: irq_do_set_affinity: irq: 75 ret 2 mask: 82-83 eff: 82-83 kworker/0:1-134 [000] d... 30.626535: irq_do_set_affinity: irq: 76 ret 2 mask: 84-85 eff: 84-85 kworker/0:1-134 [000] d... 30.626563: irq_do_set_affinity: irq: 77 ret 2 mask: 86-87 eff: 86-87 kworker/0:1-134 [000] d... 30.626585: irq_do_set_affinity: irq: 78 ret 2 mask: 88-89 eff: 88-89 kworker/0:1-134 [000] d... 30.626604: irq_do_set_affinity: irq: 79 ret 2 mask: 90-91 eff: 90-91 kworker/0:1-134 [000] d... 30.626624: irq_do_set_affinity: irq: 80 ret 2 mask: 92-93 eff: 92-93 kworker/0:1-134 [000] d... 30.626644: irq_do_set_affinity: irq: 81 ret 2 mask: 94-95 eff: 94-95 kworker/0:1-134 [000] d... 30.626665: irq_do_set_affinity: irq: 82 ret 2 mask: 96-97 eff: 96-97 kworker/0:1-134 [000] d... 30.626679: irq_do_set_affinity: irq: 83 ret 2 mask: 98-99 eff: 98-99 kworker/0:1-134 [000] d... 30.626693: irq_do_set_affinity: irq: 84 ret 2 mask: 100-101 eff: 100-101 kworker/0:1-134 [000] d... 30.626708: irq_do_set_affinity: irq: 85 ret 2 mask: 102-103 eff: 102-103 kworker/0:1-134 [000] d... 30.626750: irq_do_set_affinity: irq: 86 ret 2 mask: 104-105 eff: 104-105 kworker/0:1-134 [000] d... 30.626784: irq_do_set_affinity: irq: 87 ret 2 mask: 106-107 eff: 106-107 kworker/0:1-134 [000] d... 30.626814: irq_do_set_affinity: irq: 88 ret 2 mask: 108-109 eff: 108-109 kworker/0:1-134 [000] d... 30.626844: irq_do_set_affinity: irq: 89 ret 2 mask: 110-111 eff: 110-111 kworker/0:1-134 [000] d... 30.626872: irq_do_set_affinity: irq: 90 ret 2 mask: 112-113 eff: 112-113 kworker/0:1-134 [000] d... 30.626896: irq_do_set_affinity: irq: 91 ret 2 mask: 114-115 eff: 114-115 kworker/0:1-134 [000] d... 30.626928: irq_do_set_affinity: irq: 92 ret 2 mask: 116-117 eff: 116-117 kworker/0:1-134 [000] d... 30.626954: irq_do_set_affinity: irq: 93 ret 2 mask: 118-119 eff: 118-119 kworker/0:1-134 [000] d... 30.626975: irq_do_set_affinity: irq: 94 ret 2 mask: 120-121 eff: 120-121 kworker/0:1-134 [000] d... 30.626996: irq_do_set_affinity: irq: 95 ret 2 mask: 122-123 eff: 122-123 kworker/0:1-134 [000] d... 30.627022: irq_do_set_affinity: irq: 96 ret 2 mask: 124-125 eff: 124-125 kworker/0:1-134 [000] d... 30.627050: irq_do_set_affinity: irq: 97 ret 2 mask: 126-127 eff: 126-127 kworker/0:1-134 [000] d... 30.627081: irq_do_set_affinity: irq: 98 ret 2 mask: 128-129 eff: 128-129 kworker/0:1-134 [000] d... 30.627110: irq_do_set_affinity: irq: 99 ret 2 mask: 130-131 eff: 130-131 kworker/0:1-134 [000] d... 30.627137: irq_do_set_affinity: irq: 100 ret 2 mask: 132-133 eff: 132-133 kworker/0:1-134 [000] d... 30.627164: irq_do_set_affinity: irq: 101 ret 2 mask: 134-135 eff: 134-135 kworker/0:1-134 [000] d... 30.627191: irq_do_set_affinity: irq: 102 ret 2 mask: 136-137 eff: 136-137 kworker/0:1-134 [000] d... 30.627214: irq_do_set_affinity: irq: 103 ret 2 mask: 138-139 eff: 138-139 kworker/0:1-134 [000] d... 30.627238: irq_do_set_affinity: irq: 104 ret 2 mask: 140-141 eff: 140-141 kworker/0:1-134 [000] d... 30.627263: irq_do_set_affinity: irq: 105 ret 2 mask: 142-143 eff: 142-143 kworker/0:1-134 [000] d... 30.627283: irq_do_set_affinity: irq: 106 ret 2 mask: 144-145 eff: 144-145 kworker/0:1-134 [000] d... 30.627296: irq_do_set_affinity: irq: 107 ret 2 mask: 146-147 eff: 146-147 kworker/0:1-134 [000] d... 30.627311: irq_do_set_affinity: irq: 108 ret 2 mask: 148-149 eff: 148-149 kworker/0:1-134 [000] d... 30.627344: irq_do_set_affinity: irq: 109 ret 2 mask: 150-151 eff: 150-151 kworker/0:1-134 [000] d... 30.627377: irq_do_set_affinity: irq: 110 ret 2 mask: 152-153 eff: 152-153 kworker/0:1-134 [000] d... 30.627410: irq_do_set_affinity: irq: 111 ret 2 mask: 154-155 eff: 154-155 kworker/0:1-134 [000] d... 30.627437: irq_do_set_affinity: irq: 112 ret 2 mask: 156-157 eff: 156-157 kworker/0:1-134 [000] d... 30.627467: irq_do_set_affinity: irq: 113 ret 2 mask: 158-159 eff: 158-159 kworker/0:1-134 [000] d... 30.627494: irq_do_set_affinity: irq: 114 ret 2 mask: 160-161 eff: 160-161 kworker/0:1-134 [000] d... 30.627519: irq_do_set_affinity: irq: 115 ret 2 mask: 162-163 eff: 162-163 kworker/0:1-134 [000] d... 30.627545: irq_do_set_affinity: irq: 116 ret 2 mask: 164-165 eff: 164-165 kworker/0:1-134 [000] d... 30.627569: irq_do_set_affinity: irq: 117 ret 2 mask: 166-167 eff: 166-167 kworker/0:1-134 [000] d... 30.627589: irq_do_set_affinity: irq: 118 ret 2 mask: 168-169 eff: 168-169 kworker/0:1-134 [000] d... 30.627607: irq_do_set_affinity: irq: 119 ret 2 mask: 170-171 eff: 170-171 kworker/0:1-134 [000] d... 30.627639: irq_do_set_affinity: irq: 120 ret 2 mask: 172-173 eff: 172-173 kworker/0:1-134 [000] d... 30.627666: irq_do_set_affinity: irq: 121 ret 2 mask: 174-175 eff: 174-175 kworker/0:1-134 [000] d... 30.627691: irq_do_set_affinity: irq: 122 ret 2 mask: 176-177 eff: 176-177 kworker/0:1-134 [000] d... 30.627721: irq_do_set_affinity: irq: 123 ret 2 mask: 178-179 eff: 178-179 kworker/0:1-134 [000] d... 30.627748: irq_do_set_affinity: irq: 124 ret 2 mask: 180-181 eff: 180-181 kworker/0:1-134 [000] d... 30.627774: irq_do_set_affinity: irq: 125 ret 2 mask: 182-183 eff: 182-183 kworker/0:1-134 [000] d... 30.627799: irq_do_set_affinity: irq: 126 ret 2 mask: 184-185 eff: 184-185 kworker/0:1-134 [000] d... 30.627828: irq_do_set_affinity: irq: 127 ret 2 mask: 186-187 eff: 186 kworker/0:1-134 [000] d... 30.627850: irq_do_set_affinity: irq: 128 ret 2 mask: 188-189 eff: 188 kworker/0:1-134 [000] d... 30.627875: irq_do_set_affinity: irq: 129 ret 2 mask: 190-191 eff: 190 kworker/0:0-3 [000] d... 38.217213: irq_do_set_affinity: irq: 18 ret 2 mask: 0-23 eff: 0-5 systemd-udevd-2007 [129] d... 38.510108: irq_do_set_affinity: irq: 3 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732162: irq_do_set_affinity: irq: 131 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732195: irq_do_set_affinity: irq: 132 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732214: irq_do_set_affinity: irq: 133 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732229: irq_do_set_affinity: irq: 134 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732246: irq_do_set_affinity: irq: 135 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732261: irq_do_set_affinity: irq: 136 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732276: irq_do_set_affinity: irq: 137 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732292: irq_do_set_affinity: irq: 138 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.732308: irq_do_set_affinity: irq: 139 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865529: irq_do_set_affinity: irq: 140 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865557: irq_do_set_affinity: irq: 141 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865575: irq_do_set_affinity: irq: 142 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865591: irq_do_set_affinity: irq: 143 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865607: irq_do_set_affinity: irq: 144 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865621: irq_do_set_affinity: irq: 145 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865635: irq_do_set_affinity: irq: 146 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865650: irq_do_set_affinity: irq: 147 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 44.865664: irq_do_set_affinity: irq: 148 ret 2 mask: 0-23 eff: 0-5 NetworkManager-2628 [135] d... 45.041598: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 6-9,11,16,18-19,21-23,26-28,31,34-35,38,40-43,47-63 NetworkManager-2628 [135] d... 45.042054: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 6-8,10-11,16,18-19,21-23,26-28,31,34-35,38,40-43,47-63 NetworkManager-2628 [135] d... 45.150285: irq_do_set_affinity: irq: 130 ret 2 mask: 0-23 eff: 0-5 (agetty)-3134 [049] d... 55.930794: irq_do_set_affinity: irq: 4 ret 2 mask: 0-23 eff: 0-5 <...>-1346 [191] d..1 100.473714: irq_do_set_affinity: irq: 129 ret 2 mask: 190-191 eff: 190 <...>-1346 [191] d..1 100.473722: <stack trace> => native_cpu_disable => take_cpu_down => multi_cpu_stop => cpu_stopper_thread => smpboot_thread_fn => kthread => ret_from_fork <...>-1334 [189] d..1 700.567235: irq_do_set_affinity: irq: 128 ret 2 mask: 188-189 eff: 188 <...>-1334 [189] d..1 700.567243: <stack trace> => native_cpu_disable => take_cpu_down => multi_cpu_stop => cpu_stopper_thread => smpboot_thread_fn => kthread => ret_from_fork <...>-1322 [187] d..1 1300.660985: irq_do_set_affinity: irq: 127 ret 2 mask: 186-187 eff: 186 <...>-1322 [187] d..1 1300.660993: <stack trace> => native_cpu_disable => take_cpu_down => multi_cpu_stop => cpu_stopper_thread => smpboot_thread_fn => kthread => ret_from_fork --- Thanks, Yasuaki Ishimatsu On 10/10/2017 12:30 PM, YASUAKI ISHIMATSU wrote: > Hi Thomas, > > Sorry for the late reply. > > I'll apply the patches and retest in this week. > Please wait a while. > > Thanks, > Yasuaki Ishimatsu > > On 10/04/2017 05:04 PM, Thomas Gleixner wrote: >> On Tue, 3 Oct 2017, Thomas Gleixner wrote: >>> Can you please apply the debug patch below. >> >> I found an issue with managed interrupts when the affinity mask of an >> managed interrupt spawns multiple CPUs. Explanation in the changelog >> below. I'm not sure that this cures the problems you have, but at least I >> could prove that it's not doing what it should do. The failure I'm seing is >> fixed, but I can't test that megasas driver due to -ENOHARDWARE. >> >> Can you please apply the patch below on top of Linus tree and retest? >> >> Please send me the outputs I asked you to provide last time in any case >> (success or fail). >> >> @block/scsi folks: Can you please run that through your tests as well? >> >> Thanks, >> >> tglx >> >> 8<----------------------- >> Subject: genirq/cpuhotplug: Enforce affinity setting on startup of managed irqs >> From: Thomas Gleixner <tglx@xxxxxxxxxxxxx> >> Date: Wed, 04 Oct 2017 21:07:38 +0200 >> >> Managed interrupts can end up in a stale state on CPU hotplug. If the >> interrupt is not targeting a single CPU, i.e. the affinity mask spawns >> multiple CPUs then the following can happen: >> >> After boot: >> >> dstate: 0x01601200 >> IRQD_ACTIVATED >> IRQD_IRQ_STARTED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_SET >> IRQD_AFFINITY_MANAGED >> node: 0 >> affinity: 24-31 >> effectiv: 24 >> pending: 0 >> >> After offlining CPU 31 - 24 >> >> dstate: 0x01a31000 >> IRQD_IRQ_DISABLED >> IRQD_IRQ_MASKED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_SET >> IRQD_AFFINITY_MANAGED >> IRQD_MANAGED_SHUTDOWN >> node: 0 >> affinity: 24-31 >> effectiv: 24 >> pending: 0 >> >> Now CPU 25 gets onlined again, so it should get the effective interrupt >> affinity for this interruopt, but due to the x86 interrupt affinity setter >> restrictions this ends up after restarting the interrupt with: >> >> dstate: 0x01601300 >> IRQD_ACTIVATED >> IRQD_IRQ_STARTED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_SET >> IRQD_SETAFFINITY_PENDING >> IRQD_AFFINITY_MANAGED >> node: 0 >> affinity: 24-31 >> effectiv: 24 >> pending: 24-31 >> >> So the interrupt is still affine to CPU 24, which was the last CPU to go >> offline of that affinity set and the move to an online CPU within 24-31, >> in this case 25, is pending. This mechanism is x86/ia64 specific as those >> architectures cannot move interrupts from thread context and do this when >> an interrupt is actually handled. So the move is set to pending. >> >> Whats worse is that offlining CPU 25 again results in: >> >> dstate: 0x01601300 >> IRQD_ACTIVATED >> IRQD_IRQ_STARTED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_SET >> IRQD_SETAFFINITY_PENDING >> IRQD_AFFINITY_MANAGED >> node: 0 >> affinity: 24-31 >> effectiv: 24 >> pending: 24-31 >> >> This means the interrupt has not been shut down, because the outgoing CPU >> is not in the effective affinity mask, but of course nothing notices that >> the effective affinity mask is pointing at an offline CPU. >> >> In the case of restarting a managed interrupt the move restriction does not >> apply, so the affinity setting can be made unconditional. This needs to be >> done _before_ the interrupt is started up as otherwise the condition for >> moving it from thread context would not longer be fulfilled. >> >> With that change applied onlining CPU 25 after offlining 31-24 results in: >> >> dstate: 0x01600200 >> IRQD_ACTIVATED >> IRQD_IRQ_STARTED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_MANAGED >> node: 0 >> affinity: 24-31 >> effectiv: 25 >> pending: >> >> And after offlining CPU 25: >> >> dstate: 0x01a30000 >> IRQD_IRQ_DISABLED >> IRQD_IRQ_MASKED >> IRQD_SINGLE_TARGET >> IRQD_AFFINITY_MANAGED >> IRQD_MANAGED_SHUTDOWN >> node: 0 >> affinity: 24-31 >> effectiv: 25 >> pending: >> >> which is the correct and expected result. >> >> To complete that, add some debug code to catch this kind of situation in >> the cpu offline code and warn about interrupt chips which allow affinity >> setting and do not update the effective affinity mask if that feature is >> enabled. >> >> Reported-by: YASUAKI ISHIMATSU <yasu.isimatu@xxxxxxxxx> >> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> >> >> --- >> kernel/irq/chip.c | 2 +- >> kernel/irq/cpuhotplug.c | 28 +++++++++++++++++++++++++++- >> kernel/irq/manage.c | 17 +++++++++++++++++ >> 3 files changed, 45 insertions(+), 2 deletions(-) >> >> --- a/kernel/irq/chip.c >> +++ b/kernel/irq/chip.c >> @@ -265,8 +265,8 @@ int irq_startup(struct irq_desc *desc, b >> irq_setup_affinity(desc); >> break; >> case IRQ_STARTUP_MANAGED: >> + irq_do_set_affinity(d, aff, false); >> ret = __irq_startup(desc); >> - irq_set_affinity_locked(d, aff, false); >> break; >> case IRQ_STARTUP_ABORT: >> return 0; >> --- a/kernel/irq/cpuhotplug.c >> +++ b/kernel/irq/cpuhotplug.c >> @@ -18,8 +18,34 @@ >> static inline bool irq_needs_fixup(struct irq_data *d) >> { >> const struct cpumask *m = irq_data_get_effective_affinity_mask(d); >> + unsigned int cpu = smp_processor_id(); >> >> - return cpumask_test_cpu(smp_processor_id(), m); >> +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK >> + /* >> + * The cpumask_empty() check is a workaround for interrupt chips, >> + * which do not implement effective affinity, but the architecture has >> + * enabled the config switch. Use the general affinity mask instead. >> + */ >> + if (cpumask_empty(m)) >> + m = irq_data_get_affinity_mask(d); >> + >> + /* >> + * Sanity check. If the mask is not empty when excluding the outgoing >> + * CPU then it must contain at least one online CPU. The outgoing CPU >> + * has been removed from the online mask already. >> + */ >> + if (cpumask_any_but(m, cpu) < nr_cpu_ids && >> + cpumask_any_and(m, cpu_online_mask) >= nr_cpu_ids) { >> + /* >> + * If this happens then there was a missed IRQ fixup at some >> + * point. Warn about it and enforce fixup. >> + */ >> + pr_warn("Eff. affinity %*pbl of IRQ %u contains only offline CPUs after offlining CPU %u\n", >> + cpumask_pr_args(m), d->irq, cpu); >> + return true; >> + } >> +#endif >> + return cpumask_test_cpu(cpu, m); >> } >> >> static bool migrate_one_irq(struct irq_desc *desc) >> --- a/kernel/irq/manage.c >> +++ b/kernel/irq/manage.c >> @@ -168,6 +168,19 @@ void irq_set_thread_affinity(struct irq_ >> set_bit(IRQTF_AFFINITY, &action->thread_flags); >> } >> >> +static void irq_validate_effective_affinity(struct irq_data *data) >> +{ >> +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK >> + const struct cpumask *m = irq_data_get_effective_affinity_mask(data); >> + struct irq_chip *chip = irq_data_get_irq_chip(data); >> + >> + if (!cpumask_empty(m)) >> + return; >> + pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n", >> + chip->name, data->irq); >> +#endif >> +} >> + >> int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, >> bool force) >> { >> @@ -175,12 +188,16 @@ int irq_do_set_affinity(struct irq_data >> struct irq_chip *chip = irq_data_get_irq_chip(data); >> int ret; >> >> + if (!chip || !chip->irq_set_affinity) >> + return -EINVAL; >> + >> ret = chip->irq_set_affinity(data, mask, force); >> switch (ret) { >> case IRQ_SET_MASK_OK: >> case IRQ_SET_MASK_OK_DONE: >> cpumask_copy(desc->irq_common_data.affinity, mask); >> case IRQ_SET_MASK_OK_NOCOPY: >> + irq_validate_effective_affinity(data); >> irq_set_thread_affinity(desc); >> ret = 0; >> } >>