Hello,

When using the nvme-tcp driver in a storage cluster, the driver may trigger a NULL pointer dereference, which has caused the host to crash several times.
By analyzing the vmcore, we found that the direct cause is that request->mq_hctx was used after being freed.

CPU1                                      CPU2
nvme_tcp_poll                             nvme_tcp_try_send  -- failed to send request, error -13
  nvme_tcp_try_recv                         nvme_tcp_fail_request
    nvme_tcp_recv_skb                         nvme_tcp_end_request
      nvme_tcp_recv_pdu                         nvme_complete_rq
        nvme_tcp_handle_comp                      nvme_retry_req  -- request->mq_hctx has been freed, is NULL.
          nvme_tcp_process_nvme_cqe
            nvme_complete_rq
              nvme_end_req
                blk_mq_end_request

When nvme_tcp_try_send fails to send a request with error -13 (this may be caused by SELinux or other reasons, and is a problem in itself), nvme_tcp_fail_request is executed.
But nvme_tcp_recv_pdu may already have received the corresponding response PDU, and nvme_tcp_process_nvme_cqe would have completed the request. As a result, request->mq_hctx is used after being freed.

The following patch is intended to solve it. Could you give some suggestions? Thanks!

diff --git a/linux/drivers/nvme/host/core.c b/linux/drivers/nvme/host/core.c
index a65b1dce8..417466674 100644
--- a/linux/drivers/nvme/host/core.c
+++ b/linux/drivers/nvme/host/core.c
@@ -288,6 +288,9 @@ static void nvme_retry_req(struct request *req)
 	unsigned long delay = 0;
 	u16 crd;

+	if(!req->mq_hctx && req->state == MQ_RQ_IDLE)
+		return;
+
 	/* The mask and shift result must be <= 3 */
 	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;

The details are as follows:

[35665.692310] nvme nvme2: failed to send request -13 [35665.692683] nvme nvme2: queue 1 failed to send request 00000000b42f4e2b state 2 pdu 00000000d7fb8da3 type 4 rq_state 1 nrq_status 0 [35665.693323] nvme nvme2: failed to send rq 00000000f86a68c3 state 2 nrq_status 370 [35665.702265] nvme nvme2: unsupported pdu type (3) [35665.702272] BUG: kernel NULL pointer dereference, address: 0000000000000000 [35665.702542] nvme nvme2: queue 1 receive failed: -22 [35665.703209] #PF: supervisor write access in kernel mode [35665.703213] #PF: 
error_code(0x0002) - not-present page [35665.703214] PGD 8000003801cce067 P4D 8000003801cce067 PUD 37e6f79067 PMD 0 [35665.703220] Oops: 0002 [#1] SMP PTI [35665.703658] nvme nvme2: starting error recovery [35665.704442] CPU: 20 PID: 815 Comm: kworker/20:1H Kdump: loaded Not tainted 5.15.131-17.cl9.x86_64 #1 [35665.705168] nvme nvme2: queue 1 receive again after receive failed [35665.705809] Hardware name: Inspur aaabbb/YZMB-00882-104, BIOS 4.1.26 09/22/2022 [35665.705812] Workqueue: kblockd blk_mq_requeue_work [35665.709172] RIP: 0010:_raw_spin_lock+0xc/0x30 [35665.709606] Code: 05 c3 cc cc cc cc 89 c6 e8 31 05 68 ff 66 90 c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 c0 ba 01 00 00 00 <f0> 0f b1 17 75 05 c3 cc cc cc cc 89 c6 e8 02 05 68 ff 66 90 c3 cc [35665.710470] RSP: 0000:ffffa67bcd797e08 EFLAGS: 00010246 [35665.710925] RAX: 0000000000000000 RBX: ffff92f6bbcc9840 RCX: ffff92f6bbcc9888 [35665.711393] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000 [35665.711849] RBP: 0000000000000000 R08: ffffa67bcd797e48 R09: ffff932346d576f4 [35665.712275] R10: 0000000000000008 R11: 0000000000000008 R12: 0000000000000000 [35665.712725] R13: ffff92f6bbcc9888 R14: 0000000000000008 R15: 0000000000000000 [35665.713158] FS: 0000000000000000(0000) GS:ffff93527d400000(0000) knlGS:0000000000000000 [35665.713603] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [35665.714063] CR2: 0000000000000000 CR3: 000000371aa02006 CR4: 00000000007706e0 [35665.714534] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [35665.714961] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [35665.715359] PKRU: 55555554 [35665.715788] Call Trace: [35665.716201] <TASK> [35665.716613] ? show_trace_log_lvl+0x1c1/0x2d9 [35665.717049] ? show_trace_log_lvl+0x1c1/0x2d9 [35665.717457] ? blk_mq_request_bypass_insert+0x2c/0xb0 [35665.717950] ? __die_body.cold+0x8/0xd [35665.718361] ? page_fault_oops+0xac/0x140 [35665.718749] ? 
blk_mq_start_request+0x30/0xf0 [35665.719144] ? nvme_tcp_queue_rq+0xc7/0x170 [nvme_tcp] [35665.719547] ? exc_page_fault+0x62/0x130 [35665.719938] ? asm_exc_page_fault+0x22/0x30 [35665.720333] ? _raw_spin_lock+0xc/0x30 [35665.720723] blk_mq_request_bypass_insert+0x2c/0xb0 [35665.721101] blk_mq_requeue_work+0xa5/0x180 [35665.721451] process_one_work+0x1e8/0x390 [35665.721809] worker_thread+0x53/0x3d0 [35665.722159] ? process_one_work+0x390/0x390 [35665.722501] kthread+0x124/0x150 [35665.722849] ? set_kthread_struct+0x50/0x50 [35665.723182] ret_from_fork+0x1f/0x30 [35665.723508] </TASK> crash> struct nvme_tcp_request ffff92f6bbcc9950 struct nvme_tcp_request { req = { cmd = 0xffff92f5b83f6748, result = { u16 = 0, u32 = 0, u64 = 0 }, genctr = 169 '\251', retries = 1 '\001', flags = 0 '\000', status = 6, ctrl = 0xffff92f5e5df7348 }, pdu = 0xffff92f5b83f6740, queue = 0xffff92f407cc9128, data_len = 4096, pdu_len = 4096, pdu_sent = 0, h2cdata_left = 0, h2cdata_offset = 0, ttag = 62, status = 12, entry = { next = 0xdead000000000100, prev = 0xdead000000000122 }, lentry = { next = 0x0 }, ddgst = 0, curr_bio = 0xffff9324d639e240, iter = { iter_type = 2 '\002', nofault = false, data_source = true, iov_offset = 0, count = 4096, { iov = 0xffff92f6bbcc98a8, kvec = 0xffff92f6bbcc98a8, bvec = 0xffff92f6bbcc98a8, xarray = 0xffff92f6bbcc98a8, pipe = 0xffff92f6bbcc98a8 }, { nr_segs = 1, { head = 1, start_head = 0 }, xarray_start = 1 } }, offset = 0, data_sent = 0, state = NVME_TCP_SEND_DATA } crash> nvme_tcp_hdr.type 0xffff92f5b83f6740 type = 4 '\004', crash> crash> struct request ffff92f6bbcc9840 struct request { q = 0xffff92f59d55c240, mq_ctx = 0xffffc67bb9a1f040, mq_hctx = 0x0, cmd_flags = 33556483, rq_flags = 139456, tag = 87, internal_tag = -1, __data_len = 0, __sector = 66846720, bio = 0x0, biotail = 0x0, queuelist = { next = 0xffff92f6bbcc9888, prev = 0xffff92f6bbcc9888 }, { hash = { next = 0x0, pprev = 0x0 }, ipi_list = { next = 0x0 } }, { rb_node = { __rb_parent_color = 
18446685131795018112, rb_right = 0x1000, rb_left = 0x0 }, special_vec = { bv_page = 0xffffca64841f3180, bv_len = 4096, bv_offset = 0 }, completion_data = 0xffffca64841f3180, error_count = -2078330496 }, { elv = { icq = 0x0, priv = {0xffff92f6bbcc98c8, 0xffff92f6bbcc98c8} }, flush = { seq = 0, list = { next = 0xffff92f6bbcc98c8, prev = 0xffff92f6bbcc98c8 }, saved_end_io = 0x0 } }, rq_disk = 0xffff92f6bdbff600, part = 0xffff92f59f557800, start_time_ns = 35665692557229, io_start_time_ns = 35665692566268, wbt_flags = 0, stats_sectors = 0, nr_phys_segments = 1, nr_integrity_segments = 0, write_hint = 0, ioprio = 0, state = MQ_RQ_IDLE, ref = { refs = { counter = 0 } }, timeout = 180000, deadline = 4330512774, { csd = { node = { llist = { next = 0x0 }, { u_flags = 0, a_flags = { counter = 0 } }, src = 0, dst = 0 }, func = 0xffffffff8eed62d0 <__blk_mq_complete_request_remote>, info = 0xffff92f6bbcc9840 }, fifo_time = 0 }, end_io = 0x0, end_io_data = 0x0 } Best regards zhang.guanghui@xxxxxxxx