Hi Nic, On 2/11/16, 3:47 PM, "Nicholas A. Bellinger" <nab@xxxxxxxxxxxxxxx> wrote: >On Wed, 2016-02-10 at 22:53 -0800, Nicholas A. Bellinger wrote: >> On Tue, 2016-02-09 at 18:03 +0000, Himanshu Madhani wrote: >> > On 2/8/16, 9:25 PM, "Nicholas A. Bellinger" <nab@xxxxxxxxxxxxxxx> >>wrote: >> > >On Mon, 2016-02-08 at 23:27 +0000, Himanshu Madhani wrote: >> > >> >> > >> I am testing this series with with 4.5.0-rc2+ kernel and I am >>seeing >> > >>issue >> > >> where trying to trigger >> > >> sg_reset with option of host/device/bus in loop at 120second >>interval >> > >> causes call stack. At this point >> > >> removing configuration hangs indefinitely. See attached dmesg >>output >> > >>from >> > >> my setup. >> > >> >> > > >> > >Thanks alot for testing this. >> > > >> > >So It looks like we're still hitting a indefinite schedule() on >> > >se_cmd->cmd_wait_comp once tcm_qla2xxx session disconnect/reconnect >> > >occurs, after repeated explicit active I/O remote-port sg_resets. >> > > >> > >Does this trigger on the first tcm_qla2xxx session reconnect after >> > >explicit remote-port sg_reset..? Are session reconnects actively >>being >> > >triggered during the test..? >> > > >> > >To verify the latter for iscsi-target, I've been using a small patch >>to >> > >trigger session reset from TMR kthread context in order to simulate >>the >> > >I_T disconnects. Something like that would be useful for verifying >>with >> > >tcm_qla2xxx too. >> > > >> > >That said, I'll be reproducing with tcm_qla2xxx ports this week, and >> > >will enable various debug in a WIP branch for testing. >> >> Following up here.. >> >> So far using my test setup with ISP2532 ports in P2P + RAMDISK_MCP and >> v4.5-rc1, repeated remote-port active I/O LUN_RESET (sg_reset -d) has >> been functioning as expected with a blocksize_range=4k-256k + iodepth=32 >> fio write-verify style workload. 
>> >> No ->cmd_kref -1 OOPsen or qla2xxx initiator generated ABORT_TASKs from >> outstanding target TAS responses, nor fio write-verify failures to >> report after 800x remote-port active I/O LUN_RESETS. >> >> Next step will be to verify explicit tcm_qla2xxx port + module shutdown >> after 1K test iterations, and then IBLOCK async completions <-> NVMe >> backends with the same case. >> > >After letting this test run over-night up to 7k active I/O remote-port >LUN_RESETs, things are still functioning as expected. > >Also, /etc/init.d/target stop was able to successfully shutdown all >active sessions and unload tcm_qla2xxx after the test run. > >So AFAICT, the active I/O remote-port LUN_RESET changes are stable with >tcm_qla2xxx ports, separate from concurrent session disconnect hung task >you reported earlier. > >That said, I'll likely push this series as-is for -rc4, given that Dan >has also been able to verify the non conncurrent session disconnect case >on his setup generating constant ABORT_TASKs, and it's still surviving >both cases for iscsi-target ports. > >Please give the debug patch from last night a shot, and see if we can >determine the se_cmd states when you hit the hung task. I'll give your latest debug patch a try in a little while. From the testing that I have done, what is seen is that active IO has already been completed and the qla2xxx driver is waiting for commands to be completed; it's waiting indefinitely on cmd_wait_comp. So it looks like there is a missing complete call from target_core. I've attached our analysis from crash debug on a live system after the issue happens. I can recreate this issue at will within 5 minutes of triggering sg_reset with the following steps: 1. Export 4 RAM disk LUNs on each port of a 2-port adapter. The initiator will see 8 RAM disk targets. 2. Start IO with 4K block size and 8 threads, 80% write / 20% read, and 100% random. (I am using vdbench for generating IO. I can provide the setup/config script if needed.) 3. 
Start sg_reset for each LUN — first with device, then bus, then host — with a 120s delay. (I've attached my script that I am using for triggering sg_reset.) > >Thank you, > >-nab >
crash> bt PID: 305 TASK: ffff880829d00680 CPU: 11 COMMAND: "kworker/11:1" #0 [ffff880829d076d0] __schedule at ffffffff81628c3f #1 [ffff880829d077f0] schedule at ffffffff816292f0 #2 [ffff880829d07840] schedule_timeout at ffffffff8162d2b8 #3 [ffff880829d07940] wait_for_completion at ffffffff8162a2dc #4 [ffff880829d079b0] target_wait_for_sess_cmds at ffffffffa05b4e4d [target_core_mod] #5 [ffff880829d07a20] tcm_qla2xxx_free_session at ffffffffa06e8129 [tcm_qla2xxx] #6 [ffff880829d07a40] qlt_free_session_done at ffffffffa0693429 [qla2xxx] #7 [ffff880829d07ba0] process_one_work at ffffffff8107d797 #8 [ffff880829d07d10] worker_thread at ffffffff8107e5dd #9 [ffff880829d07e40] kthread at ffffffff81083a44 #10 [ffff880829d07f50] ret_from_fork at ffffffff8162ec9f #2 [ffff880829d07840] schedule_timeout at ffffffff8162d2b8 ffff880829d07848: ffff880829d012d8 0000000000000002 ffff880829d07858: ffff880829d012d8 00000002810ae34a ffff880829d07868: 0000000000000003 ffffffff8162a2d4 ffff880829d07878: 0000000b00000000 0000000000000046 ffff880829d07888: ffff88071266cfc0 ffff88071266cfe0 ffff880829d07898: ffffffff8162a2d4 0000000000000001 ffff880829d078a8: ffff880829d07918 0000000000000046 ffff880829d078b8: 0000000000000000 ffff880829d00680 ffff880829d078c8: 0000000000000046 ffff880712c052a0 ffff880829d078d8: ffff880700000000 ffff88071266d078 ffff880829d078e8: ffff880829d07908 ffff88071266cfc8 ffff880829d078f8: ffff880700000001 ffff880829d07948 ffff880829d07908: ffff880829d00680 ffff88071266cfc8 ffff880829d07918: ffff88071266cfc0<< ffff880829d07948 se_cmd.cmd_wait_comp = ffff88071266cfc0 ffff880829d07928: ffff880829d00680 7fffffffffffffff ffff880829d07938: ffff880829d079a8 ffffffff8162a2dc #3 [ffff880829d07940] wait_for_completion at ffffffff8162a2dc crash> se_cmd.cmd_wait_comp -ox struct se_cmd { [0x98] struct completion cmd_wait_comp; } crash> p/x 0xffff88071266cfc0-0x98 $7 = 0xffff88071266cf28 qla_tgt_cmd -x 0xffff88071266cf28 struct qla_tgt_cmd { se_cmd = { scsi_status = 0x2, scsi_asc 
= 0x0, scsi_ascq = 0x0, scsi_sense_length = 0x60, cmd_wait_set = 0x1, unknown_data_length = 0x0, state_active = 0x0, tag = 0x1314d0, alua_nonop_delay = 0x0, data_direction = DMA_TO_DEVICE, sam_task_attr = 0x20, map_tag = 0x7e9, t_state = TRANSPORT_WRITE_PENDING, se_cmd_flags = 0x80090d, SCF_SUPPORTED_SAM_OPCODE SCF_USE_CPUID/¬SCF_SUPPORTED_SAM_OPCODE data_length = 0x1000, residual_count = 0x0, orig_fe_lun = 0x3, pr_res_key = 0x0, sense_buffer = 0xffff88071266d2a8, se_delayed_node = { next = 0xffff88071266cf70, prev = 0xffff88071266cf70 }, se_qf_node = { next = 0xffff88071266cf80, prev = 0xffff88071266cf80 }, se_dev = 0xffff880753c12000, se_lun = 0x0, se_sess = 0xffff880712c05240, se_tmr_req = 0x0, se_cmd_list = { next = 0xffff88071266cfb0, prev = 0xffff88071266cfb0 }, cmd_wait_comp = { done = 0x0, wait = { lock = { { rlock = { raw_lock = { val = { counter = 0x0 } }, magic = 0xdead4ead, owner_cpu = 0xffffffff, owner = 0xffffffffffffffff, dep_map = { key = 0xffffffffa05d9c20, class_cache = {0xffffffff8246c460 <lock_classes+761856>, 0x0}, name = 0xffffffffa05c65ce "&x->wait", cpu = 0xb, ip = 0xffffffff8162a246 } }, { __padding = "\000\000\000\000\255N\255\336\377\377\377\377\000\000\000\000\377\377\377\377\377\377\377\377", dep_map = { key = 0xffffffffa05d9c20, class_cache = {0xffffffff8246c460 <lock_classes+761856>, 0x0}, name = 0xffffffffa05c65ce "&x->wait", cpu = 0xb, ip = 0xffffffff8162a246 } } } }, task_list = { next = 0xffff880829d07960, prev = 0xffff880829d07960 } } }, se_tfo = 0xffffffffa06ea000 <tcm_qla2xxx_ops>, execute_cmd = 0xffffffffa05b69f0, transport_complete_callback = 0x0, protocol_data = 0xffffffffa05d59a0, t_task_cdb = 0xffff88071266d048 "*", __t_task_cdb = "*\000\000\n\372(\000\000\b\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000", t_task_lba = 0xafa28, t_task_nolb = 0x0, transport_state = 0x880, t_state_lock = { { rlock = { raw_lock = { val = { counter = 0x0 } }, magic = 0xdead4ead, owner_cpu = 0xffffffff, 
owner = 0xffffffffffffffff, dep_map = { key = 0xffffffffa05d9c18, class_cache = {0xffffffff82468470 <lock_classes+745488>, 0x0}, name = 0xffffffffa05c65d7 "&(&cmd->t_state_lock)->rlock", cpu = 0x9, ip = 0xffffffffa05b1a72 } }, { __padding = "\000\000\000\000\255N\255\336\377\377\377\377\000\000\000\000\377\377\377\377\377\377\377\377", dep_map = { key = 0xffffffffa05d9c18, class_cache = {0xffffffff82468470 <lock_classes+745488>, 0x0}, name = 0xffffffffa05c65d7 "&(&cmd->t_state_lock)->rlock", cpu = 0x9, ip = 0xffffffffa05b1a72 } } } }, cmd_kref = { refcount = { counter = 0x0 } }, t_transport_stop_comp = { done = 0x0, wait = { lock = { { rlock = { raw_lock = { val = { counter = 0x0 } }, magic = 0xdead4ead, owner_cpu = 0xffffffff, owner = 0xffffffffffffffff, dep_map = { key = 0xffffffffa05d9c20, class_cache = {0x0, 0x0}, name = 0xffffffffa05c65ce "&x->wait", cpu = 0x9, ip = 0x0 } }, { __padding = "\000\000\000\000\255N\255\336\377\377\377\377\000\000\000\000\377\377\377\377\377\377\377\377", dep_map = { key = 0xffffffffa05d9c20, class_cache = {0x0, 0x0}, name = 0xffffffffa05c65ce "&x->wait", cpu = 0x9, ip = 0x0 } } } }, task_list = { next = 0xffff88071266d118, prev = 0xffff88071266d118 } } }, work = { data = { counter = 0x0 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0, lockdep_map = { key = 0x0, class_cache = {0x0, 0x0}, name = 0x0, cpu = 0x0, ip = 0x0 } }, t_data_sg = 0x0, t_data_sg_orig = 0x0, t_data_nents = 0x0, t_data_nents_orig = 0x0, t_data_vmap = 0x0, t_bidi_data_sg = 0x0, t_bidi_data_nents = 0x0, lun_ref_active = 0x0, state_list = { next = 0xdead000000000100, prev = 0xdead000000000200 }, priv = 0x0, prot_op = TARGET_PROT_NORMAL, prot_type = TARGET_DIF_TYPE0_PROT, prot_checks = 0x0, prot_pto = 0x0, prot_length = 0x0, reftag_seed = 0x0, t_prot_sg = 0x0, t_prot_nents = 0x0, pi_err = 0x0, bad_sector = 0x0, cpuid = 0x9 }, sess = 0xffff880829271100, state = 0x3, QLA_TGT_STATE_PROCESSED/queue_status(scsi_status 2) free_work = { data = { counter = 0x0 }, entry = 
{ next = 0x0, prev = 0x0 }, func = 0x0, lockdep_map = { key = 0x0, class_cache = {0x0, 0x0}, name = 0x0, cpu = 0x0, ip = 0x0 } }, work = { data = { counter = 0x240 }, entry = { next = 0xffff88071266d260, prev = 0xffff88071266d260 }, func = 0xffffffffa06e8190 <tcm_qla2xxx_complete_free>, lockdep_map = { key = 0xffffffffa06eb2e0 <__key.90144>, class_cache = {0x0, 0x0}, name = 0xffffffffa06e9287 "(&cmd->work)", cpu = 0x9, ip = 0x0 } }, sense_buffer = "p\000\v\000\000\000\000\n\000\000\000\000)\003\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000", cmd_lock = { { rlock = { raw_lock = { val = { counter = 0x0 } }, magic = 0xdead4ead, owner_cpu = 0xffffffff, owner = 0xffffffffffffffff, dep_map = { key = 0xffffffffa06be0ac, class_cache = {0xffffffff824699c0 <lock_classes+750944>, 0x0}, name = 0xffffffffa06b82d8 "&(&cmd->cmd_lock)->rlock", cpu = 0x9, ip = 0xffffffffa06e82f9 } }, { __padding = "\000\000\000\000\255N\255\336\377\377\377\377\000\000\000\000\377\377\377\377\377\377\377\377", dep_map = { key = 0xffffffffa06be0ac, class_cache = {0xffffffff824699c0 <lock_classes+750944>, 0x0}, name = 0xffffffffa06b82d8 "&(&cmd->cmd_lock)->rlock", cpu = 0x9, ip = 0xffffffffa06e82f9 } } } }, conf_compl_supported = 0x0, sg_mapped = 0x0, free_sg = 0x0, write_data_transferred = 0x0, ctx_dsd_alloced = 0x0, q_full = 0x0, term_exchg = 0x0, cmd_sent_to_fw = 0x0, cmd_in_wq = 0x0, aborted = 0x0, sg = 0x0, sg_cnt = 0x0, bufflen = 0x1000, offset = 0x0, unpacked_lun = 0x3, dma_data_direction = DMA_FROM_DEVICE, reset_count = 0x1, loop_id = 0x4, tgt = 0xffff88083f845400, vha = 0xffff88081fff8d28, cmd_list = { next = 0xdead000000000100, prev = 0xdead000000000200 }, atio = { u = { isp2x = { entry_hdr = 0x106, sys_define = 0x38, 
entry_status = 0x0, sys_define_2 = 0x1314d0, target = { extended = 0x106, id = { reserved = 0x6, standard = 0x1 } }, rx_id = 0x8, flags = 0x100, status = 0x17, command_ref = 0x8, task_codes = 0x29, task_flags = 0x0, execution_codes = 0x0, cdb = "\001\000\000\000\006\307\377\377\000\000\000\000\000\003\000", data_length = 0x0, lun = 0x0, initiator_port_name = "\000\001*\000\000\n\372(", reserved_32 = {0x0, 0x8, 0x0, 0x0, 0x0, 0x0}, ox_id = 0x10 }, isp24 = { entry_hdr = 0x106, fcp_cmnd_len_low = 0x38, fcp_cmnd_len_high = 0x0, attr = 0x0, exchange_addr = 0x1314d0, fcp_hdr = { r_ctl = 0x6, d_id = "\001\b", cs_ctl = 0x0, s_id = "\001\027", type = 0x8, f_ctl = ")\000", seq_id = 0x1, df_ctl = 0x0, seq_cnt = 0x0, ox_id = 0xc706, rx_id = 0xffff, parameter = 0x0 }, fcp_cmnd = { lun = 0x300, cmnd_ref = 0x0, task_attr = 0x0, reserved = 0x0, task_mgmt_flags = 0x0, wrdata = 0x1, rddata = 0x0, add_cdb_len = 0x0, cdb = "*\000\000\n\372(\000\000\b\000\000\000\000\000\000", add_cdb = "\000\000\020" } }, raw = { entry_type = 0x6, entry_count = 0x1, data = "8\000\320\024\023\000\006\001\b\000\000\001\027\000\b)\000\000\001\000\000\000\006\307\377\377\000\000\000\000\000\003\000\000\000\000\000\000\000\000\000\001*\000\000\n\372(\000\000\b\000\000\000\000\000\000", signature = 0x100000 } } }, prot_sg = 0x0, prot_sg_cnt = 0x0, blk_sz = 0x200, ctx = 0x0, jiffies_at_alloc = 0x1053c6a2c, jiffies_at_free = 0x1053c729e, cmd_flags = (CMD_FLAG_DATA_WORK | unknown: 1131563) }
Attachment:
sg_reset.sh
Description: sg_reset.sh