Hi Alan, I've got another USB bug to report that manifests during automated device removal testing on RHEL7. This one hits the BUG() inside qh_destroy: PID: 139 TASK: ffff881054101960 CPU: 22 COMMAND: "kworker/u66:0" #0 [ffff881054113540] machine_kexec at ffffffff810411b1 #1 [ffff881054113598] crash_kexec at ffffffff810cf272 #2 [ffff881054113668] oops_end at ffffffff815eb548 #3 [ffff881054113690] die at ffffffff8101632b #4 [ffff8810541136c0] do_trap at ffffffff815eac20 #5 [ffff881054113710] do_invalid_op at ffffffff81013134 #6 [ffff8810541137c0] invalid_op at ffffffff815f485e [exception RIP: qh_destroy+0x66] RIP: ffffffff81412066 RSP: ffff881054113870 RFLAGS: 00010246 RAX: ffff88084b84dc98 RBX: ffff88084b84dc80 RCX: ffff88107fdf0874 RDX: 0000000000000056 RSI: ffff88084b84dc80 RDI: ffff880078f12740 RBP: ffff881054113880 R8: ffffffff819e4ba0 R9: 084f8036100c0000 R10: f79281ddd0018403 R11: 0000000000000400 R12: ffff880078f12740 R13: ffff880078f1282c R14: ffff8810405745b8 R15: 0000000000000048 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff881054113888] ehci_mem_cleanup at ffffffff814132b8 #8 [ffff8810541138c8] ehci_stop at ffffffff81417a92 #9 [ffff881054113950] usb_remove_hcd at ffffffff813f8d38 #10 [ffff881054113980] usb_hcd_pci_remove at ffffffff8140a15f #11 [ffff8810541139a8] pci_device_remove at ffffffff812ec5bb #12 [ffff8810541139d0] __device_release_driver at ffffffff813b6b9f #13 [ffff8810541139f0] device_release_driver at ffffffff813b6c33 #14 [ffff881054113a10] bus_remove_device at ffffffff813b63a8 #15 [ffff881054113a38] device_del at ffffffff813b2955 #16 [ffff881054113a68] pci_stop_bus_device at ffffffff812e54b4 #17 [ffff881054113a90] pci_stop_bus_device at ffffffff812e545b #18 [ffff881054113ab8] pci_stop_bus_device at ffffffff812e545b #19 [ffff881054113ae0] pci_stop_and_remove_bus_device at ffffffff812e55a2 #20 [ffff881054113af8] ioboard_bringdown at ffffffffa0400917 [ftmod] #21 [ffff881054113b20] ftmod_ioboard_event at ffffffffa0400c07 [ftmod] #22 [ffff881054113ba8] io_state_change at ffffffffa03fe7b2 [ftmod] #23 [ffff881054113bf8] OsIoStateChange at ffffffffa06d25d4 [ccmod] #24 [ffff881054113c10] IoStateChange at ffffffffa06d3008 [ccmod] #25 [ffff881054113d40] CcIoPoll at ffffffffa06d6cb4 [ccmod] #26 [ffff881054113de0] CcIoPoll at ffffffffa03f9b76 [ftmod] #27 [ffff881054113df8] io_poll at ffffffffa03fe6d5 [ftmod] #28 [ffff881054113e20] process_one_work at ffffffff8107e03b #29 [ffff881054113e68] worker_thread at ffffffff8107ee0b #30 [ffff881054113ec8] kthread at ffffffff81085aff #31 [ffff881054113f50] ret_from_fork at ffffffff815f306c drivers/usb/host/ehci-mem.c: 68 <qh_destroy+0xa>: mov %rsp,%rbp <qh_destroy+0xd>: push %r12 <qh_destroy+0xf>: mov %rdi,%r12 <qh_destroy+0x12>: push %rbx ... drivers/usb/host/ehci-mem.c: 68 <qh_destroy+0x17>: mov %rsi,%rbx ... drivers/usb/host/ehci-mem.c: 72 <qh_destroy+0x66>: ud2 67 static void qh_destroy(struct ehci_hcd *ehci, struct ehci_qh *qh) 68 { 69 /* clean qtds first, and know this is not linked */ 70 if (!list_empty (&qh->qtd_list) || qh->qh_next.ptr) { 71 ehci_dbg (ehci, "unused qh not empty!\n"); 72 BUG (); 73 } 74 if (qh->dummy) 75 ehci_qtd_free (ehci, qh->dummy); 76 dma_pool_free(ehci->qh_pool, qh->hw, qh->qh_dma); 77 kfree(qh); 78 } According to disassembly, R12 has a copy of the initial RDI value and and RBX has a copy of the initial RSI, so: R12: struct ehci_hcd ffff880078f12740 RBX: struct ehci_qh ffff88084b84dc80 crash> struct ehci_hcd ffff880078f12740 struct ehci_hcd { next_hrtimer_event = 99, enabled_hrtimer_events = 0x0, hr_timeouts = {{ tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }, { tv64 = 0x0 }}, hrtimer = { node = { node = { __rb_parent_color = 0xffff880078f127a0, rb_right = 0x0, rb_left = 0x0 }, expires = { tv64 = 0x0 } }, _softexpires = { tv64 = 0x0 }, function = 0xffffffff814103c0 <ehci_hrtimer_func>, base = 0xffff88085fc0e780, state = 0x0, start_pid = 0xffffffff, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, PSS_poll_count = 0x0, ASS_poll_count = 0x0, died_poll_count = 0x0, caps = 0xffffc9001680e000, regs = 0xffffc9001680e020, debug = 0x0, hcs_params = 0x204008, lock = { { rlock = { raw_lock = { { head_tail = 0x580058, tickets = { head = 0x58, tail = 0x58 } } } } } }, rh_state = EHCI_RH_HALTED, scanning = 0x0, need_rescan = 0x0, intr_unlinking = 0x0, iaa_in_progress = 0x0, async_unlinking = 0x0, shutdown = 0x1, qh_scan_next = 0x0, async = 0xffff88084b84dc80, dummy = 0x0, async_unlink = { next = 0xffff880078f12850, prev = 0xffff880078f12850 }, async_idle = { next = 0xffff880078f12860, prev = 0xffff880078f12860 }, async_unlink_cycle = 0x0, async_count = 0x1, periodic_size = 0x400, periodic = 0xffff880078d58000, periodic_dma = 0x78d58000, intr_qh_list = { next = 0xffff880078f12890, prev = 0xffff880078f12890 }, i_thresh = 0x0, pshadow = 0xffff88084b7d4290, intr_unlink_wait = { next = 0xffff880078f128b0, prev = 0xffff880078f128b0 }, intr_unlink = { next = 0xffff880078f128c0, prev = 0xffff880078f128c0 }, intr_unlink_wait_cycle = 0x0, intr_unlink_cycle = 0x0, now_frame = 0x0, last_iso_frame = 0x0, intr_count = 0x0, isoc_count = 0x0, periodic_count = 0x0, uframe_periodic_max = 0x64, cached_itd_list = { next = 0xffff880078f128f0, prev = 0xffff880078f128f0 }, last_itd_to_free = 0x0, cached_sitd_list = { next = 0xffff880078f12908, prev = 0xffff880078f12908 }, last_sitd_to_free = 0x0, reset_done = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, bus_suspended = 0x0, companion_ports = 0x0, owned_ports = 0x0, port_c_suspend = 0x0, suspended_ports = 0x0, resuming_ports = 0x0, qh_pool = 0xffff88084b84c1c8, qtd_pool = 0xffff88084b84f900, itd_pool = 0xffff88084b84fac8, sitd_pool = 0xffff88084b84c000, random_frame = 0x0, next_statechange = 0x10023df2f, last_periodic_enable = { tv64 = 0x1392c8d7a295b667 }, command = 0x10000, no_selective_suspend = 0x0, has_fsl_port_bug = 0x0, big_endian_mmio = 0x0, big_endian_desc = 0x0, big_endian_capbase = 0x0, has_amcc_usb23 = 0x0, need_io_watchdog = 0x0, amd_pll_fix = 0x0, use_dummy_qh = 0x0, has_synopsys_hc_bug = 0x0, frame_index_bug = 0x0, need_oc_pp_cycle = 0x0, ohci_hcctrl_reg = 0x0, has_hostpc = 0x0, has_ppcd = 0x0, sbrn = 0x20, stats = { normal = 0x0, error = 0x0, iaa = 0x0, lost_iaa = 0x0, complete = 0x1, unlink = 0x0 }, debug_dir = 0xffff88084f900208, priv = 0xffff880078f12a50 } from the ehci_hcd, we can also figure out the containing usb_hcd: crash> struct -o usb_hcd | grep hcd_priv [0x220] unsigned long hcd_priv[]; crash> p 0xffff880078f12740-0x220 $2 struct = 0xffff880078f12520 struct usb_hcd 0xffff880078f12520 crash> struct usb_hcd 0xffff880078f12520 struct usb_hcd { self = { controller = 0xffff8810405745b8, busnum = 0x3, bus_name = 0xffff88103fc11428 "0000:66:00.0", uses_dma = 0x1, uses_pio_for_control = 0x0, otg_port = 0x0, is_b_host = 0x0, b_hnp_enable = 0x0, no_stop_on_short = 0x0, no_sg_constraint = 0x0, sg_tablesize = 0xffffffff, devnum_next = 0x6, devmap = { devicemap = {0x0, 0x0} }, root_hub = 0xffff880078f16f60, hs_companion = 0x0, bus_list = { next = 0xffff88084b7b6fa8, prev = 0xffffffff81994030 <usb_bus_list> }, bandwidth_allocated = 0x0, bandwidth_int_reqs = 0x0, bandwidth_isoc_reqs = 0x0, resuming_ports = 0x0, mon_bus = 0xffff88084b84f1e0, monitored = 0x0 }, kref = { refcount = { counter = 0x2 } }, product_desc = 0xffffffff8183b3d8 "EHCI Host Controller", speed = 0x20, irq_descr = "ehci_hcd:usb3\000\000\000\000\000\000\000\000\000\000", rh_timer = { entry = { next = 0x0, prev = 0x0 }, expires = 0x0, base = 0xffffffff81c9b4c0 <boot_tvec_bases>, function = 0xffffffff813f8600 <rh_timer_func>, data = 0xffff880078f12520, slack = 0xffffffff, start_pid = 0xffffffff, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, status_urb = 0x0, wakeup_work = { data = { counter = 0xfffffffe0 }, entry = { next = 0xffff880078f12628, prev = 0xffff880078f12628 }, func = 0xffffffff813f7cf0 <hcd_resume_work> }, driver = 0xffffffff819ec2a0 <ehci_pci_hc_driver>, phy = 0x0, flags = 0x41, rh_registered = 0x0, rh_pollable = 0x0, msix_enabled = 0x0, uses_new_polling = 0x1, wireless = 0x0, authorized_default = 0x1, has_tt = 0x0, irq = 0xc, regs = 0xffffc9001680e000, rsrc_start = 0xa0000000, rsrc_len = 0x400, power_budget = 0x0, high_prio_bh = { running = 0x0, lock = { { rlock = { raw_lock = { { head_tail = 0x60006, tickets = { head = 0x6, tail = 0x6 } } } } } }, head = { next = 0xffff880078f12688, prev = 0xffff880078f12688 }, bh = { next = 0x0, state = 0x0, count = { counter = 0x0 }, func = 0xffffffff813f9050 <usb_giveback_urb_bh>, data = 0xffff880078f12680 } }, low_prio_bh = { running = 0x0, lock = { { rlock = { raw_lock = { { head_tail = 0xc000c0, tickets = { head = 0xc0, tail = 0xc0 } } } } } }, head = { next = 0xffff880078f126c8, prev = 0xffff880078f126c8 }, bh = { next = 0x0, state = 0x0, count = { counter = 0x0 }, func = 0xffffffff813f9050 <usb_giveback_urb_bh>, data = 0xffff880078f126c0 } }, bandwidth_mutex = 0xffff88084c231a08, shared_hcd = 0x0, primary_hcd = 0x0, pool = {0xffff88084b84ec88, 0xffff88084b84f738, 0xffff88084b84cc78, 0xffff88084b84c390}, state = 0x85, hcd_priv = 0xffff880078f12740 } and finally a dump of the ehci_qh in question: crash> struct ehci_qh ffff88084b84dc80 struct ehci_qh { hw = 0xffff880078d1a000, qh_dma = 0x78d1a000, qh_next = { qh = 0xffff88084efe6730, itd = 0xffff88084efe6730, sitd = 0xffff88084efe6730, fstn = 0xffff88084efe6730, hw_next = 0xffff88084efe6730, ptr = 0xffff88084efe6730 << !NULL }, qtd_list = { << list_empty next = 0xffff88084b84dc98, prev = 0xffff88084b84dc98 }, intr_node = { next = 0x0, prev = 0x0 }, dummy = 0xffff880078d22000, unlink_node = { next = 0xffff88084b84dcc0, prev = 0xffff88084b84dcc0 }, unlink_cycle = 0x0, qh_state = 0x1, << QH_STATE_LINKED xacterrs = 0x0, usecs = 0x0, gap_uf = 0x0, c_usecs = 0x0, tt_usecs = 0x0, period = 0x0, start = 0x0, dev = 0x0, is_out = 0x0, clearing_tt = 0x0, dequeue_during_giveback = 0x0, exception = 0x0 } The qtd_list is empty, contains only one entry, itself. crash> struct -o ehci_qh | grep td_list [0x18] struct list_head qtd_list; crash> p/x 0xffff88084b84dc80 + 0x18 $1 = 0xffff88084b84dc98 but qh->qh_next.ptr is !NULL, so we hit the BUG. However, it seems that the memory at qh->qh_next.ptr has been freed: crash> struct ehci_qh 0xffff88084efe6730 struct ehci_qh { hw = 0x6b6b6b6b6b6b6b6b, qh_dma = 0x6b6b6b6b6b6b6b6b, qh_next = { qh = 0x6b6b6b6b6b6b6b6b, itd = 0x6b6b6b6b6b6b6b6b, sitd = 0x6b6b6b6b6b6b6b6b, fstn = 0x6b6b6b6b6b6b6b6b, hw_next = 0x6b6b6b6b6b6b6b6b, ptr = 0x6b6b6b6b6b6b6b6b }, qtd_list = { next = 0x6b6b6b6b6b6b6b6b, prev = 0x6b6b6b6b6b6b6b6b }, intr_node = { next = 0x6b6b6b6b6b6b6b6b, prev = 0x6b6b6b6b6b6b6b6b }, dummy = 0x6b6b6b6b6b6b6b6b, unlink_node = { next = 0x6b6b6b6b6b6b6b6b, prev = 0x6b6b6b6b6b6b6b6b }, unlink_cycle = 0x6b6b6b6b, qh_state = 0x6b, xacterrs = 0x6b, usecs = 0x6b, gap_uf = 0x6b, c_usecs = 0x6b, tt_usecs = 0x6b6b, period = 0x6b6b, start = 0x6b6b, dev = 0x6b6b6b6b6b6b6b6b, is_out = 0x1, clearing_tt = 0x1, dequeue_during_giveback = 0x0, exception = 0x1 } crash> kmem 0xffff88084efe6730 CACHE NAME OBJSIZE ALLOCATED TOTAL SLABS SSIZE ffff88085f807600 kmalloc-128 128 2761 4305 123 16k SLAB MEMORY NODE TOTAL ALLOCATED FREE ffffea00213bf900 ffff88084efe4000 0 35 29 6 FREE / [ALLOCATED] ffff88084efe6730 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea00213bf980 84efe6000 0 0 0 2fffff00008000 tail I'm not too familiar with the USB code stack, so any suggestions on instrumentation that I can add to aid in debugging would be helpful. Maybe some tracing in qh_link_async / single_unlink_async / end_unlink_async /qh_link_periodic can reveal the sequence that is leaving this dangling qh_next.ptr? Note: This does bear some resemblance to a bug that Stratus hit a few years ago [1] [2], however enough of the code has changed that I'm not sure the fix for that one would apply to a modern kernel. Regards, -- Joe [1] https://bugzilla.redhat.com/show_bug.cgi?id=680987 [2] http://thread.gmane.org/gmane.linux.usb.general/44788 -- To unsubscribe from this list: send the line "unsubscribe linux-usb" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html