Hitting "unused qh not empty" BUG in qh_destroy

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Alan,

I've got another USB bug to report that manifests during automated
device removal testing on RHEL7.  This one hits the BUG() inside
qh_destroy:

PID: 139    TASK: ffff881054101960  CPU: 22  COMMAND: "kworker/u66:0"
 #0 [ffff881054113540] machine_kexec at ffffffff810411b1
 #1 [ffff881054113598] crash_kexec at ffffffff810cf272
 #2 [ffff881054113668] oops_end at ffffffff815eb548
 #3 [ffff881054113690] die at ffffffff8101632b
 #4 [ffff8810541136c0] do_trap at ffffffff815eac20
 #5 [ffff881054113710] do_invalid_op at ffffffff81013134
 #6 [ffff8810541137c0] invalid_op at ffffffff815f485e
    [exception RIP: qh_destroy+0x66]
    RIP: ffffffff81412066  RSP: ffff881054113870  RFLAGS: 00010246
    RAX: ffff88084b84dc98  RBX: ffff88084b84dc80  RCX: ffff88107fdf0874
    RDX: 0000000000000056  RSI: ffff88084b84dc80  RDI: ffff880078f12740
    RBP: ffff881054113880   R8: ffffffff819e4ba0   R9: 084f8036100c0000
    R10: f79281ddd0018403  R11: 0000000000000400  R12: ffff880078f12740
    R13: ffff880078f1282c  R14: ffff8810405745b8  R15: 0000000000000048
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #7 [ffff881054113888] ehci_mem_cleanup at ffffffff814132b8
 #8 [ffff8810541138c8] ehci_stop at ffffffff81417a92
 #9 [ffff881054113950] usb_remove_hcd at ffffffff813f8d38
#10 [ffff881054113980] usb_hcd_pci_remove at ffffffff8140a15f
#11 [ffff8810541139a8] pci_device_remove at ffffffff812ec5bb
#12 [ffff8810541139d0] __device_release_driver at ffffffff813b6b9f
#13 [ffff8810541139f0] device_release_driver at ffffffff813b6c33
#14 [ffff881054113a10] bus_remove_device at ffffffff813b63a8
#15 [ffff881054113a38] device_del at ffffffff813b2955
#16 [ffff881054113a68] pci_stop_bus_device at ffffffff812e54b4
#17 [ffff881054113a90] pci_stop_bus_device at ffffffff812e545b
#18 [ffff881054113ab8] pci_stop_bus_device at ffffffff812e545b
#19 [ffff881054113ae0] pci_stop_and_remove_bus_device at ffffffff812e55a2
#20 [ffff881054113af8] ioboard_bringdown at ffffffffa0400917 [ftmod]
#21 [ffff881054113b20] ftmod_ioboard_event at ffffffffa0400c07 [ftmod]
#22 [ffff881054113ba8] io_state_change at ffffffffa03fe7b2 [ftmod]
#23 [ffff881054113bf8] OsIoStateChange at ffffffffa06d25d4 [ccmod]
#24 [ffff881054113c10] IoStateChange at ffffffffa06d3008 [ccmod]
#25 [ffff881054113d40] CcIoPoll at ffffffffa06d6cb4 [ccmod]
#26 [ffff881054113de0] CcIoPoll at ffffffffa03f9b76 [ftmod]
#27 [ffff881054113df8] io_poll at ffffffffa03fe6d5 [ftmod]
#28 [ffff881054113e20] process_one_work at ffffffff8107e03b
#29 [ffff881054113e68] worker_thread at ffffffff8107ee0b
#30 [ffff881054113ec8] kthread at ffffffff81085aff
#31 [ffff881054113f50] ret_from_fork at ffffffff815f306c

drivers/usb/host/ehci-mem.c: 68
  <qh_destroy+0xa>:    mov    %rsp,%rbp
  <qh_destroy+0xd>:    push   %r12
  <qh_destroy+0xf>:    mov    %rdi,%r12
  <qh_destroy+0x12>:   push   %rbx
...
drivers/usb/host/ehci-mem.c: 68
  <qh_destroy+0x17>:   mov    %rsi,%rbx
...
drivers/usb/host/ehci-mem.c: 72
  <qh_destroy+0x66>:   ud2

 67 static void qh_destroy(struct ehci_hcd *ehci, struct ehci_qh *qh)
 68 {
 69         /* clean qtds first, and know this is not linked */
 70         if (!list_empty (&qh->qtd_list) || qh->qh_next.ptr) {
 71                 ehci_dbg (ehci, "unused qh not empty!\n");
 72                 BUG ();
 73         }
 74         if (qh->dummy)
 75                 ehci_qtd_free (ehci, qh->dummy);
 76         dma_pool_free(ehci->qh_pool, qh->hw, qh->qh_dma);
 77         kfree(qh);
 78 }

According to disassembly, R12 has a copy of the initial RDI value and
and RBX has a copy of the initial RSI, so:

  R12: struct ehci_hcd ffff880078f12740
  RBX: struct ehci_qh ffff88084b84dc80

crash> struct ehci_hcd ffff880078f12740
struct ehci_hcd {
  next_hrtimer_event = 99, 
  enabled_hrtimer_events = 0x0, 
  hr_timeouts = {{
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }, {
      tv64 = 0x0
    }}, 
  hrtimer = {
    node = {
      node = {
        __rb_parent_color = 0xffff880078f127a0, 
        rb_right = 0x0, 
        rb_left = 0x0
      }, 
      expires = {
        tv64 = 0x0
      }
    }, 
    _softexpires = {
      tv64 = 0x0
    }, 
    function = 0xffffffff814103c0 <ehci_hrtimer_func>, 
    base = 0xffff88085fc0e780, 
    state = 0x0, 
    start_pid = 0xffffffff, 
    start_site = 0x0, 
    start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  }, 
  PSS_poll_count = 0x0, 
  ASS_poll_count = 0x0, 
  died_poll_count = 0x0, 
  caps = 0xffffc9001680e000, 
  regs = 0xffffc9001680e020, 
  debug = 0x0, 
  hcs_params = 0x204008, 
  lock = {
    {
      rlock = {
        raw_lock = {
          {
            head_tail = 0x580058, 
            tickets = {
              head = 0x58, 
              tail = 0x58
            }
          }
        }
      }
    }
  }, 
  rh_state = EHCI_RH_HALTED, 
  scanning = 0x0, 
  need_rescan = 0x0, 
  intr_unlinking = 0x0, 
  iaa_in_progress = 0x0, 
  async_unlinking = 0x0, 
  shutdown = 0x1, 
  qh_scan_next = 0x0, 
  async = 0xffff88084b84dc80, 
  dummy = 0x0, 
  async_unlink = {
    next = 0xffff880078f12850, 
    prev = 0xffff880078f12850
  }, 
  async_idle = {
    next = 0xffff880078f12860, 
    prev = 0xffff880078f12860
  }, 
  async_unlink_cycle = 0x0, 
  async_count = 0x1, 
  periodic_size = 0x400, 
  periodic = 0xffff880078d58000, 
  periodic_dma = 0x78d58000, 
  intr_qh_list = {
    next = 0xffff880078f12890, 
    prev = 0xffff880078f12890
  }, 
  i_thresh = 0x0, 
  pshadow = 0xffff88084b7d4290, 
  intr_unlink_wait = {
    next = 0xffff880078f128b0, 
    prev = 0xffff880078f128b0
  }, 
  intr_unlink = {
    next = 0xffff880078f128c0, 
    prev = 0xffff880078f128c0
  }, 
  intr_unlink_wait_cycle = 0x0, 
  intr_unlink_cycle = 0x0, 
  now_frame = 0x0, 
  last_iso_frame = 0x0, 
  intr_count = 0x0, 
  isoc_count = 0x0, 
  periodic_count = 0x0, 
  uframe_periodic_max = 0x64, 
  cached_itd_list = {
    next = 0xffff880078f128f0, 
    prev = 0xffff880078f128f0
  }, 
  last_itd_to_free = 0x0, 
  cached_sitd_list = {
    next = 0xffff880078f12908, 
    prev = 0xffff880078f12908
  }, 
  last_sitd_to_free = 0x0, 
  reset_done = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, 
  bus_suspended = 0x0, 
  companion_ports = 0x0, 
  owned_ports = 0x0, 
  port_c_suspend = 0x0, 
  suspended_ports = 0x0, 
  resuming_ports = 0x0, 
  qh_pool = 0xffff88084b84c1c8, 
  qtd_pool = 0xffff88084b84f900, 
  itd_pool = 0xffff88084b84fac8, 
  sitd_pool = 0xffff88084b84c000, 
  random_frame = 0x0, 
  next_statechange = 0x10023df2f, 
  last_periodic_enable = {
    tv64 = 0x1392c8d7a295b667
  }, 
  command = 0x10000, 
  no_selective_suspend = 0x0, 
  has_fsl_port_bug = 0x0, 
  big_endian_mmio = 0x0, 
  big_endian_desc = 0x0, 
  big_endian_capbase = 0x0, 
  has_amcc_usb23 = 0x0, 
  need_io_watchdog = 0x0, 
  amd_pll_fix = 0x0, 
  use_dummy_qh = 0x0, 
  has_synopsys_hc_bug = 0x0, 
  frame_index_bug = 0x0, 
  need_oc_pp_cycle = 0x0, 
  ohci_hcctrl_reg = 0x0, 
  has_hostpc = 0x0, 
  has_ppcd = 0x0, 
  sbrn = 0x20, 
  stats = {
    normal = 0x0, 
    error = 0x0, 
    iaa = 0x0, 
    lost_iaa = 0x0, 
    complete = 0x1, 
    unlink = 0x0
  }, 
  debug_dir = 0xffff88084f900208, 
  priv = 0xffff880078f12a50
}

from the ehci_hcd, we can also figure out the containing usb_hcd:

crash> struct -o usb_hcd | grep hcd_priv
  [0x220] unsigned long hcd_priv[];
crash> p 0xffff880078f12740-0x220
  $2 struct = 0xffff880078f12520

  struct usb_hcd 0xffff880078f12520

crash> struct usb_hcd 0xffff880078f12520

struct usb_hcd {
  self = {
    controller = 0xffff8810405745b8, 
    busnum = 0x3, 
    bus_name = 0xffff88103fc11428 "0000:66:00.0", 
    uses_dma = 0x1, 
    uses_pio_for_control = 0x0, 
    otg_port = 0x0, 
    is_b_host = 0x0, 
    b_hnp_enable = 0x0, 
    no_stop_on_short = 0x0, 
    no_sg_constraint = 0x0, 
    sg_tablesize = 0xffffffff, 
    devnum_next = 0x6, 
    devmap = {
      devicemap = {0x0, 0x0}
    }, 
    root_hub = 0xffff880078f16f60, 
    hs_companion = 0x0, 
    bus_list = {
      next = 0xffff88084b7b6fa8, 
      prev = 0xffffffff81994030 <usb_bus_list>
    }, 
    bandwidth_allocated = 0x0, 
    bandwidth_int_reqs = 0x0, 
    bandwidth_isoc_reqs = 0x0, 
    resuming_ports = 0x0, 
    mon_bus = 0xffff88084b84f1e0, 
    monitored = 0x0
  }, 
  kref = {
    refcount = {
      counter = 0x2
    }
  }, 
  product_desc = 0xffffffff8183b3d8 "EHCI Host Controller", 
  speed = 0x20, 
  irq_descr = "ehci_hcd:usb3\000\000\000\000\000\000\000\000\000\000", 
  rh_timer = {
    entry = {
      next = 0x0, 
      prev = 0x0
    }, 
    expires = 0x0, 
    base = 0xffffffff81c9b4c0 <boot_tvec_bases>, 
    function = 0xffffffff813f8600 <rh_timer_func>, 
    data = 0xffff880078f12520, 
    slack = 0xffffffff, 
    start_pid = 0xffffffff, 
    start_site = 0x0, 
    start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"
  }, 
  status_urb = 0x0, 
  wakeup_work = {
    data = {
      counter = 0xfffffffe0
    }, 
    entry = {
      next = 0xffff880078f12628, 
      prev = 0xffff880078f12628
    }, 
    func = 0xffffffff813f7cf0 <hcd_resume_work>
  }, 
  driver = 0xffffffff819ec2a0 <ehci_pci_hc_driver>, 
  phy = 0x0, 
  flags = 0x41, 
  rh_registered = 0x0, 
  rh_pollable = 0x0, 
  msix_enabled = 0x0, 
  uses_new_polling = 0x1, 
  wireless = 0x0, 
  authorized_default = 0x1, 
  has_tt = 0x0, 
  irq = 0xc, 
  regs = 0xffffc9001680e000, 
  rsrc_start = 0xa0000000, 
  rsrc_len = 0x400, 
  power_budget = 0x0, 
  high_prio_bh = {
    running = 0x0, 
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 0x60006, 
              tickets = {
                head = 0x6, 
                tail = 0x6
              }
            }
          }
        }
      }
    }, 
    head = {
      next = 0xffff880078f12688, 
      prev = 0xffff880078f12688
    }, 
    bh = {
      next = 0x0, 
      state = 0x0, 
      count = {
        counter = 0x0
      }, 
      func = 0xffffffff813f9050 <usb_giveback_urb_bh>, 
      data = 0xffff880078f12680
    }
  }, 
  low_prio_bh = {
    running = 0x0, 
    lock = {
      {
        rlock = {
          raw_lock = {
            {
              head_tail = 0xc000c0, 
              tickets = {
                head = 0xc0, 
                tail = 0xc0
              }
            }
          }
        }
      }
    }, 
    head = {
      next = 0xffff880078f126c8, 
      prev = 0xffff880078f126c8
    }, 
    bh = {
      next = 0x0, 
      state = 0x0, 
      count = {
        counter = 0x0
      }, 
      func = 0xffffffff813f9050 <usb_giveback_urb_bh>, 
      data = 0xffff880078f126c0
    }
  }, 
  bandwidth_mutex = 0xffff88084c231a08, 
  shared_hcd = 0x0, 
  primary_hcd = 0x0, 
  pool = {0xffff88084b84ec88, 0xffff88084b84f738, 0xffff88084b84cc78, 0xffff88084b84c390}, 
  state = 0x85, 
  hcd_priv = 0xffff880078f12740
}

and finally a dump of the ehci_qh in question:

crash> struct ehci_qh ffff88084b84dc80
struct ehci_qh {
  hw = 0xffff880078d1a000, 
  qh_dma = 0x78d1a000, 
  qh_next = {
    qh = 0xffff88084efe6730, 
    itd = 0xffff88084efe6730, 
    sitd = 0xffff88084efe6730, 
    fstn = 0xffff88084efe6730, 
    hw_next = 0xffff88084efe6730, 
    ptr = 0xffff88084efe6730                     << !NULL
  }, 
  qtd_list = {                                   << list_empty
    next = 0xffff88084b84dc98, 
    prev = 0xffff88084b84dc98
  }, 
  intr_node = {
    next = 0x0, 
    prev = 0x0
  }, 
  dummy = 0xffff880078d22000, 
  unlink_node = {
    next = 0xffff88084b84dcc0, 
    prev = 0xffff88084b84dcc0
  }, 
  unlink_cycle = 0x0, 
  qh_state = 0x1,                                << QH_STATE_LINKED
  xacterrs = 0x0, 
  usecs = 0x0, 
  gap_uf = 0x0, 
  c_usecs = 0x0, 
  tt_usecs = 0x0, 
  period = 0x0, 
  start = 0x0, 
  dev = 0x0, 
  is_out = 0x0, 
  clearing_tt = 0x0, 
  dequeue_during_giveback = 0x0, 
  exception = 0x0
}

The qtd_list is empty, contains only one entry, itself.

crash> struct -o ehci_qh | grep td_list
  [0x18] struct list_head qtd_list;
crash> p/x 0xffff88084b84dc80 + 0x18
$1 = 0xffff88084b84dc98

but qh->qh_next.ptr is !NULL, so we hit the BUG.  However, it seems that
the memory at qh->qh_next.ptr has been freed:

crash> struct ehci_qh 0xffff88084efe6730
struct ehci_qh {
  hw = 0x6b6b6b6b6b6b6b6b, 
  qh_dma = 0x6b6b6b6b6b6b6b6b, 
  qh_next = {
    qh = 0x6b6b6b6b6b6b6b6b, 
    itd = 0x6b6b6b6b6b6b6b6b, 
    sitd = 0x6b6b6b6b6b6b6b6b, 
    fstn = 0x6b6b6b6b6b6b6b6b, 
    hw_next = 0x6b6b6b6b6b6b6b6b, 
    ptr = 0x6b6b6b6b6b6b6b6b
  }, 
  qtd_list = {
    next = 0x6b6b6b6b6b6b6b6b, 
    prev = 0x6b6b6b6b6b6b6b6b
  }, 
  intr_node = {
    next = 0x6b6b6b6b6b6b6b6b, 
    prev = 0x6b6b6b6b6b6b6b6b
  }, 
  dummy = 0x6b6b6b6b6b6b6b6b, 
  unlink_node = {
    next = 0x6b6b6b6b6b6b6b6b, 
    prev = 0x6b6b6b6b6b6b6b6b
  }, 
  unlink_cycle = 0x6b6b6b6b, 
  qh_state = 0x6b, 
  xacterrs = 0x6b, 
  usecs = 0x6b, 
  gap_uf = 0x6b, 
  c_usecs = 0x6b, 
  tt_usecs = 0x6b6b, 
  period = 0x6b6b, 
  start = 0x6b6b, 
  dev = 0x6b6b6b6b6b6b6b6b, 
  is_out = 0x1, 
  clearing_tt = 0x1, 
  dequeue_during_giveback = 0x0, 
  exception = 0x1
}

crash> kmem 0xffff88084efe6730
CACHE            NAME                 OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE
ffff88085f807600 kmalloc-128              128       2761      4305    123    16k
  SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
  ffffea00213bf900  ffff88084efe4000     0     35         29     6
  FREE / [ALLOCATED]
   ffff88084efe6730  

      PAGE        PHYSICAL      MAPPING       INDEX CNT FLAGS
ffffea00213bf980 84efe6000                0        0  0 2fffff00008000 tail

I'm not too familiar with the USB code stack, so any suggestions on
instrumentation that I can add to aid in debugging would be helpful.
Maybe some tracing in qh_link_async / single_unlink_async /
end_unlink_async /qh_link_periodic can reveal the sequence that is
leaving this dangling qh_next.ptr?

Note: This does bear some resemblance to a bug that Stratus hit a few
years ago [1] [2], however enough of the code has changed that I'm not
sure the fix for that one would apply to a modern kernel.

Regards,

-- Joe

[1] https://bugzilla.redhat.com/show_bug.cgi?id=680987
[2] http://thread.gmane.org/gmane.linux.usb.general/44788
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Media]     [Linux Input]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Old Linux USB Devel Archive]

  Powered by Linux