[PATCH] usb/core: Fix race condition when removing EHCI PCI devices

Don Zickus <dzickus@xxxxxxxxxx> · Mon, 10 Sep 2012 11:29:37 -0400

A customer of ours noticed that a panic occurs after a number of hot removals
of the EHCI PCI device. This happened on a 2.6.32 RHEL-6 kernel, but I believe
the problem is still applicable upstream.

Reproducing the panic was further simplified to enabling SLUB debug poisoning
and running the following command:

while true; do find /proc/bus/usb -type f -exec cat {} >/dev/null \; ; done &

This gets the machine to panic after 1 or 2 removals of the device.

The backtrace looks like:

  crash> bt
  PID: 23674  TASK: ffff8801ab411540  CPU: 2   COMMAND: "sosreport"
   #0 [ffff8801b9073980] machine_kexec at ffffffff8103281b
   #1 [ffff8801b90739e0] crash_kexec at ffffffff810ba662
   #2 [ffff8801b9073ab0] oops_end at ffffffff81501290
   #3 [ffff8801b9073ae0] no_context at ffffffff81043bab
   #4 [ffff8801b9073b30] __bad_area_nosemaphore at ffffffff81043e35
   #5 [ffff8801b9073b80] bad_area at ffffffff81043f5e
   #6 [ffff8801b9073bb0] __do_page_fault at ffffffff81044710
   #7 [ffff8801b9073cd0] do_page_fault at ffffffff8150326e
   #8 [ffff8801b9073d00] page_fault at ffffffff81500625
      [exception RIP: __list_add+0x26]
      RIP: ffffffff81282fb6  RSP: ffff8801b9073db8  RFLAGS: 00010046
      RAX: 0000000000000292  RBX: ffff8801b9073de8  RCX: 0000000000000000
      RDX: ffff8802782788f8  RSI: 0000000000000000  RDI: ffff8801b9073de8
      RBP: ffff8801b9073dd8   R8: 0000000000000000   R9: 0000000000000001
      R10: ffff880028401fc0  R11: 0000000000000000  R12: ffff8802782788f8
      R13: 0000000000000000  R14: 7fffffffffffffff  R15: ffff8801b9073e98
      ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
   #9 [ffff8801b9073de0] __down at ffffffff814ff58e
  #10 [ffff8801b9073e30] down at ffffffff81097ef1
  #11 [ffff8801b9073e60] usb_device_read at ffffffff813b78ce
  #12 [ffff8801b9073ef0] vfs_read at ffffffff8117b8b5
  #13 [ffff8801b9073f30] sys_read at ffffffff8117b9f1
  #14 [ffff8801b9073f80] tracesys at ffffffff8100b308 (via system_call)
      RIP: 00000030f90da360  RSP: 00007fffef71dda8  RFLAGS: 00000246
      RAX: ffffffffffffffda  RBX: ffffffff8100b308  RCX: ffffffffffffffff
      RDX: 0000000000001000  RSI: 00007f1926568000  RDI: 0000000000000003
      RBP: 0000000000000000   R8: 00000000084023c4   R9: 00007f1926530700
      R10: 0000000000002028  R11: 0000000000000246  R12: 0000000008403b7e
      R13: 0000000000000846  R14: 00000000083159c0  R15: 00007fffef71de40
      ORIG_RAX: 0000000000000000  CS: 0033  SS: 002b

This is likely because the following thread was in the process of removing the
device:

  PID: 3070   TASK: ffff8802735fd540  CPU: 1   COMMAND: "io_poll"
   #0 [ffff8802788a97b0] schedule at ffffffff814fd7e2
   #1 [ffff8802788a9878] __mutex_lock_slowpath at ffffffff814fee8e
   #2 [ffff8802788a98e8] mutex_lock at ffffffff814fed2b
   #3 [ffff8802788a9908] usb_deregister_bus at ffffffff813a59ff
   #4 [ffff8802788a9938] usb_remove_hcd at ffffffff813a5b9d
   #5 [ffff8802788a9978] usb_hcd_pci_remove at ffffffff813b80b3
   #6 [ffff8802788a9998] pci_device_remove at ffffffff812930b7
   #7 [ffff8802788a99b8] __device_release_driver at ffffffff8135006f
   #8 [ffff8802788a99d8] device_release_driver at ffffffff813501dd
   #9 [ffff8802788a99f8] bus_remove_device at ffffffff8134f073
  #10 [ffff8802788a9a28] device_del at ffffffff8134ccbd
  #11 [ffff8802788a9a58] device_unregister at ffffffff8134cd92
  #12 [ffff8802788a9a78] pci_stop_bus_device at ffffffff8128c4cc
  #13 [ffff8802788a9aa8] pci_stop_bus_device at ffffffff8128c47b
  #14 [ffff8802788a9ad8] pci_stop_bus_device at ffffffff8128c47b
  #15 [ffff8802788a9b08] pci_remove_bus_device at ffffffff8128c57a
  #16 [ffff8802788a9b38] remove_io at ffffffffa0361db8 [ftmod]
  #17 [ffff8802788a9b98] ioboard_event at ffffffffa0362130 [ftmod]
  #18 [ffff8802788a9c18] io_state_change at ffffffffa0365c9e [ftmod]
  #19 [ffff8802788a9c38] OsIoStateChange at ffffffffa0406638 [ftmod]
  #20 [ffff8802788a9c58] IoStateChange at ffffffffa0407a4b [ftmod]
  #21 [ffff8802788a9db8] CcIoPoll at ffffffffa040af64 [ftmod]
  #22 [ffff8802788a9e98] io_poll at ffffffffa0365b00 [ftmod]
  #23 [ffff8802788a9f48] kernel_thread at ffffffff8100c14a

The fix that solved their problem was to deregister the usb bus before
running usb_put_dev.  With that change, the panic no longer appeared across
multiple test runs.

Deregistering the bus first, to prevent future readers from accessing the
device before the hcd is cleaned up, seemed like an appropriate way to go.
I'll leave it to the experts to validate that or provide a better solution. :-)

I also adapted their usb_remove_hcd fix to the usb_add_hcd error path to mimic
the same behaviour (with a little code shuffling to handle the gotos cleanly).

Signed-off-by: Don Zickus <dzickus@xxxxxxxxxx>
---
 drivers/usb/core/hcd.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index bc84106..d155bbf 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -2428,13 +2428,6 @@ int usb_add_hcd(struct usb_hcd *hcd,
 	if ((retval = usb_register_bus(&hcd->self)) < 0)
 		goto err_register_bus;
 
-	if ((rhdev = usb_alloc_dev(NULL, &hcd->self, 0)) == NULL) {
-		dev_err(hcd->self.controller, "unable to allocate root hub\n");
-		retval = -ENOMEM;
-		goto err_allocate_root_hub;
-	}
-	hcd->self.root_hub = rhdev;
-
 	switch (hcd->speed) {
 	case HCD_USB11:
 		rhdev->speed = USB_SPEED_FULL;
@@ -2450,6 +2443,13 @@ int usb_add_hcd(struct usb_hcd *hcd,
 		goto err_set_rh_speed;
 	}
 
+	if ((rhdev = usb_alloc_dev(NULL, &hcd->self, 0)) == NULL) {
+		dev_err(hcd->self.controller, "unable to allocate root hub\n");
+		retval = -ENOMEM;
+		goto err_allocate_root_hub;
+	}
+	hcd->self.root_hub = rhdev;
+
 	/* wakeup flag init defaults to "everything works" for root hubs,
 	 * but drivers can override it in reset() if needed, along with
 	 * recording the overall controller's system wakeup capability.
@@ -2541,10 +2541,10 @@ err_hcd_driver_start:
 		free_irq(irqnum, hcd);
 err_request_irq:
 err_hcd_driver_setup:
-err_set_rh_speed:
-	usb_put_dev(hcd->self.root_hub);
 err_allocate_root_hub:
 	usb_deregister_bus(&hcd->self);
+err_set_rh_speed:
+	usb_put_dev(hcd->self.root_hub);
 err_register_bus:
 	hcd_buffer_destroy(hcd);
 	return retval;
@@ -2606,8 +2606,8 @@ void usb_remove_hcd(struct usb_hcd *hcd)
 			free_irq(hcd->irq, hcd);
 	}
 
-	usb_put_dev(hcd->self.root_hub);
 	usb_deregister_bus(&hcd->self);
+	usb_put_dev(hcd->self.root_hub);
 	hcd_buffer_destroy(hcd);
 }
 EXPORT_SYMBOL_GPL(usb_remove_hcd);
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html