Testing for hardware bug in EHCI controllers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Sarah (and anyone else who's interested):

A while ago I wrote about a hardware bug in my Intel ICH5 and ICH8 EHCI
controllers.  You pointed out that these are rather old components, not 
being used in current systems, which is quite true.

Now I have figured out a simple way for anyone to test for this bug in
any EHCI controller, without the need for a g-zero gadget.  It's a
two-part procedure:

	Apply the patch below (which is written for vanilla 3.8) and
	load the resulting driver.  The patch adds an explicit test
	to ehci-hcd for detecting the bug.

	Then plug in an ordinary USB flash drive and run the attached
	program (as root), giving it the device path for the flash
	drive as the single command-line argument.  For example:

		sudo ./ehci-test /dev/bus/usb/002/003

The program won't do anything bad to the flash drive; it just reads the
first 256 KB of data over and over again, now and then unlinking an URB
to try and trigger the bug.  If the program works right, it will print
out a loop counter every hundred iterations.  If it runs for 1000
iterations with no error messages in the kernel log, you may consider
that the controller has passed the test.  This should take under a
minute, depending on the hardware speed.

The program won't stop by itself unless something goes wrong.  You can
kill it with ^C or more simply by unplugging the flash drive.  (If you
want to be safe, make sure there are no mounted filesystems on the
drive before running the test program.)

If the hardware bug is detected, the kernel patch will print error
messages to the system log.  For example, when I run the test on the
Intel controller in this computer, I get:

[  150.019441] usb-storage 3-8:1.0: disconnect by usbfs
[  150.271190] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  150.591089] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  151.538560] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  151.857569] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  152.018886] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  152.179810] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 80008d00 00008d00
[  153.211804] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  153.374497] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  153.770443] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 80008d00 00008d00
[  154.247861] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 82008d80 00008d00
[  154.566912] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 82008d80 00008d00
[  155.359101] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  155.838132] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  156.791107] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 80008d00 00008d00
[  157.267620] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 00008d00 80008d00
[  159.252057] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 80008d00 00008d00
[  159.886048] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 80008d00 00008d00
[  160.206625] ehci-pci 0000:00:1d.7: EHCI hardware bug detected: 02008d80 80008d00
...

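You get the idea.  The two values on the right are the QH's hw_token 
field, read once when end_unlink_async() starts processing the QH and 
again (after a 10 us delay) just before the QH is relinked.  They are 
always supposed to be equal; when they aren't it indicates that the 
controller has done a DMA write at a time when ehci-hcd isn't expecting 
one to happen.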

I'd be interested to hear the results of testing on a variety of 
controllers.  (This computer also has an NEC EHCI controller, and that 
one does not have the bug.)  Do the EHCI controllers on current Intel 
chipsets pass the test?  What about other vendors?

Thanks to all who try it out and report their results.

Alan Stern




Index: usb-3.8/drivers/usb/host/ehci-q.c
===================================================================
--- usb-3.8.orig/drivers/usb/host/ehci-q.c
+++ usb-3.8/drivers/usb/host/ehci-q.c
@@ -547,7 +547,7 @@ qh_completions (struct ehci_hcd *ehci, s
 	if (stopped != 0 || hw->hw_qtd_next == EHCI_LIST_END(ehci)) {
 		switch (state) {
 		case QH_STATE_IDLE:
-			qh_refresh(ehci, qh);
+//			qh_refresh(ehci, qh);
 			break;
 		case QH_STATE_LINKED:
 			/* We won't refresh a QH that's linked (after the HC
@@ -1232,6 +1232,7 @@ static void start_iaa_cycle(struct ehci_
 static void end_unlink_async(struct ehci_hcd *ehci)
 {
 	struct ehci_qh		*qh;
+	__hc32			tok1, tok2;
 
 	if (ehci->has_synopsys_hc_bug)
 		ehci_writel(ehci, (u32) ehci->async->qh_dma,
@@ -1242,6 +1243,7 @@ static void end_unlink_async(struct ehci
 	ehci->async_unlinking = true;
 	while (ehci->async_iaa) {
 		qh = ehci->async_iaa;
+		tok1 = ACCESS_ONCE(qh->hw->hw_token);
 		ehci->async_iaa = qh->unlink_next;
 		qh->unlink_next = NULL;
 
@@ -1250,8 +1252,14 @@ static void end_unlink_async(struct ehci
 
 		qh_completions(ehci, qh);
 		if (!list_empty(&qh->qtd_list) &&
-				ehci->rh_state == EHCI_RH_RUNNING)
+				ehci->rh_state == EHCI_RH_RUNNING) {
+			udelay(10);
+			tok2 = ACCESS_ONCE(qh->hw->hw_token);
+			if (tok1 != tok2)
+				ehci_err(ehci, "EHCI hardware bug detected: %08x %08x\n",
+						tok1, tok2);
 			qh_link_async(ehci, qh);
+		}
 		disable_async(ehci);
 	}
 	ehci->async_unlinking = false;
/*
 * ehci-test.c -- Test EHCI hardware using a flash drive test device
 *
 * To build:  gcc -O2 -o ehci-test ehci-test.c
 *
 * To run: Plug in a USB flash drive and note the bus and device numbers
 * it gets assigned.  Then do:
 *
 *	sudo ./ehci-test /dev/bus/usb/BBB/DDD
 *
 * where BBB and DDD are the bus and device numbers zero-filled to three
 * digits each.  If all goes well the test will not terminate; kill it
 * after a minute or so with ^C or by unplugging the flash drive.
 */

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/usbdevice_fs.h>
#include <linux/usb/ch9.h>

/* File descriptor of the usbfs device node opened in main() */
int			fd;
/*
 * Scratch buffer.  get_interface() reads descriptors into it, and
 * init_urb() makes it the data buffer shared by every bulk-in URB.
 */
unsigned char		buf[512];

/* Interface number and bulk endpoint addresses, filled in by get_interface() */
int			ifnum;
int			ep_in = -1;	/* -1 until a bulk-in endpoint is found */
int			ep_out = -1;	/* -1 until a bulk-out endpoint is found */

/* Nonzero when wait_for_one_urb() must send a new READ(10) before reaping */
int			start_read = 1;
/* Number of 512-byte data packets reaped for the current READ(10) */
int			block_count;
/* Number of READ(10) commands that have completed with a good CSW */
int			loop_count;

/* Number of bulk-in URBs kept in the ring */
#define	NUM_URBS	600
struct usbdevfs_urb	urbs[NUM_URBS];
/* One past the last element of urbs[]; used for ring wrap-around */
struct usbdevfs_urb	* const urbs_end = urbs + NUM_URBS;
/*
 * next_urb: the URB expected to complete next in normal ring order.
 * unlinked_urb: the URB run_test() has discarded and is waiting to reap,
 * or NULL when no such unlink is pending.
 */
struct usbdevfs_urb	*next_urb, *unlinked_urb;

/* How long wait_for_one_urb() polls when its use_timeout argument is set */
#define DEFAULT_TIMEOUT	2	/* seconds */
/* Blocks requested per READ(10): 512 blocks of 512 bytes = 256 KB */
#define NUM_BLOCKS	512


/*
 * Prepare one URB as a bulk-in transfer on ep_in.  Every URB targets the
 * same shared buffer (buf) and asks for sizeof(buf) bytes; all other
 * fields are left as the caller provided them.
 */
void init_urb(struct usbdevfs_urb *u)
{
	/* Where the data lands, and how much may be transferred */
	u->buffer_length = sizeof(buf);
	u->buffer = buf;

	/* What kind of transfer, and on which endpoint */
	u->endpoint = ep_in;
	u->type = USBDEVFS_URB_TYPE_BULK;
}

/*
 * Find, validate and claim the mass-storage interface of the device
 * open on fd, then initialize the URB ring.
 *
 * The descriptors are read sequentially from the usbfs device file:
 * first the device + configuration + interface descriptors, then the
 * endpoint descriptors that are assumed to follow the interface
 * descriptor directly.  On success ifnum, ep_in and ep_out are set, any
 * kernel driver bound to the interface has been disconnected, and the
 * interface is claimed for this process.
 *
 * Returns 0 on success, 1 on any failure (a message is printed).
 */
int get_interface(void)
{
	int				i, rc, len, num_eps;
	struct usb_interface_descriptor	*pi;
	struct usb_endpoint_descriptor	*pe;
	struct usbdevfs_ioctl		ctl;

	/* Assume we will use the first interface in the first configuration */
	len = USB_DT_DEVICE_SIZE + USB_DT_CONFIG_SIZE + USB_DT_INTERFACE_SIZE;
	rc = read(fd, buf, len);
	if (rc < 0) {
		perror("Unable to read device file");
		return 1;
	}
	if (rc < len) {
		/* errno is not meaningful for a short read */
		fprintf(stderr, "Unable to read device file: short read\n");
		return 1;
	}

	pi = (struct usb_interface_descriptor *)
			&buf[USB_DT_DEVICE_SIZE + USB_DT_CONFIG_SIZE];
	if (pi->bDescriptorType != USB_DT_INTERFACE ||
			pi->bLength != USB_DT_INTERFACE_SIZE) {
		fprintf(stderr, "Interface descriptor not found\n");
		return 1;
	}

	if (pi->bInterfaceClass != USB_CLASS_MASS_STORAGE) {
		fprintf(stderr, "First interface is not mass storage\n");
		return 1;
	}
	if (pi->bInterfaceSubClass != 0x06 ||		/* Transparent SCSI */
			pi->bInterfaceProtocol != 0x50) {	/* Bulk Only */
		fprintf(stderr, "Interface subclass/protocol is wrong\n");
		return 1;
	}

	/*
	 * Copy out everything we still need from the interface descriptor:
	 * the next read() reuses buf and would overwrite what pi points to.
	 */
	ifnum = pi->bInterfaceNumber;
	num_eps = pi->bNumEndpoints;

	/* The endpoint count comes from the device; don't let it overrun buf */
	len = num_eps * USB_DT_ENDPOINT_SIZE;
	if (len > (int) sizeof(buf)) {
		fprintf(stderr, "Too many endpoints: %d\n", num_eps);
		return 1;
	}

	rc = read(fd, buf, len);
	if (rc < 0) {
		perror("Unable to read endpoint descriptors");
		return 1;
	}
	if (rc != len) {
		fprintf(stderr,
			"Unable to read endpoint descriptors: short read\n");
		return 1;
	}

	/* Remember the first bulk-in and the first bulk-out endpoint */
	for (i = 0; i < num_eps; ++i) {
		pe = (struct usb_endpoint_descriptor *)
				&buf[i * USB_DT_ENDPOINT_SIZE];
		if (pe->bDescriptorType != USB_DT_ENDPOINT ||
				pe->bLength != USB_DT_ENDPOINT_SIZE) {
			fprintf(stderr, "Endpoint descriptor not found\n");
			return 1;
		}

		if (usb_endpoint_is_bulk_in(pe) && ep_in < 0)
			ep_in = pe->bEndpointAddress;
		if (usb_endpoint_is_bulk_out(pe) && ep_out < 0)
			ep_out = pe->bEndpointAddress;
	}

	if (ep_in < 0 || ep_out < 0) {
		fprintf(stderr, "Didn't find both bulk endpoints\n");
		return 1;
	}

	/*
	 * Unbind usb-storage from the interface.  ENODATA is tolerated:
	 * it is taken to mean that no driver was bound in the first place.
	 */
	ctl.ifno = ifnum;
	ctl.ioctl_code = USBDEVFS_DISCONNECT;
	ctl.data = NULL;
	rc = ioctl(fd, USBDEVFS_IOCTL, &ctl);
	if (rc == -1 && errno != ENODATA) {
		perror("Unable to unbind the kernel driver");
		return 1;
	}

	/* Claim the interface */
	rc = ioctl(fd, USBDEVFS_CLAIMINTERFACE, &ifnum);
	if (rc == -1) {
		perror("Unable to claim interface");
		return 1;
	}

	/* ep_in is known now, so the URBs can be filled in */
	for (i = 0; i < NUM_URBS; ++i)
		init_urb(&urbs[i]);

	return 0;
}

/*
 * Probe the drive with a SCSI TEST UNIT READY command.
 *
 * A 31-byte CBW with no data phase is written to ep_out, then the
 * 13-byte CSW is read back from ep_in and its length, the last byte of
 * its signature and its status byte are checked.
 *
 * Returns 0 if the device reported status 0, otherwise 1 (after
 * printing a message).
 */
int check_device(void)
{
	static unsigned char		cbw[31] = {
		'U', 'S', 'B', 'C',		/* Signature */
		100, 0, 0, 0,			/* Tag */
		0, 0, 0, 0,			/* DataTransferLength */
		0, 0, 6,			/* Flags, LUN, Length of CDB */
		0, 0, 0, 0, 0, 0,		/* CDB: TEST UNIT READY */
	};
	unsigned char			csw[13];
	struct usbdevfs_bulktransfer	xfer;
	int				nbytes;

	/* Command phase: send the CBW */
	xfer.ep = ep_out;
	xfer.len = sizeof(cbw);
	xfer.timeout = 1000;
	xfer.data = cbw;
	if (ioctl(fd, USBDEVFS_BULK, &xfer) < 0) {
		perror("Unable to send TEST UNIT READY");
		return 1;
	}

	/* Status phase: fetch the CSW (same timeout as above) */
	xfer.ep = ep_in;
	xfer.len = sizeof(csw);
	xfer.data = csw;
	nbytes = ioctl(fd, USBDEVFS_BULK, &xfer);
	if (nbytes < 0) {
		perror("Unable to get TEST UNIT READY status");
		return 1;
	}
	if (nbytes != xfer.len || csw[3] != 'S') {
		fprintf(stderr, "Invalid CSW data\n");
		return 1;
	}

	/* The final CSW byte is the command status; 0 means passed */
	if (csw[12] != 0) {
		fprintf(stderr, "TEST UNIT READY status %d\n", csw[12]);
		return 1;
	}

	return 0;
}

/*
 * Send the CBW for a READ(10) of NUM_BLOCKS blocks starting at LBA 0.
 *
 * Only the command phase happens here; the data and the CSW arrive
 * through the bulk-in URBs that are already queued.  After a successful
 * send the low byte of the CBW tag is incremented so that consecutive
 * commands carry different tags.
 *
 * Returns 0 on success, 1 if the CBW could not be sent.
 */
int send_READ10(void)
{
	int				rc;
	struct usbdevfs_bulktransfer	bulk;
	static unsigned char		cbw[31] = {
		'U', 'S', 'B', 'C',		/* Signature */
		101, 0, 0, 0,			/* Tag */
		/* DataTransferLength = NUM_BLOCKS * 512, little-endian */
		0,
		(NUM_BLOCKS << 1) & 0xff,
		(NUM_BLOCKS >> 7) & 0xff,
		(NUM_BLOCKS >> 15) & 0xff,
		/*
		 * Flags, LUN, Length of CDB.  Bit 7 of the flags selects
		 * the data direction; READ(10) moves data device-to-host,
		 * so it must be set (0x80 = Data-In).
		 */
		0x80, 0, 10,
		0x28, 0,			/* CDB: READ(10), LUN 0 */
		0, 0, 0, 0,			/* LBA = 0  */
		0,				/* Reserved */
		NUM_BLOCKS >> 8,		/* Block count (big-endian) */
		NUM_BLOCKS & 0xff,
		0,				/* Control */
	};

	bulk.ep = ep_out;
	bulk.len = sizeof(cbw);
	bulk.timeout = 1000;
	bulk.data = cbw;

	rc = ioctl(fd, USBDEVFS_BULK, &bulk);
	if (rc < 0) {
		perror("Unable to send READ(10)");
		return 1;
	}

	++cbw[4];		/* Increment the tag */
	return 0;
}

/*
 * Reap one completed URB, validate it, and resubmit URBs as needed.
 *
 * If start_read is set, a new READ(10) is sent first.  The function
 * then polls USBDEVFS_REAPURBNDELAY until a URB is available or the
 * deadline passes.  With use_timeout == 0 the deadline is "now", so the
 * function gives up as soon as a poll finds nothing ready; otherwise it
 * keeps polling for DEFAULT_TIMEOUT seconds.
 *
 * Returns:
 *	 0  a URB was reaped and handled
 *	 1  an error occurred (a message has been printed)
 *	-1  no URB became available before the deadline
 */
int wait_for_one_urb(int use_timeout)
{
	int			rc;
	time_t			tend;
	struct usbdevfs_urb	*u;

	/* If a READ(10) command is needed, send it */
	if (start_read) {
		if (send_READ10() != 0)
			return 1;
		start_read = 0;
	}

	/* Compute the polling deadline (one-second granularity) */
	tend = time(NULL);
	if (use_timeout)
		tend += DEFAULT_TIMEOUT;

	/* Busy-poll for a completed URB; EAGAIN means none is ready yet */
	for (;;) {
		rc = ioctl(fd, USBDEVFS_REAPURBNDELAY, &u);
		if (rc == 0)
			break;
		if (rc == -1) {
			if (errno != EAGAIN) {
				perror("Error in REAPURBNDELAY");
				return 1;
			}
		}
		if (time(NULL) >= tend)
			return -1;		/* Timed out */
	}

	// printf("Reaped URB %d status %d actlen %d\n",
	//		u - urbs, u->status, u->actual_length);

	/*
	 * Make sure we are in sync: the reaped URB must be either the one
	 * we deliberately unlinked or the next one in ring order.  In the
	 * latter case advance next_urb, wrapping at the end of the array.
	 */
	if (u == unlinked_urb) {
		unlinked_urb = NULL;
	} else if (u == next_urb) {
		if (++next_urb == urbs_end)
			next_urb = urbs;
	} else {
		fprintf(stderr, "Wrong URB completed\n");
		return 1;
	}

	/*
	 * usercontext is set non-NULL by the callers just before they
	 * issue DISCARDURB, so -ENOENT is acceptable only for such URBs.
	 */
	if (u->status == -ENOENT && u->usercontext)
		;	/* Okay, URB was unlinked */
	else if (u->status == 0)
		;	/* Okay, URB completed normally */
	else {
		fprintf(stderr, "Invalid URB status %d, act len %d\n",
				u->status, u->actual_length);
		return 1;
	}

	/* Classify the transfer by its length */
	if (u->actual_length == 512) {		/* Data block */
		++block_count;
		/* NOTE(review): only warns and carries on -- confirm intended */
		if (block_count > NUM_BLOCKS)
			fprintf(stderr, "Block count is too large\n");
	} else if (u->actual_length == 13) {	/* CSW */
		/* All URBs share buf, so the CSW bytes are read from there */
		if (buf[3] != 'S' || buf[12] != 0) {
			fprintf(stderr, "Invalid CSW packet\n");
			return 1;
		}
		if (block_count != NUM_BLOCKS) {
			fprintf(stderr, "Block count is too small: %d\n",
					block_count);
			return 1;
		}

		/* One full READ(10) finished; report progress every 100 */
		++loop_count;
		if (loop_count % 100 == 0)
			printf("%d\n", loop_count);

		/* Arrange for the next call to start a new READ(10) */
		start_read = 1;
		block_count = 0;
	}
	else if (u->actual_length == 0) {	/* Must have been unlinked */
		if (!u->usercontext) {
			fprintf(stderr, "Got zero-length packet\n");
			return 1;
		}
	} else {
		/* NOTE(review): only warns and carries on -- confirm intended */
		fprintf(stderr, "Got invalid packet length: %d\n",
				u->actual_length);
	}

	/*
	 * Resubmit if we're not waiting for an unlinked URB.  Starting at
	 * the URB just reaped, submit each URB in ring order (clearing its
	 * "unlinked" marker) until next_urb is reached.
	 */
	if (!unlinked_urb) {
		do {
			u->usercontext = NULL;
			rc = ioctl(fd, USBDEVFS_SUBMITURB, u);
			if (rc < 0) {
				perror("Error resubmitting bulk-in urb");
				return 1;
			}
			if (++u == urbs_end)
				u = urbs;
		} while (u != next_urb);
	}

	return 0;
}

/*
 * Wait for one URB using the DEFAULT_TIMEOUT deadline.
 *
 * If the wait times out, the URB expected next is discarded once (and
 * marked through its usercontext) and the wait is repeated.  A second
 * timeout on an URB that is already marked is treated as a failure.
 *
 * Returns whatever non-negative value wait_for_one_urb() produced
 * (0 for success, 1 for error), or 1 after a repeated timeout.
 */
int wait_for_urb_with_timeout(void)
{
	for (;;) {
		int	status = wait_for_one_urb(1);

		if (status >= 0)
			return status;

		/* Timed out.  Give up if this URB was already unlinked. */
		if (next_urb->usercontext)
			return 1;

		/* Try to unlink the next URB, then wait again */
		printf("URB timed out; bug may be present\n");
		next_urb->usercontext = next_urb;
		ioctl(fd, USBDEVFS_DISCARDURB, next_urb);
	}
}

/*
 * Reap URBs without waiting until none is ready.
 * Returns nonzero if wait_for_one_urb() reported an error, 0 once it
 * reports that nothing more is available.
 */
static int drain_ready_urbs(void)
{
	int	rc;

	do {
		rc = wait_for_one_urb(0);
	} while (rc == 0);

	return rc > 0;
}

/*
 * Main test loop.  Queue every URB in the ring, then repeat forever:
 * let at least two URBs complete, drain whatever else is ready, discard
 * the most recently resubmitted URB, wait for it to come back, and
 * drain again.  Returns only when some step reports an error.
 */
void run_test(void)
{
	struct usbdevfs_urb	*victim;
	int			n;

	/* Initial submission of the whole ring, in order */
	for (victim = urbs; victim != urbs_end; ++victim) {
		if (ioctl(fd, USBDEVFS_SUBMITURB, victim) < 0) {
			perror("Error submitting bulk-in urb");
			return;
		}
	}
	next_urb = urbs;

	/* If everything works right, the test never stops */
	for (;;) {

		/* Wait for at least two URBs to complete */
		for (n = 0; n < 2; ++n) {
			if (wait_for_urb_with_timeout() != 0)
				return;
		}

		/* Wait until no URBs are ready */
		if (drain_ready_urbs())
			return;

		/*
		 * Unlink the URB which was just resubmitted: the one
		 * immediately before next_urb in ring order.
		 */
		victim = (next_urb == urbs ? urbs_end : next_urb) - 1;
		unlinked_urb = victim;

		victim->usercontext = victim;
		if (ioctl(fd, USBDEVFS_DISCARDURB, victim) == -1) {
			perror("Error in DISCARDURB");
			return;
		}

		/* Wait until the unlinked URB completes */
		while (unlinked_urb) {
			if (wait_for_one_urb(0) > 0)
				return;
		}

		/* Wait until no URBs are ready */
		if (drain_ready_urbs())
			return;
	}
}

/*
 * Entry point.  Takes the usbfs device path as the single argument,
 * opens it read/write, sets up the interface, checks that the drive
 * answers TEST UNIT READY, and then runs the test loop.
 *
 * Returns 1 on a usage error or if setup fails, 0 once run_test() has
 * returned.  The device file is closed on every path after a
 * successful open.
 */
int main(int argc, char **argv)
{
	int	rc = 1;

	if (argc != 2) {
		/* Usage errors are diagnostics, so they go to stderr */
		fprintf(stderr, "Usage:  ehci-test device-filename\n");
		return 1;
	}

	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("Error in open");
		return 1;
	}

	if (get_interface() == 0 && check_device() == 0) {
		run_test();
		rc = 0;
	}

	close(fd);
	return rc;
}

[Index of Archives]     [Linux Media]     [Linux Input]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Old Linux USB Devel Archive]

  Powered by Linux