how to explain this?

Ming Zhang <blackmagic02881@xxxxxxxxx> · Sun, 10 Dec 2006 20:29:12 -0500

Today I used blktrace and observed a strange (at least to me) behavior at
the block layer. I wonder if anybody can shed some light on it? Thanks.

Here is the detail.

... previous requests are ok.

  8,16   0      782     7.025277381  4915  Q   W 6768 + 32 [istiod1]
  8,16   0      783     7.025283850  4915  G   W 6768 + 32 [istiod1]
  8,16   0      784     7.025286799  4915  P   R [istiod1]
  8,16   0      785     7.025287794  4915  I   W 6768 + 32 [istiod1]

Write request to lba 6768 was inserted to the queue.

  8,16   0      786     7.026059876  4915  Q   R 6768 + 32 [istiod1]
  8,16   0      787     7.026064451  4915  G   R 6768 + 32 [istiod1]
  8,16   0      788     7.026066369  4915  I   R 6768 + 32 [istiod1]

A read request to the same LBA was inserted into the queue as well. Though
it cannot be merged, I thought it could be satisfied directly by the
previous write request. It seems the merge function does not consider this.

  8,16   0      789     7.034883766     0 UT   R [swapper] 2
  8,16   0      790     7.034904284     9  U   R [kblockd/0] 2

Unplug because of a read.

  8,16   0      791     7.045272094     9  D   R 6768 + 32 [kblockd/0]
  8,16   0      792     7.045654039     9  C   R 6768 + 32 [0]

Strangely, the read request was sent to the device before the write request
and thus returned the wrong (stale) data.

  8,16   0      793     7.045669809     9  D   W 6768 + 32 [kblockd/0]
  8,16   0      794     7.049840970     0  C   W 6768 + 32 [0]

Write finished.

So the read returned wrong data to the application. One thing I am not sure
about is where (front/back) the requests are inserted into the queue and
who messes up the order here.

Is it possible for the I (insert) event to carry an extra flag, so we know
where the request was inserted?

---- Below is the code that generates this I/O. ---- The disk is a regular
disk and the current scheduler is CFQ.

/**
 * blockio_make_request(): The function translates an iscsi-request into 
 * a number of requests to the corresponding block device. 
 **/
int blockio_make_request(struct iet_volume *lu, struct tio *tio, int rw)
{
	struct blockio_data *p;
	struct block_device *target_device;
	struct request_queue *target_queue;
	struct bio *target_bio;
	int max_sectors;
	int pg_number;
	int page_count;
	int counter;
	struct page *page;
	mm_segment_t oldfs;

	u32 offset, size;
	u32 len;

	loff_t ppos;
	int i;
	ssize_t ret;
	DECLARE_COMPLETION(work);

	p = (struct blockio_data *)lu->private;
	assert(p);

	target_device = p->device;
	assert(target_device);

	size = tio->size;
	offset = tio->offset;

	ppos = (loff_t) tio->idx << PAGE_SHIFT;
	ppos += offset;

	/* Get maximum number of sectors / pages that could be sent to target 
	 * block device within a single bio-structure */

	target_queue = target_device->bd_disk->queue;
	if (target_queue) {
		max_sectors = target_queue->max_sectors;
		if (max_sectors > 0) {
			pg_number =
			    (max_sectors << SECTOR_SIZE_BITS) >> PAGE_SHIFT;
			if (pg_number > tio->pg_cnt)
				pg_number = tio->pg_cnt;
		} else
			pg_number = tio->pg_cnt;
	} else {
		max_sectors = 0;
		pg_number = tio->pg_cnt;
	}

	page_count = 0;
	counter = tio->pg_cnt;

	while (counter > 0) {
		/* get new bio-structure */
		target_bio = bio_alloc(GFP_NOIO, pg_number);
		if (!target_bio) {
			eprintk("I/O error:  %d\n", page_count);
			return -ENOMEM;
		}

		/* Initialize bio */
		target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS;
		target_bio->bi_bdev = target_device;
		target_bio->bi_rw = rw;
		target_bio->bi_end_io = (bio_end_io_t *) blockio_bio_endio;

		if (rw == READ)
			target_bio->bi_private = &work;
		else {
			target_bio->bi_private = tio;
			tio_get(tio);
		}
		for (i = 0; i < pg_number; i++) {
			page = tio->pvec[page_count];
			assert(page);

			/* calc access length for this page */
			len = PAGE_SIZE;
			if (offset)
				len -= offset;
			if (size < len)
				len = size;

			/* bio_add_page returns len if successful */
			ret = bio_add_page(target_bio, page, len, offset);
			if (!ret) {
				eprintk("I/O error:  %ld\n", (long)ret);
				return -EIO;
			}
			/* offset valid only once */
			offset = 0;
			size -= len;
			page_count++;
		}

		counter -= pg_number;
		ppos += (pg_number << PAGE_SHIFT);

		if (pg_number > counter)
			pg_number = counter;

		oldfs = get_fs();
		set_fs(get_ds());

		/* send bio to generic_make_request */
		submit_bio(rw, target_bio);

		if (rw == READ)
			wait_for_completion(&work);

		set_fs(oldfs);

	}
	assert(!size);

	return 0;
}

--
Kernelnewbies: Help each other learn about the Linux kernel.
Archive:       http://mail.nl.linux.org/kernelnewbies/
FAQ:           http://kernelnewbies.org/faq/