Today I used blktrace to observe a strange (at least to me) behavior at the block layer. I wonder if anybody can shed some light? Thanks. Here are the details. ... previous requests are ok. 8,16 0 782 7.025277381 4915 Q W 6768 + 32 [istiod1] 8,16 0 783 7.025283850 4915 G W 6768 + 32 [istiod1] 8,16 0 784 7.025286799 4915 P R [istiod1] 8,16 0 785 7.025287794 4915 I W 6768 + 32 [istiod1] The write request to LBA 6768 was inserted into the queue. 8,16 0 786 7.026059876 4915 Q R 6768 + 32 [istiod1] 8,16 0 787 7.026064451 4915 G R 6768 + 32 [istiod1] 8,16 0 788 7.026066369 4915 I R 6768 + 32 [istiod1] A read request to the same LBA was inserted into the queue as well. Though it cannot be merged, I thought it could be satisfied directly by the previous write request; it seems the merge function does not consider this case. 8,16 0 789 7.034883766 0 UT R [swapper] 2 8,16 0 790 7.034904284 9 U R [kblockd/0] 2 Unplug because of a read. 8,16 0 791 7.045272094 9 D R 6768 + 32 [kblockd/0] 8,16 0 792 7.045654039 9 C R 6768 + 32 [0] Strangely, the read request was sent to the device before the write request and thus returned wrong data. 8,16 0 793 7.045669809 9 D W 6768 + 32 [kblockd/0] 8,16 0 794 7.049840970 0 C W 6768 + 32 [0] Write finished. So the read got wrong data back to the application. One thing I am not sure about is where (front/back) the requests were inserted into the queue and who messed up the order here. Is it possible that, for the I event, we could log an extra flag so we know where the request was inserted? ---- here is the code that generates this I/O ----. The disk is a regular disk and the current scheduler is CFQ. /** * blockio_make_request(): The function translates an iscsi-request into * a number of requests to the corresponding block device. 
**/ int blockio_make_request(struct iet_volume *lu, struct tio *tio, int rw) { struct blockio_data *p; struct block_device *target_device; struct request_queue *target_queue; struct bio *target_bio; int max_sectors; int pg_number; int page_count; int counter; struct page *page; mm_segment_t oldfs; u32 offset, size; u32 len; loff_t ppos; int i; ssize_t ret; DECLARE_COMPLETION(work); p = (struct blockio_data *)lu->private; assert(p); target_device = p->device; assert(target_device); size = tio->size; offset = tio->offset; ppos = (loff_t) tio->idx << PAGE_SHIFT; ppos += offset; /* Get maximum number of sectors / pages that could be sent to target * block device within a single bio-structure */ target_queue = target_device->bd_disk->queue; if (target_queue) { max_sectors = target_queue->max_sectors; if (max_sectors > 0) { pg_number = (max_sectors << SECTOR_SIZE_BITS) >> PAGE_SHIFT; if (pg_number > tio->pg_cnt) pg_number = tio->pg_cnt; } else pg_number = tio->pg_cnt; } else { max_sectors = 0; pg_number = tio->pg_cnt; } page_count = 0; counter = tio->pg_cnt; while (counter > 0) { /* get new bio-structure */ target_bio = bio_alloc(GFP_NOIO, pg_number); if (!target_bio) { eprintk("I/O error: %d\n", page_count); return -ENOMEM; } /* Initialize bio */ target_bio->bi_sector = ppos >> SECTOR_SIZE_BITS; target_bio->bi_bdev = target_device; target_bio->bi_rw = rw; target_bio->bi_end_io = (bio_end_io_t *) blockio_bio_endio; if (rw == READ) target_bio->bi_private = &work; else { target_bio->bi_private = tio; tio_get(tio); } for (i = 0; i < pg_number; i++) { page = tio->pvec[page_count]; assert(page); /* calc access length for this page */ len = PAGE_SIZE; if (offset) len -= offset; if (size < len) len = size; /* bio_add_page returns len if successful */ ret = bio_add_page(target_bio, page, len, offset); if (!ret) { eprintk("I/O error: %ld\n", (long)ret); return -EIO; } /* offset valid only once */ offset = 0; size -= len; page_count++; } counter -= pg_number; ppos += (pg_number << 
PAGE_SHIFT); if (pg_number > counter) pg_number = counter; oldfs = get_fs(); set_fs(get_ds()); /* send bio to generic_make_request */ submit_bio(rw, target_bio); if (rw == READ) wait_for_completion(&work); set_fs(oldfs); } assert(!size); return 0; } -- Kernelnewbies: Help each other learn about the Linux kernel. Archive: http://mail.nl.linux.org/kernelnewbies/ FAQ: http://kernelnewbies.org/faq/