Dave Chinner found a 10% performance regression with ext3 when using DIO to fill holes instead of buffered IO. On large IOs, the ext3 get_block routine will send more than a page's worth of blocks back to DIO via a single buffer_head with a large b_size value. The DIO code iterates through this massive block and tests for a boundary buffer over and over again. For every block size unit spanned by the big map_bh, the boundary bit is tested and a bio may be forced down to the block layer. There are two potential fixes. One is to ignore the boundary bit on large regions returned by the FS: DIO can't tell which part of the big region was a boundary, and so it may not be a good idea to trust the hint. The other, which this patch takes, is to just clear the boundary bit after using it once. It is 10% faster for a streaming DIO write with a block size of 512k on my SATA drive. Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx> diff -r 3bd838f3dc06 -r 9d3d4e0f01fe fs/direct-io.c --- a/fs/direct-io.c Thu Dec 21 15:31:31 2006 -0500 +++ b/fs/direct-io.c Thu Dec 21 15:31:31 2006 -0500 @@ -610,7 +610,6 @@ static int dio_new_bio(struct dio *dio, nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); BUG_ON(nr_pages <= 0); ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); - dio->boundary = 0; out: return ret; } @@ -664,12 +663,6 @@ static int dio_send_cur_page(struct dio */ if (dio->final_block_in_bio != dio->cur_page_block) dio_bio_submit(dio); - /* - * Submit now if the underlying fs is about to perform a - * metadata read - */ - if (dio->boundary) - dio_bio_submit(dio); } if (dio->bio == NULL) { @@ -686,6 +679,12 @@ static int dio_send_cur_page(struct dio BUG_ON(ret != 0); } } + /* + * Submit now if the underlying fs is about to perform a + * metadata read + */ + if (dio->boundary) + dio_bio_submit(dio); out: return ret; } @@ -712,6 +711,10 @@ submit_page_section(struct dio *dio, str unsigned offset, unsigned len, sector_t blocknr) { int ret = 0; + int boundary = dio->boundary; + + 
/* don't let dio_send_cur_page do the boundary too soon */ + dio->boundary = 0; if (dio->rw & WRITE) { /* @@ -728,17 +731,7 @@ submit_page_section(struct dio *dio, str (dio->cur_page_block + (dio->cur_page_len >> dio->blkbits) == blocknr)) { dio->cur_page_len += len; - - /* - * If dio->boundary then we want to schedule the IO now to - * avoid metadata seeks. - */ - if (dio->boundary) { - ret = dio_send_cur_page(dio); - page_cache_release(dio->cur_page); - dio->cur_page = NULL; - } - goto out; + goto out_send; } /* @@ -757,6 +750,18 @@ submit_page_section(struct dio *dio, str dio->cur_page_offset = offset; dio->cur_page_len = len; dio->cur_page_block = blocknr; + +out_send: + /* + * If dio->boundary then we want to schedule the IO now to + * avoid metadata seeks. + */ + if (boundary) { + dio->boundary = 1; + ret = dio_send_cur_page(dio); + page_cache_release(dio->cur_page); + dio->cur_page = NULL; + } out: return ret; } @@ -962,7 +967,16 @@ do_holes: this_chunk_bytes = this_chunk_blocks << blkbits; BUG_ON(this_chunk_bytes == 0); - dio->boundary = buffer_boundary(map_bh); + /* + * get_block may return more than one page worth + * of blocks. Make sure only the last io we + * send down for this region is a boundary + */ + if (dio->blocks_available == this_chunk_blocks) + dio->boundary = buffer_boundary(map_bh); + else + dio->boundary = 0; + ret = submit_page_section(dio, page, offset_in_page, this_chunk_bytes, dio->next_block_for_io); if (ret) { - To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html