--/2994txjAzEdQwm5 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Hi, On Thu, May 02, 2002 at 01:29:49PM -0400, David Mansfield wrote: > The last thing I tried is running the program on a server here running the > beta of 'Red Hat Advanced Server' which is running their kernel > '2.4.9-26beta.31' and I got: > > 1399 blks/sec ext3-ordered !!! > > It turns out that this kernel ignores the O_SYNC on open... ha. It looks like a generic 2.4.9 bug. What's happening is that ext3 has its own extra logic for tracking buffers when it comes to writing pages from the page cache, and part of what the jbd layer does marks the pages as dirty. Then, the core VFS __block_prepare_write comes along, and it does if (!atomic_set_buffer_dirty(bh)) { __mark_dirty(bh); buffer_insert_inode_queue(bh, inode); need_balance_dirty = 1; } so if the buffer becomes dirty, it gets hooked onto the appropriate inode's queue where O_SYNC can find it. Unfortunately, ext3 has already marked it dirty, so this code doesn't run. It looks as if the same problem is still present in 2.4.18, except ameliorated somewhat by new code in ext3_file_write: /* * Nasty: if the file is subject to synchronous writes then we need * to force generic_osync_inode() to call ext3_write_inode(). * We do that by marking the inode dirty. This adds much more * computational expense than we need, but we're going to sync * anyway. */ if (IS_SYNC(inode) || (file->f_flags & O_SYNC)) mark_inode_dirty(inode); That's actually the wrong fix --- for a multi-page write, it only guarantees that the first page gets flushed (and not even that if another task happens to flush the inode out before the page write completes); and it forces an unnecessary inode sync even when we haven't modified i_size and don't need to sync metadata. So 2.4.18 is broken too, but so much less so that it doesn't show up in benchmarks. The 2.4.9 fix should fix 2.4.18 too, and will let us remove the above performance degradation. The patch I've been testing for 2.4.9 is attached. Cheers, Stephen --/2994txjAzEdQwm5 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="sct-osync.patch" --- linux/fs/ext3/inode.c.~1~ Thu May 2 19:23:39 2002 +++ linux/fs/ext3/inode.c Fri May 3 15:54:09 2002 @@ -949,11 +949,13 @@ } static int walk_page_buffers( handle_t *handle, + struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)( handle_t *handle, + struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -971,7 +973,7 @@ *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1004,7 +1006,7 @@ * write. */ -static int do_journal_get_write_access(handle_t *handle, +static int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { return ext3_journal_get_write_access(handle, bh); @@ -1028,7 +1030,7 @@ goto prepare_write_failed; if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, do_journal_get_write_access); if (ret) { /* @@ -1049,24 +1051,32 @@ return ret; } -static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_sync_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { - return ext3_journal_dirty_data(handle, bh, 0); + int ret = ext3_journal_dirty_data(handle, bh, 0); + if (bh->b_inode != inode) + buffer_insert_inode_queue(bh, inode); + return ret; } /* * For ext3_writepage(). We also brelse() the buffer to account for * the bget() which ext3_writepage() performs. */ -static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) +static int journal_dirty_async_data(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret = ext3_journal_dirty_data(handle, bh, 1); + if (bh->b_inode != inode) + buffer_insert_inode_queue(bh, inode); __brelse(bh); return ret; } /* For commit_write() in data=journal mode */ -static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +static int commit_write_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { set_bit(BH_Uptodate, &bh->b_state); return ext3_journal_dirty_metadata(handle, bh); @@ -1101,7 +1111,7 @@ int partial = 0; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, &partial, commit_write_fn); if (!partial) SetPageUptodate(page); @@ -1111,7 +1121,7 @@ EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; } else { if (ext3_should_order_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, inode, page->buffers, from, to, NULL, journal_dirty_sync_data); } /* Be careful here if generic_commit_write becomes a @@ -1193,7 +1203,8 @@ return generic_block_bmap(mapping,block,ext3_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { atomic_inc(&bh->b_count); return 0; @@ -1292,7 +1303,7 @@ create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); page_buffers = page->buffers; - walk_page_buffers(handle, page_buffers, 0, + walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, bget_one); } @@ -1310,7 +1321,7 @@ /* And attach them to the current transaction */ if (order_data) { - err = walk_page_buffers(handle, page_buffers, + err = walk_page_buffers(handle, inode, page_buffers, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); if (!ret) ret = err; --/2994txjAzEdQwm5--