Although replying to self is somewhat bad etiquette... I've found at least one issue with this patch: Although the semantics seem correct, since the late-converted-to-init extents are not merged with neighbors, you can easily end up with thousands of extents :-( . Each write to fallocate'd space results in its own initialized extent. I'm not sure how expensive it would be to merge the extents when they are converted to initialized after the DIO write goes through. Curt On Tue, Jul 28, 2009 at 5:28 PM, Curt Wohlgemuth<curtw@xxxxxxxxxx> wrote: > This insures that direct IO writes to fallocate'd file space do not use the > page cache. > > > Signed-off-by: Curt Wohlgemuth <curtw@xxxxxxxxxx> > > --- > > I implemented Aneesh's ideas for this solution, but have some questions. > > I've verified that the page cache isn't used, regardless of whether > FALLOC_FL_KEEP_SIZE is used in the fallocate() call or not. > > The changes: > - New inode state flag to indicate that a DIO write is ongoing > > - ext4_ext_convert_to_initialized() will mark any new extent it creates > as uninitialized, if this new EXT4_STATE_DIO_WRITE flag is set. > > - ext4_direct_IO() will set this flag for any write. > > It now calls blockdev_direct_IO_own_locking() to do the I/O. > > After return from blockdev_direct_IO_own_locking() it clears the flag > and calls a new routine to mark all extents containing the returned > blocks as initialized. > > I'm a bit uncertain about the use of DIO_OWN_LOCKING; I looked at the XFS > code to see if it acquired any other locks while using this flag, but didn't > see any. Suggestions/corrections welcome. > > > diff -Naur orig/fs/ext4/ext4.h new/fs/ext4/ext4.h > --- orig/fs/ext4/ext4.h 2009-07-28 17:02:25.000000000 -0700 > +++ new/fs/ext4/ext4.h 2009-07-28 17:02:19.000000000 -0700 > @@ -272,6 +272,7 @@ > #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ > #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ > #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ > +#define EXT4_STATE_DIO_WRITE 0x00000020 /* This is a DIO write */ > > /* Used to pass group descriptor data when online resize is done */ > struct ext4_new_group_input { > diff -Naur orig/fs/ext4/ext4_extents.h new/fs/ext4/ext4_extents.h > --- orig/fs/ext4/ext4_extents.h 2009-07-28 17:02:40.000000000 -0700 > +++ new/fs/ext4/ext4_extents.h 2009-07-28 17:02:34.000000000 -0700 > @@ -242,5 +242,7 @@ > ext4_lblk_t *, ext4_fsblk_t *); > extern void ext4_ext_drop_refs(struct ext4_ext_path *); > extern int ext4_ext_check_inode(struct inode *inode); > +extern int ext4_ext_mark_extents_init(struct inode *inode, loff_t offset, > + int nr_bytes); > #endif /* _EXT4_EXTENTS */ > > diff -Naur orig/fs/ext4/extents.c new/fs/ext4/extents.c > --- orig/fs/ext4/extents.c 2009-07-28 17:02:54.000000000 -0700 > +++ new/fs/ext4/extents.c 2009-07-28 17:02:49.000000000 -0700 > @@ -2563,6 +2563,13 @@ > ex3->ee_block = cpu_to_le32(iblock); > ext4_ext_store_pblock(ex3, newblock); > ex3->ee_len = cpu_to_le16(allocated); > + > + /* For direct I/O, we mark this as uninitialized, and > + * set it as initialized when we finish the direct > + * IO. */ > + if (EXT4_I(inode)->i_state & EXT4_STATE_DIO_WRITE) > + ext4_ext_mark_uninitialized(ex3); > + > err = ext4_ext_insert_extent(handle, inode, path, ex3); > if (err == -ENOSPC) { > err = ext4_ext_zeroout(inode, &orig_ex); > @@ -2698,6 +2705,13 @@ > ex2->ee_block = cpu_to_le32(iblock); > ext4_ext_store_pblock(ex2, newblock); > ex2->ee_len = cpu_to_le16(allocated); > + > + /* For direct I/O, we mark this as uninitialized, and > + * set it as initialized when we finish the direct > + * IO. */ > + if (EXT4_I(inode)->i_state & EXT4_STATE_DIO_WRITE) > + ext4_ext_mark_uninitialized(ex2); > + > if (ex2 != ex) > goto insert; > /* > @@ -2763,6 +2777,109 @@ > return err; > } > > + > +static int handle_uninit_extent(struct ext4_ext_path *path, > + struct inode *inode, > + struct ext4_extent *ex) > +{ > + handle_t *handle; > + int credits; > + int started; > + int depth = ext_depth(inode); > + int ret = 0; > + > + if (ext4_ext_is_uninitialized(ex)) { > + handle = ext4_journal_current_handle(); > + started = 0; > + > + if (!handle) { > + credits = > + ext4_chunk_trans_blocks(inode, 1); > + handle = ext4_journal_start(inode, > + credits); > + if (IS_ERR(handle)) { > + ret = PTR_ERR(handle); > + goto out; > + } > + started = 1; > + } > + /* Mark as initialized */ > + ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)); > + > + ret = ext4_ext_dirty(handle, inode, path + depth); > + > + if (started) > + ext4_journal_stop(handle); > + } > +out: > + return ret; > +} > + > +/* Called after direct I/O writes, to make sure that all extents are marked > + * as initialized. */ > +int ext4_ext_mark_extents_init(struct inode *inode, > + loff_t offset, > + int nr_bytes) > +{ > + struct ext4_ext_path *path = NULL; > + struct ext4_extent *ex; > + int depth; > + int nr_blocks = nr_bytes >> inode->i_blkbits; > + ext4_lblk_t first_block = offset >> inode->i_blkbits; > + ext4_lblk_t block; > + int ret; > + > + /* Handle the extent for the first block */ > + path = ext4_ext_find_extent(inode, first_block, NULL); > + if (IS_ERR(path)) { > + ret = PTR_ERR(path); > + path = NULL; > + goto out; > + } > + depth = ext_depth(inode); > + ex = path[depth].p_ext; > + > + ret = handle_uninit_extent(path, inode, ex); > + if (ret != 0) { > + goto out; > + } > + > + /* Check the rest of the blocks; if they're in different > + * extents, handle those as well. */ > + for (block = first_block + 1; > + block < first_block + nr_blocks; > + block++) { > + > + /* Only look for new extents now */ > + if (block >= ex->ee_block && > + block < (ex->ee_block + ex->ee_len)) > + continue; > + > + ext4_ext_drop_refs(path); > + path = ext4_ext_find_extent(inode, first_block, path); > + if (IS_ERR(path)) { > + ret = PTR_ERR(path); > + path = NULL; > + goto out; > + } > + depth = ext_depth(inode); > + ex = path[depth].p_ext; > + > + ret = handle_uninit_extent(path, inode, ex); > + if (ret != 0) { > + goto out; > + } > + } > +out: > + if (path) { > + ext4_ext_drop_refs(path); > + kfree(path); > + } > + > + return ret; > +} > + > + > /* > * Block allocation/map/preallocation routine for extents based files > * > diff -Naur orig/fs/ext4/inode.c new/fs/ext4/inode.c > --- orig/fs/ext4/inode.c 2009-07-28 17:02:04.000000000 -0700 > +++ new/fs/ext4/inode.c 2009-07-28 17:23:59.000000000 -0700 > @@ -3318,11 +3318,33 @@ > ei->i_disksize = inode->i_size; > ext4_journal_stop(handle); > } > - } > + /* This prevents initializing extents too early */ > + if (ei->i_flags & EXT4_EXTENTS_FL) > + ei->i_state |= EXT4_STATE_DIO_WRITE; > + } > + > + ret = blockdev_direct_IO_own_locking(rw, iocb, inode, > + inode->i_sb->s_bdev, iov, > + offset, nr_segs, > + ext4_get_block, NULL); > + > + /* We now need to look at all extents for the blocks that were > + * written out, and mark them as initialized. Note that they've > + * already been converted, simply not yet marked as init. */ > + if (rw == WRITE && ei->i_flags & EXT4_EXTENTS_FL) { > + BUG_ON((ei->i_state & EXT4_STATE_DIO_WRITE) == 0); > + ei->i_state |= ~EXT4_STATE_DIO_WRITE; > + > + if (ret > 0) { > + int ret2; > > - ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, > - offset, nr_segs, > - ext4_get_block, NULL); > + ret2 = ext4_ext_mark_extents_init(inode, offset, ret); > + if (ret2 != 0) { > + ret = ret2; > + goto out; > + } > + } > + } > > if (orphan) { > int err; > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html