Sorry for taking so long to finish this. Here is the new patch based on Andreas's suggestions. Now the patch clears the EXT4_EOFBLOCKS_FL flag when we allocate beyond the maximum allocated block. I also made the EOFBLOCKS flag user visible and added the handling in ext4_ioctl as Andreas suggested. Index: linux-2.6.30.5/fs/ext4/inode.c =================================================================== --- linux-2.6.30.5.orig/fs/ext4/inode.c 2009-08-31 12:08:10.000000000 -0700 +++ linux-2.6.30.5/fs/ext4/inode.c 2009-09-23 21:42:33.000000000 -0700 @@ -3973,6 +3973,8 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; + inode->i_flags &= ~EXT4_EOFBLOCKS_FL; + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; @@ -4285,8 +4287,8 @@ void ext4_get_inode_flags(struct ext4_in { unsigned int flags = ei->vfs_inode.i_flags; - ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); + ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL|EXT4_IMMUTABLE_FL| + EXT4_NOATIME_FL|EXT4_DIRSYNC_FL|EXT4_EOFBLOCKS_FL); if (flags & S_SYNC) ei->i_flags |= EXT4_SYNC_FL; if (flags & S_APPEND) @@ -4297,6 +4299,8 @@ void ext4_get_inode_flags(struct ext4_in ei->i_flags |= EXT4_NOATIME_FL; if (flags & S_DIRSYNC) ei->i_flags |= EXT4_DIRSYNC_FL; + if (flags & FS_EOFBLOCKS_FL) + ei->i_flags |= EXT4_EOFBLOCKS_FL; } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -4807,7 +4811,9 @@ int ext4_setattr(struct dentry *dentry, } if (S_ISREG(inode->i_mode) && - attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || + (inode->i_flags & EXT4_EOFBLOCKS_FL))) { handle_t *handle; handle = ext4_journal_start(inode, 3); @@ -4838,6 +4844,11 @@ int ext4_setattr(struct dentry *dentry, goto err_out; } } + if ((inode->i_flags & EXT4_EOFBLOCKS_FL)) { + rc = vmtruncate(inode, 
attr->ia_size); + if (rc) + goto err_out; + } } rc = inode_setattr(inode, attr); Index: linux-2.6.30.5/include/linux/fs.h =================================================================== --- linux-2.6.30.5.orig/include/linux/fs.h 2009-08-31 12:08:10.000000000 -0700 +++ linux-2.6.30.5/include/linux/fs.h 2009-09-10 21:27:30.000000000 -0700 @@ -343,9 +343,10 @@ struct inodes_stat_t { #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define FS_EXTENT_FL 0x00080000 /* Extents */ #define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ +#define FS_EOFBLOCKS_FL 0x00200000 /* Blocks allocated beyond EOF */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ -#define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ +#define FS_FL_USER_VISIBLE 0x0023DFFF /* User visible flags */ #define FS_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ Index: linux-2.6.30.5/fs/ext4/ext4.h =================================================================== --- linux-2.6.30.5.orig/fs/ext4/ext4.h 2009-08-31 12:08:10.000000000 -0700 +++ linux-2.6.30.5/fs/ext4/ext4.h 2009-09-10 21:28:14.000000000 -0700 @@ -235,9 +235,10 @@ struct flex_groups { #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ +#define EXT4_EOFBLOCKS_FL 0x00200000 /* Blocks allocated beyond EOF (bit reserved in fs.h) */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +#define EXT4_FL_USER_VISIBLE 0x002BDFFF /* User visible flags */ #define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ /* Flags that should be inherited by new inodes from their parent. 
*/ Index: linux-2.6.30.5/fs/ext4/extents.c =================================================================== --- linux-2.6.30.5.orig/fs/ext4/extents.c 2009-09-01 18:14:58.000000000 -0700 +++ linux-2.6.30.5/fs/ext4/extents.c 2009-09-23 22:12:22.000000000 -0700 @@ -2788,7 +2788,7 @@ int ext4_ext_get_blocks(handle_t *handle { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; - struct ext4_extent newex, *ex; + struct ext4_extent newex, *ex, *last_ex; ext4_fsblk_t newblock; int err = 0, depth, ret, cache_type; unsigned int allocated = 0; @@ -2968,6 +2968,14 @@ int ext4_ext_get_blocks(handle_t *handle newex.ee_len = cpu_to_le16(ar.len); if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); + + if (unlikely(inode->i_flags & EXT4_EOFBLOCKS_FL)) { + BUG_ON(!eh->eh_entries); + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) + inode->i_flags &= ~EXT4_EOFBLOCKS_FL; + } err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { /* free data blocks we just allocated */ @@ -3095,6 +3103,13 @@ static void ext4_falloc_update_inode(str i_size_write(inode, new_size); if (new_size > EXT4_I(inode)->i_disksize) ext4_update_i_disksize(inode, new_size); + } else { + /* + * Mark that we allocate beyond EOF so the subsequent truncate + * can proceed even if the new size is the same as i_size. 
+ */ + if (new_size > i_size_read(inode)) + inode->i_flags |= EXT4_EOFBLOCKS_FL; } } Index: linux-2.6.30.5/fs/ext4/ioctl.c =================================================================== --- linux-2.6.30.5.orig/fs/ext4/ioctl.c 2009-08-16 14:19:38.000000000 -0700 +++ linux-2.6.30.5/fs/ext4/ioctl.c 2009-09-23 22:04:47.000000000 -0700 @@ -92,6 +92,16 @@ long ext4_ioctl(struct file *filp, unsig flags &= ~EXT4_EXTENTS_FL; } + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + /* free the space reserved with fallocate KEEPSIZE */ + vmtruncate(inode, inode->i_size); + handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); Jiaying On Wed, Sep 2, 2009 at 10:20 PM, Jiaying Zhang <jiayingz@xxxxxxxxxx> wrote: > > On Wed, Sep 2, 2009 at 1:41 AM, Andreas Dilger<adilger@xxxxxxx> wrote: > > On Aug 31, 2009 16:33 -0700, Jiaying Zhang wrote: > >> > EXT4_KEEPSIZE_FL should only be cleared if there were writes to > >> > the end of the fallocated space. In that regard, I think the name > >> > of this flag should be changed to something like "EXT4_EOFBLOCKS_FL" > >> > to indicate that blocks are allocated beyond the end of file (i_size). > >> > >> Thanks for catching this! I changed the patch to only clear the flag > >> when the new_size is larger than i_size and changed the flag name > >> as you suggested. It would be nice if we only clear the flag when we > >> write beyond the fallocated space, but this seems hard to detect > >> because we no longer have the allocated size once that keepsize > >> fallocate call returns. > > > > The problem is that if e2fsck depends on the EXT4_EOFBLOCKS_FL set > > for fallocate-beyond-EOF then it is worse to clear it than to leave > > it set. At worst, leaving the flag set results in too many truncates > > on the file. 
Clearing the flag when not correct may result in user > > visible data corruption if the file size is extended... > > > >> Here is the new patch: > >> > >> --- .pc/fallocate_keepsizse.patch/fs/ext4/extents.c 2009-08-31 > >> 12:08:10.000000000 -0700 > >> +++ fs/ext4/extents.c 2009-08-31 15:51:13.000000000 -0700 > >> @@ -3091,11 +3091,19 @@ static void ext4_falloc_update_inode(str > >> * the file size. > >> */ > >> if (!(mode & FALLOC_FL_KEEP_SIZE)) { > >> + if (new_size > i_size_read(inode)) { > >> i_size_write(inode, new_size); > >> + inode->i_flags &= ~EXT4_EOFBLOCKS_FL; > > > > This again isn't quite correct, since the EOFBLOCKS_FL shouldn't > > be cleared unless new_size is beyond the allocated size. The > > allocation code itself might be a better place to clear this, > > since it knows whether there were new blocks being added beyond > > the current max allocated block. > > We were thinking to clear this flag when we need to allocate new > blocks, but I was not sure how to get the current max allocated > block -- that is mostly because I just started working on the ext4 > code. After digging into the ext4 allocation code today, I think we > can put the check&clear in ext4_ext_get_blocks: > > @@ -2968,6 +2968,14 @@ int ext4_ext_get_blocks(handle_t *handle > newex.ee_len = cpu_to_le16(ar.len); > if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ > ext4_ext_mark_uninitialized(&newex); > + > + if (unlikely(inode->i_flags & EXT4_EOFBLOCKS_FL)) { > + BUG_ON(!eh->eh_entries); > + last_ex = EXT_LAST_EXTENT(eh); > + if (iblock + max_blocks > le32_to_cpu(last_ex->ee_block) > + + ext4_ext_get_actual_len(last_ex)) > + inode->i_flags &= ~EXT4_EOFBLOCKS_FL; > + } > err = ext4_ext_insert_extent(handle, inode, path, &newex); > if (err) { > /* free data blocks we just allocated */ > > Again, I just started looking at this part of code, so please let me know > if I am in the right direction. 
> > Another thing I am not sure about is whether we can allocate a non-data block, > like extended attributes, beyond the current max block without changing > the i_size. In that case, clearing the EOFBLOCKS flag will be wrong. > > >> #define FS_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */ > > > > It probably isn't a bad idea to make this flag user-visible, since it > > would allow scanning for files that have excess space reserved (e.g. > > if the filesystem is getting full). Making it user-settable (i.e. > > clearable) would essentially mean truncating the file to i_size without > > updating the timestamps so that the reserved space is discarded. I > > don't think there is any value in allowing a user to turn this flag on > > for a file. > > So to make it user-settable, we need to add the handling in ext4_ioctl > that calls vmtruncate when the flag is to be cleared. But how can we get > the right size to truncate to in that case? Can we just set that to the > max initialized block shifted by the block size? But that may also truncate > the blocks reserved without the KEEP_SIZE flag. > > Jiaying > > > > > Cheers, Andreas > > -- > > Andreas Dilger > > Sr. Staff Engineer, Lustre Group > > Sun Microsystems of Canada, Inc. > > > > -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html