On 25-Sep-2014 04:33:22 PM, Matthew Wilcox wrote: > Use an inode flag to tag inodes which should avoid using the page cache. > Convert ext2 to use it instead of mapping_is_xip(). Prevent I/Os to > files tagged with the DAX flag from falling back to buffered I/O. I agree that a DAX-enabled FS should not silently fall back to buffered I/O, since doing so would void some guarantees about the persistence of data that has been written to a DAX mmap()'d region. > > Signed-off-by: Matthew Wilcox <matthew.r.wilcox@xxxxxxxxx> > Reviewed-by: Jan Kara <jack@xxxxxxx> Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> > --- > fs/ext2/inode.c | 9 ++++++--- > fs/ext2/xip.h | 2 -- > include/linux/fs.h | 6 ++++++ > mm/filemap.c | 19 ++++++++++++------- > 4 files changed, 24 insertions(+), 12 deletions(-) > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 36d35c3..0cb0448 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -731,7 +731,7 @@ static int ext2_get_blocks(struct inode *inode, > goto cleanup; > } > > - if (ext2_use_xip(inode->i_sb)) { > + if (IS_DAX(inode)) { > /* > * we need to clear the block > */ > @@ -1201,7 +1201,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) > > inode_dio_wait(inode); > > - if (mapping_is_xip(inode->i_mapping)) > + if (IS_DAX(inode)) > error = xip_truncate_page(inode->i_mapping, newsize); > else if (test_opt(inode->i_sb, NOBH)) > error = nobh_truncate_page(inode->i_mapping, > @@ -1273,7 +1273,8 @@ void ext2_set_inode_flags(struct inode *inode) > { > unsigned int flags = EXT2_I(inode)->i_flags; > > - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); > + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | > + S_DIRSYNC | S_DAX); > if (flags & EXT2_SYNC_FL) > inode->i_flags |= S_SYNC; > if (flags & EXT2_APPEND_FL) > @@ -1284,6 +1285,8 @@ void ext2_set_inode_flags(struct inode *inode) > inode->i_flags |= S_NOATIME; > if (flags & EXT2_DIRSYNC_FL) > inode->i_flags |= S_DIRSYNC; > + if 
(test_opt(inode->i_sb, XIP)) > + inode->i_flags |= S_DAX; > } > > /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ > diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h > index 18b34d2..29be737 100644 > --- a/fs/ext2/xip.h > +++ b/fs/ext2/xip.h > @@ -16,9 +16,7 @@ static inline int ext2_use_xip (struct super_block *sb) > } > int ext2_get_xip_mem(struct address_space *, pgoff_t, int, > void **, unsigned long *); > -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) > #else > -#define mapping_is_xip(map) 0 > #define ext2_xip_verify_sb(sb) do { } while (0) > #define ext2_use_xip(sb) 0 > #define ext2_clear_xip_target(inode, chain) 0 > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 9418772..e99e5c4 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1605,6 +1605,7 @@ struct super_operations { > #define S_IMA 1024 /* Inode has an associated IMA struct */ > #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ > #define S_NOSEC 4096 /* no suid or xattr security attributes */ > +#define S_DAX 8192 /* Direct Access, avoiding the page cache */ > > /* > * Note that nosuid etc flags are inode-specific: setting some file-system > @@ -1642,6 +1643,11 @@ struct super_operations { > #define IS_IMA(inode) ((inode)->i_flags & S_IMA) > #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) > #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) > +#ifdef CONFIG_FS_XIP > +#define IS_DAX(inode) ((inode)->i_flags & S_DAX) > +#else > +#define IS_DAX(inode) 0 > +#endif > > /* > * Inode state bits. Protected by inode->i_lock > diff --git a/mm/filemap.c b/mm/filemap.c > index 90effcd..fec4db9 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -1718,9 +1718,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) > * we've already read everything we wanted to, or if > * there was a short read because we hit EOF, go ahead > * and return. Otherwise fallthrough to buffered io for > - * the rest of the read. 
> + * the rest of the read. Buffered reads will not work for > + * DAX files, so don't bother trying. > */ > - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { > + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || > + IS_DAX(inode)) { > file_accessed(file); > goto out; > } > @@ -2584,13 +2586,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > loff_t endbyte; > > written = generic_file_direct_write(iocb, from, pos); > - if (written < 0 || written == count) > - goto out; > - > /* > - * direct-io write to a hole: fall through to buffered I/O > - * for completing the rest of the request. > + * If the write stopped short of completing, fall back to > + * buffered writes. Some filesystems do this for writes to > + * holes, for example. For DAX files, a buffered write will > + * not succeed (even if it did, DAX does not handle dirty > + * page-cache pages correctly). > */ > + if (written < 0 || written == count || IS_DAX(inode)) > + goto out; > + > pos += written; > count -= written; > > -- > 2.1.0 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> > > -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com Key fingerprint: 2A0B 4ED9 15F2 D3FA 45F5 B162 1728 0A97 8118 6ACF -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html