From: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> This is a port of the XIP functionality found in the current version of ext2. Signed-off-by: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> Reviewed-by: Andreas Dilger <andreas.dilger@xxxxxxxxx> [heavily tweaked] Signed-off-by: Matthew Wilcox <matthew.r.wilcox@xxxxxxxxx> --- Documentation/filesystems/ext4.txt | 2 ++ fs/ext4/ext4.h | 2 ++ fs/ext4/file.c | 47 ++++++++++++++++++++++++---- fs/ext4/indirect.c | 19 +++++++---- fs/ext4/inode.c | 64 +++++++++++++++++++++++++++----------- fs/ext4/namei.c | 10 ++++-- fs/ext4/super.c | 39 ++++++++++++++++++++++- 7 files changed, 150 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 919a329..06dab5e 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -386,6 +386,8 @@ max_dir_size_kb=n This limits the size of directories so that any i_version Enable 64-bit inode version support. This option is off by default. 
+xip Use execute in place (no caching) if possible + Data Mode ========= There are 3 different data modes: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a48d367..5b160da 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -954,6 +954,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_XIP 0x00200 /* Execute in place */ #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -2569,6 +2570,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_xip_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); extern void ext4_unwritten_wait(struct inode *inode); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3da2194..d6ae6be 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -190,7 +190,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - if (unlikely(iocb->ki_filp->f_flags & O_DIRECT)) + if (io_is_direct(iocb->ki_filp)) ret = ext4_file_dio_write(iocb, iov, nr_segs, pos); else ret = generic_file_aio_write(iocb, iov, nr_segs, pos); @@ -198,6 +198,21 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, return ret; } +#ifdef CONFIG_FS_XIP +static int ext4_xip_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return xip_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? 
*/ +} + +static const struct vm_operations_struct ext4_xip_vm_ops = { + .fault = ext4_xip_fault, + .remap_pages = generic_file_remap_pages, +}; +#else +#define ext4_xip_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = ext4_page_mkwrite, @@ -206,12 +221,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct address_space *mapping = file->f_mapping; - - if (!mapping->a_ops->readpage) - return -ENOEXEC; file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_XIP(file_inode(file))) { + vma->vm_ops = &ext4_xip_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -609,6 +625,25 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_XIP +const struct file_operations ext4_xip_file_operations = { + .llseek = ext4_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ext4_file_write, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 594009f..ef2bdef 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -686,15 +686,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - ext4_get_block, NULL, NULL, 0); + if (IS_XIP(inode)) + ret = xip_do_io(rw, iocb, inode, iov, offset, nr_segs, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + 
inode->i_sb->s_bdev, iov, offset, + nr_segs, ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iov, - offset, nr_segs, ext4_get_block); + if (IS_XIP(inode)) + ret = xip_do_io(rw, iocb, inode, iov, offset, nr_segs, + ext4_get_block, NULL, 0); + else + ret = blockdev_direct_IO(rw, iocb, inode, iov, + offset, nr_segs, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c767666..8b73d77 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -663,6 +663,18 @@ found: WARN_ON(1); } + /* this is probably wrong for ext4. unlike ext2, ext4 supports + * uninitialised extents, so we should probably be hooking + * into the "make it initialised" code instead. */ + if (IS_XIP(inode)) { + ret = xip_clear_blocks(inode, map->m_pblk, + map->m_len << inode->i_blkbits); + if (ret) { + retval = ret; + goto has_zeroout; + } + } + /* * If the extent has been zeroed out, we don't need to update * extent status tree. @@ -3152,13 +3164,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iov, - offset, nr_segs, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_XIP(inode)) + ret = xip_do_io(rw, iocb, inode, iov, offset, nr_segs, + get_block_func, ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, + nr_segs, get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3323,14 +3336,7 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. 
The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; @@ -3421,6 +3427,22 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained within one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that corresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + if (IS_XIP(inode)) + return xip_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. 
We need to physically zero the tail end @@ -3939,7 +3961,8 @@ void ext4_set_inode_flags(struct inode *inode) { unsigned int flags = EXT4_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_XIP); if (flags & EXT4_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT4_APPEND_FL) @@ -3950,6 +3973,8 @@ void ext4_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, XIP)) + inode->i_flags |= S_XIP; } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4201,7 +4226,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, XIP)) + inode->i_fop = &ext4_xip_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4653,7 +4681,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. 
*/ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5a0408d..ac68129 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2250,7 +2250,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, XIP)) + inode->i_fop = &ext4_xip_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2314,7 +2317,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, XIP)) + inode->i_fop = &ext4_xip_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c977f4e..309a1a3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1156,7 +1156,7 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_xip, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, @@ -1218,6 +1218,7 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_xip, "xip"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, @@ -1400,6 +1401,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, 
MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_xip, EXT4_MOUNT_XIP, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1638,6 +1640,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif +#ifndef CONFIG_FS_XIP + } else if (token == Opt_xip) { + ext4_msg(sb, KERN_INFO, "xip option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -3551,6 +3558,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, XIP)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and xip"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3604,6 +3616,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_XIP) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for xip"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support xip"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4798,6 +4823,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, XIP)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and xip"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_XIP) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "xip flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_XIP; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) -- 1.8.4.rc3 -- To unsubscribe from this list: send the line "unsubscribe 
linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html