[no subject]

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I just quickly hacked fcntl to allow setting no. of ra_pages in
inode->i_ra_pages. Readahead algorithm then takes this setting whenever
it initializes the readahead control in "file_ra_state_init()"
So after one opens the file, we can set the fcntl F_SET_FILE_READAHEAD
to the preferred value on the open fd. 


Note: I am not saying the implementation could be 100% correct. But it's
just a quick working PoC to discuss whether this is the right approach
to the given problem.

-ritesh


<quick patch>
===========
fcntl: Add control to set per inode readahead pages

As of now readahead setting is done in units of pages at the bdi level.
(default 128K).
But sometimes the user wants to have more granular control over this
knob on a per file basis. This adds support to control readahead pages
on an open fd.

Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@xxxxxxxxx>
---
 fs/btrfs/defrag.c           |  2 +-
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/relocation.c       |  2 +-
 fs/btrfs/send.c             |  2 +-
 fs/cramfs/inode.c           |  2 +-
 fs/fcntl.c                  | 44 +++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4file.c           |  2 +-
 fs/open.c                   |  2 +-
 include/linux/fs.h          |  4 +++-
 include/uapi/linux/fcntl.h  |  2 ++
 mm/readahead.c              |  7 ++++--
 11 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..c6616d69a9af 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -261,7 +261,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 	range.len = (u64)-1;
 	range.start = cur;
 	range.extent_thresh = defrag->extent_thresh;
-	file_ra_state_init(ra, inode->i_mapping);
+	file_ra_state_init(ra, inode);
 
 	sb_start_write(fs_info->sb);
 	ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfa52ef40b06..ac240b148747 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -373,7 +373,7 @@ static void readahead_cache(struct inode *inode)
 	struct file_ra_state ra;
 	unsigned long last_index;
 
-	file_ra_state_init(&ra, inode->i_mapping);
+	file_ra_state_init(&ra, inode);
 	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
 	page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index bf267bdfa8f8..7688b79ae7e7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3057,7 +3057,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
 	if (ret)
 		goto out;
 
-	file_ra_state_init(ra, inode->i_mapping);
+	file_ra_state_init(ra, inode);
 
 	ret = setup_relocation_extent_mapping(rc);
 	if (ret)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7254279c3cc9..b22fc2a426e4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5745,7 +5745,7 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
 			return err;
 		}
 		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
-		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
+		file_ra_state_init(&sctx->ra, sctx->cur_inode);
 
 		/*
 		 * It's very likely there are no pages from this inode in the page
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b84d1747a020..917f09040f6e 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -214,7 +214,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
 	devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT;
 
 	/* Ok, read in BLKS_PER_BUF pages completely first. */
-	file_ra_state_init(&ra, mapping);
+	file_ra_state_init(&ra, mapping->host);
 	page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF);
 
 	for (i = 0; i < BLKS_PER_BUF; i++) {
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 49884fa3c81d..277afe78536f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -394,6 +394,44 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
 	return 0;
 }
 
+static long fcntl_get_file_readahead(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	u64 __user *argp = (u64 __user *)arg;
+	u64 ra_pages = READ_ONCE(inode->i_ra_pages);
+
+	if (copy_to_user(argp, &ra_pages, sizeof(*argp)))
+		return -EFAULT;
+	return 0;
+}
+
+
+static long fcntl_set_file_readahead(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	u64 __user *argp = (u64 __user *)arg;
+	u64 ra_pages;
+
+	if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
+		return -EPERM;
+
+	if (copy_from_user(&ra_pages, argp, sizeof(ra_pages)))
+		return -EFAULT;
+
+	WRITE_ONCE(inode->i_ra_pages, ra_pages);
+
+	/*
+	 * file->f_mapping->host may differ from inode. As an example,
+	 * blkdev_open() modifies file->f_mapping.
+	 */
+	if (file->f_mapping->host != inode)
+		WRITE_ONCE(file->f_mapping->host->i_ra_pages, ra_pages);
+
+	return 0;
+}
+
 /* Is the file descriptor a dup of the file? */
 static long f_dupfd_query(int fd, struct file *filp)
 {
@@ -552,6 +590,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_SET_RW_HINT:
 		err = fcntl_set_rw_hint(filp, cmd, arg);
 		break;
+	case F_GET_FILE_READAHEAD:
+		err = fcntl_get_file_readahead(filp, cmd, arg);
+		break;
+	case F_SET_FILE_READAHEAD:
+		err = fcntl_set_file_readahead(filp, cmd, arg);
+		break;
 	default:
 		break;
 	}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1cd9652f3c28..cee84aa8aa0f 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -388,7 +388,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
 	nfs_file_set_open_context(filep, ctx);
 	put_nfs_open_context(ctx);
 
-	file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping);
+	file_ra_state_init(&filep->f_ra, filep->f_mapping->host);
 	res = filep;
 out_free_name:
 	kfree(read_name);
diff --git a/fs/open.c b/fs/open.c
index 0f75e220b700..466c3affe161 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -961,7 +961,7 @@ static int do_dentry_open(struct file *f,
 	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
 	f->f_iocb_flags = iocb_flags(f);
 
-	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
+	file_ra_state_init(&f->f_ra, f->f_mapping->host);
 
 	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
 		return -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 12fe11b6e3dd..77ee23e30245 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -678,6 +678,8 @@ struct inode {
 	unsigned short          i_bytes;
 	u8			i_blkbits;
 	enum rw_hint		i_write_hint;
+	/* Per inode setting for max readahead in page_size units */
+	unsigned long		i_ra_pages;
 	blkcnt_t		i_blocks;
 
 #ifdef __NEED_I_SIZE_ORDERED
@@ -3271,7 +3273,7 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
 
 
 extern void
-file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
+file_ra_state_init(struct file_ra_state *ra, struct inode *inode);
 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6e6907e63bfc..b6e5413ca660 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -60,6 +60,8 @@
 #define F_SET_RW_HINT		(F_LINUX_SPECIFIC_BASE + 12)
 #define F_GET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 13)
 #define F_SET_FILE_RW_HINT	(F_LINUX_SPECIFIC_BASE + 14)
+#define F_GET_FILE_READAHEAD	(F_LINUX_SPECIFIC_BASE + 15)
+#define F_SET_FILE_READAHEAD	(F_LINUX_SPECIFIC_BASE + 16)
 
 /*
  * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be
diff --git a/mm/readahead.c b/mm/readahead.c
index 2bc3abf07828..71079ae1753d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -136,9 +136,12 @@
  * memset *ra to zero.
  */
 void
-file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
+file_ra_state_init(struct file_ra_state *ra, struct inode *inode)
 {
-	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
+	unsigned int ra_pages = inode->i_ra_pages ? inode->i_ra_pages :
+				inode_to_bdi(inode)->ra_pages;
+
+	ra->ra_pages = ra_pages;
 	ra->prev_pos = -1;
 }
 EXPORT_SYMBOL_GPL(file_ra_state_init);

2.39.5




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux