[PATCH 5/5] vfs: Add better VFS support for page_mkwrite when blocksize < pagesize

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



page_mkwrite() is meant to be used by filesystems to allocate blocks under a
page which is becoming writeably mmapped in some process address space. This
allows a filesystem to fail the page fault if there is not enough space
available, the user exceeds quota, or a similar problem happens, rather than
silently discarding data later when writepage is called.

On filesystems where blocksize < pagesize the situation is more complicated.
Think for example that blocksize = 1024, pagesize = 4096 and a process does:
  ftruncate(fd, 0);
  pwrite(fd, buf, 1024, 0);
  map = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED, fd, 0);
  map[0] = 'a';  ----> page_mkwrite() for index 0 is called
  ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */
  fsync(fd); ----> writepage() for index 0 is called

At the moment page_mkwrite() is called, filesystem can allocate only one block
for the page because i_size == 1024. Otherwise it would create blocks beyond
i_size which is generally undesirable. But later at writepage() time, we would
like to have blocks allocated for the whole page (and in principle we have to
allocate them because user could have filled the page with data after the
second ftruncate()). This patch introduces a framework which allows filesystems
to handle this with a reasonable effort.

Signed-off-by: Jan Kara <jack@xxxxxxx>
---
 fs/buffer.c                 |   68 +++++++++++++++++++++++++++++++++++++++++-
 include/linux/buffer_head.h |    4 ++
 include/linux/fs.h          |    4 ++-
 mm/filemap.c                |   10 ++++++-
 mm/filemap_xip.c            |    3 +-
 mm/memory.c                 |    2 +-
 mm/nommu.c                  |    2 +-
 mm/shmem.c                  |    2 +-
 8 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 15e7f40..00f8bdd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -40,6 +40,7 @@
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 #include <linux/mpage.h>
+#include <linux/rmap.h>
 #include <linux/bit_spinlock.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -2060,7 +2061,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	 * cannot change under us because we hold i_mutex.
 	 */
 	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+		extend_i_size(inode, pos, copied);
 		mark_inode_dirty(inode);
 	}
 
@@ -2360,6 +2361,69 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
 }
 
 /*
+ * Extend i_size of @inode to @pos + @len. On filesystems where
+ * blocksize < pagesize, write-protect a mmapped page straddling the old
+ * i_size so that page_mkwrite() is called on the next write access, and
+ * zero out the part of that page which newly falls inside i_size.
+ *
+ * @pos is offset to which write/truncate is happening.
+ */
+void block_extend_i_size(struct inode *inode, loff_t pos, loff_t len)
+{
+	int bsize = 1 << inode->i_blkbits;
+	loff_t rounded_i_size;
+	struct page *page;
+	pgoff_t index;
+	int start, end;
+
+	WARN_ON(!mutex_is_locked(&inode->i_mutex));
+
+	/*
+	 * Make sure page_mkwrite() is called on this page before
+	 * user is able to write any data beyond current i_size via
+	 * mmap.
+	 *
+	 * See clear_page_dirty_for_io() for details why set_page_dirty()
+	 * is needed.
+	 */
+	index = inode->i_size >> PAGE_CACHE_SHIFT;
+	page = find_lock_page(inode->i_mapping, index);
+	/* Page not cached? Nothing to do */
+	if (!page)
+		goto write_size;
+	/* Optimize for common case */
+	if (PAGE_CACHE_SIZE == bsize)
+		goto zero_and_write;
+	/* Will the write create no hole block under the last page? */
+	rounded_i_size = ALIGN(inode->i_size, bsize);
+	if (pos <= rounded_i_size || !(rounded_i_size & (PAGE_CACHE_SIZE - 1)))
+		goto zero_and_write;
+#ifdef CONFIG_MMU
+	if (page_mkclean(page))
+		set_page_dirty(page);
+#endif
+zero_and_write:
+	/*
+	 * Zero out the end of the page as it could have been modified via
+	 * mmap and now falls inside i_size. @start is the in-page offset of
+	 * the old EOF, @end the first byte past the extension, clamped to
+	 * the page end.
+	 */
+	if (pos > inode->i_size) {
+		start = inode->i_size & (PAGE_CACHE_SIZE - 1);
+		end = min_t(int, PAGE_CACHE_SIZE, pos - inode->i_size + start);
+		zero_user_segment(page, start, end);
+	}
+	i_size_write(inode, pos + len);
+	unlock_page(page);
+	page_cache_release(page);
+	return;
+write_size:
+	i_size_write(inode, pos + len);
+}
+EXPORT_SYMBOL(block_extend_i_size);
+
+/*
  * block_page_mkwrite() is not allowed to change the file size as it gets
  * called from a page fault handler when a page is first dirtied. Hence we must
  * be careful to check for EOF conditions here. We set the page up correctly
@@ -2621,7 +2685,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
 	page_cache_release(page);
 
 	if (pos+copied > inode->i_size) {
-		i_size_write(inode, pos+copied);
+		extend_i_size(inode, pos, copied);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 16ed028..56a0162 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -219,6 +219,10 @@ int cont_write_begin(struct file *, struct address_space *, loff_t,
 			get_block_t *, loff_t *);
 int generic_cont_expand_simple(struct inode *inode, loff_t size);
 int block_commit_write(struct page *page, unsigned from, unsigned to);
+int block_lock_hole_extend(struct inode *inode, loff_t pos);
+void block_unlock_hole_extend(struct inode *inode);
+int block_wait_on_hole_extend(struct inode *inode, loff_t pos);
+void block_extend_i_size(struct inode *inode, loff_t pos, loff_t len);
 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 				get_block_t get_block);
 void block_sync_page(struct page *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a36ffa5..9440666 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -580,7 +580,7 @@ struct address_space_operations {
 	int (*write_end)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned copied,
 				struct page *page, void *fsdata);
-
+	void (*extend_i_size)(struct inode *, loff_t pos, loff_t len);
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@@ -597,6 +597,8 @@ struct address_space_operations {
 					unsigned long);
 };
 
+void extend_i_size(struct inode *inode, loff_t pos, loff_t len);
+
 /*
  * pagecache_write_begin/pagecache_write_end must be used by general code
  * to write into the pagecache.
diff --git a/mm/filemap.c b/mm/filemap.c
index ccea3b6..bf5e527 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2102,6 +2102,14 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
+/*
+ * Grow i_size of @inode to @pos + @len via the filesystem's extend_i_size
+ * address_space operation when one is provided, falling back to
+ * block_extend_i_size() otherwise.
+ *
+ * NOTE(review): block_extend_i_size() WARNs unless i_mutex is held, so
+ * callers are presumably expected to hold it — confirm for all call sites.
+ */
+void extend_i_size(struct inode *inode, loff_t pos, loff_t len)
+{
+	if (inode->i_mapping->a_ops->extend_i_size)
+		inode->i_mapping->a_ops->extend_i_size(inode, pos, len);
+	else
+		block_extend_i_size(inode, pos, len);
+}
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -2162,7 +2170,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (written > 0) {
 		loff_t end = pos + written;
 		if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
-			i_size_write(inode,  end);
+			extend_i_size(inode, pos, written);
 			mark_inode_dirty(inode);
 		}
 		*ppos = end;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 427dfe3..3f7e15d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -321,6 +321,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 	long		status = 0;
 	size_t		bytes;
 	ssize_t		written = 0;
+	loff_t		orig_pos = pos;
 
 	BUG_ON(!mapping->a_ops->get_xip_mem);
 
@@ -378,7 +379,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
 	 * cannot change under us because we hold i_mutex.
 	 */
 	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
+		extend_i_size(inode, orig_pos, written);
 		mark_inode_dirty(inode);
 	}
 
diff --git a/mm/memory.c b/mm/memory.c
index aede2ce..e034616 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2429,7 +2429,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
 			goto out_sig;
 		if (offset > inode->i_sb->s_maxbytes)
 			goto out_big;
-		i_size_write(inode, offset);
+		extend_i_size(inode, offset, 0);
 	} else {
 		struct address_space *mapping = inode->i_mapping;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 53cab10..93c1c03 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -111,7 +111,7 @@ do_expand:
 		goto out_sig;
 	if (offset > inode->i_sb->s_maxbytes)
 		goto out;
-	i_size_write(inode, offset);
+	extend_i_size(inode, offset, 0);
 
 out_truncate:
 	if (inode->i_op->truncate)
diff --git a/mm/shmem.c b/mm/shmem.c
index 52ac65c..01df19b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1632,7 +1632,7 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 	page_cache_release(page);
 
 	if (pos + copied > inode->i_size)
-		i_size_write(inode, pos + copied);
+		extend_i_size(inode, pos, copied);
 
 	return copied;
 }
-- 
1.6.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux