[PATCH 3 of 7] DIO: don't fall back to buffered writes

Chris Mason <chris.mason@xxxxxxxxxx> · Wed, 01 Nov 2006 11:08:05 -0400

Placeholder pages allow DIO to use locking rules similar to those of
writepage.  DIO can now fill holes, and it can extend the file via
expanding truncates.  The expanding truncate creates a hole that is
then filled during the DIO write.

i_mutex can be dropped during synchronous (non-AIO) writes once DIO has
decided whether the file needs to be extended; it is retaken when the
DIO completes.

The expanding truncate may cause some pagecache pages to be dirtied.
The call to find_or_insert_placeholders is changed to wait on dirty
or writeback pages in the case of writes as well as reads.

diff -r 4486b1f7011a -r 3fa8c25ec60f fs/direct-io.c

--- a/fs/direct-io.c	Wed Nov 01 10:18:44 2006 -0500
+++ b/fs/direct-io.c	Wed Nov 01 10:22:34 2006 -0500
@@ -69,6 +69,7 @@ struct dio {
 	int rw;
 	loff_t i_size;			/* i_size when submitted */
 	int lock_type;			/* doesn't change */
+	int reacquire_i_mutex;		/* should we get i_mutex when done? */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -221,8 +222,7 @@ static int lock_page_range(struct dio *d
 		return 0;
 	return find_or_insert_placeholders(mapping, dio->tmppages, start, end,
 	                                  ARRAY_SIZE(dio->tmppages),
-					  GFP_KERNEL, fake,
-					  dio->rw == READ);
+					  GFP_KERNEL, fake, 1);
 }
 
 
@@ -258,6 +258,8 @@ static void dio_complete(struct dio *dio
 	unlock_page_range(dio, dio->fspages_start_off,
 			  dio->fspages_end_off - dio->fspages_start_off);
 	dio->fspages_end_off = dio->fspages_start_off;
+	if (dio->reacquire_i_mutex)
+		mutex_lock(&dio->inode->i_mutex);
 }
 
 /*
@@ -574,13 +576,8 @@ static int get_more_blocks(struct dio *d
 		map_bh->b_size = fs_count << dio->inode->i_blkbits;
 
 		create = dio->rw & WRITE;
-		if (dio->lock_type == DIO_LOCKING) {
-			if (dio->block_in_file < (i_size_read(dio->inode) >>
-							dio->blkbits))
-				create = 0;
-		} else if (dio->lock_type == DIO_NO_LOCKING) {
+		if (dio->lock_type == DIO_NO_LOCKING)
 			create = 0;
-		}
 	        index = fs_startblk >> (PAGE_CACHE_SHIFT -
 		                        dio->inode->i_blkbits);
 		end = (dio->final_block_in_request >> dio->blkfactor) >>
@@ -1291,6 +1288,33 @@ __blockdev_direct_IO(int rw, struct kioc
 	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
 
+	/*
+	 * extend the file if needed, and drop i_mutex for non-aio writes.
+	 * We extend the file by creating a hole that is later filled in
+	 * during the O_DIRECT.  Because pages are locked or placeholders
+	 * are inserted, the locking rules end up being the same as
+	 * mmap'd writes using writepage to fill holes
+	 */
+	dio->reacquire_i_mutex = 0;
+	if ((rw & WRITE) && dio_lock_type == DIO_LOCKING) {
+		/* if our write goes past i_size, do an expanding
+		 * truncate to fill it before dropping i_mutex
+		 */
+		if (end > i_size_read(inode) && iocb->ki_filp) {
+			struct iattr newattrs;
+			newattrs.ia_size = end;
+			newattrs.ia_file = iocb->ki_filp;
+			newattrs.ia_valid = ATTR_SIZE | ATTR_FILE;
+			retval = notify_change(iocb->ki_filp->f_dentry,
+					       &newattrs);
+			if (retval)
+				goto out;
+		}
+		if (is_sync_kiocb(iocb)) {
+			dio->reacquire_i_mutex = 1;
+			mutex_unlock(&inode->i_mutex);
+		}
+	}
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
 out:
diff -r 4486b1f7011a -r 3fa8c25ec60f mm/filemap.c
--- a/mm/filemap.c	Wed Nov 01 10:18:44 2006 -0500
+++ b/mm/filemap.c	Wed Nov 01 10:22:34 2006 -0500
@@ -2758,10 +2758,19 @@ generic_file_direct_IO(int rw, struct ki
 	retval = mapping->a_ops->direct_IO(rw, iocb, iov,
 					offset, nr_segs);
 	if (rw == WRITE && mapping->nrpages) {
+		int err;
 		pgoff_t end = (offset + write_len - 1)
 					>> PAGE_CACHE_SHIFT;
-		int err = invalidate_inode_pages2_range(mapping,
-				offset >> PAGE_CACHE_SHIFT, end);
+
+		/* O_DIRECT is allowed to drop i_mutex, so more data
+		 * could have been dirtied by others.  Start io one more
+		 * time
+		 */
+		err = filemap_fdatawrite_range(mapping, offset,
+		                               offset + write_len - 1);
+		if (!err)
+			err = invalidate_inode_pages2_range(mapping,
+					offset >> PAGE_CACHE_SHIFT, end);
 		if (err)
 			retval = err;
 	}


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html