[PATCH 2 of 8] Change O_DIRECT to use placeholders instead of i_mutex/i_alloc_sem locking

Chris Mason <chris.mason@xxxxxxxxxx> · Tue, 06 Feb 2007 20:32:47 -0400

All mutex and semaphore usage is removed from the blockdev_direct_IO paths.
Filesystems can either do this locking on their own, or ask for placeholder
pages.

Signed-off-by: Chris Mason <chris.mason@xxxxxxxxxx>

diff -r 7819e6e3f674 -r 5cd028318654 fs/direct-io.c

--- a/fs/direct-io.c	Tue Feb 06 19:45:28 2007 -0500
+++ b/fs/direct-io.c	Tue Feb 06 20:02:49 2007 -0500
@@ -36,6 +36,7 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <asm/atomic.h>
+#include <linux/writeback.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -95,6 +96,22 @@ struct dio {
 	struct buffer_head map_bh;	/* last get_block() result */
 
 	/*
+	 * kernel page pinning (placeholders)
+	 */
+	unsigned long fspages_start_off; /* page index where pinning starts */
+
+	/*
+	 * end_off is the first page index past the end of the pinned range.
+	 * If no pages or placeholders are pinned down, start_off == end_off
+	 */
+	unsigned long fspages_end_off;
+
+	/*
+	 * how big a radix extent (in pages) we are allowed to insert at once
+	 */
+	unsigned long fspages_span;
+
+	/*
 	 * Deferred addition of a page to the dio.  These variables are
 	 * private to dio_send_cur_page(), submit_page_section() and
 	 * dio_bio_add_page().
@@ -187,7 +204,50 @@ static int dio_refill_pages(struct dio *
 		ret = 0;
 	}
 out:
-	return ret;	
+	return ret;
+}
+
+static void dio_unlock_page_range(struct dio *dio)
+{
+	if (dio->lock_type != DIO_NO_LOCKING) { /* else nothing was pinned */
+		remove_placeholder_pages(dio->inode->i_mapping,
+					 dio->fspages_start_off,
+					 dio->fspages_end_off);
+		dio->fspages_end_off = dio->fspages_start_off; /* now empty */
+	}
+}
+
+static int dio_lock_page_range(struct dio *dio, struct buffer_head *map_bh,
+				unsigned long index, unsigned long end)
+{
+	struct address_space *mapping = dio->inode->i_mapping;
+	unsigned long max_size;
+	int ret = 0;
+
+	if (dio->lock_type == DIO_NO_LOCKING)
+		return 0; /* no placeholders in this mode */
+
+	while (index >= dio->fspages_end_off) { /* index not pinned yet */
+		unsigned long nr = end - dio->fspages_end_off + 1;
+		nr = min(nr, dio->fspages_span); /* per-call cap */
+		ret = find_or_insert_placeholders(mapping,
+						  dio->fspages_end_off,
+						  dio->fspages_end_off + nr,
+						  GFP_KERNEL, 1);
+		if (ret)
+			break;
+		dio->fspages_end_off += nr; /* extend pinned range */
+	}
+	/*
+	 * If we allow the FS to allocate more than we've placeholdered,
+	 * a concurrent readahead operation will find metadata where the
+	 * corresponding data has never been written.  Trim the amount of
+	 * data we ask the FS to map to the range pinned so far.
+	 */
+	max_size = (dio->fspages_end_off - index) << PAGE_CACHE_SHIFT;
+	if (map_bh->b_size > max_size)
+		map_bh->b_size = max_size;
+	return ret;
+}
 
 /*
@@ -246,9 +306,7 @@ static int dio_complete(struct dio *dio,
 	if (dio->end_io && dio->result)
 		dio->end_io(dio->iocb, offset, transferred,
 			    dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
+	dio_unlock_page_range(dio);
 
 	if (ret == 0)
 		ret = dio->page_errors;
@@ -513,6 +571,8 @@ static int get_more_blocks(struct dio *d
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	unsigned long index;
+	unsigned long end;
 	int create;
 
 	/*
@@ -540,7 +600,14 @@ static int get_more_blocks(struct dio *d
 		} else if (dio->lock_type == DIO_NO_LOCKING) {
 			create = 0;
 		}
-
+	        index = fs_startblk >> (PAGE_CACHE_SHIFT -
+		                        dio->inode->i_blkbits);
+		end = (dio->final_block_in_request >> dio->blkfactor) >>
+		      (PAGE_CACHE_SHIFT - dio->inode->i_blkbits);
+		BUG_ON(index > end);
+		ret = dio_lock_page_range(dio, map_bh, index, end);
+		if (ret)
+			goto error;
 		/*
 		 * For writes inside i_size we forbid block creations: only
 		 * overwrites are permitted.  We fall back to buffered writes
@@ -550,6 +617,7 @@ static int get_more_blocks(struct dio *d
 		ret = (*dio->get_block)(dio->inode, fs_startblk,
 						map_bh, create);
 	}
+error:
 	return ret;
 }
 
@@ -946,9 +1014,6 @@ out:
 	return ret;
 }
 
-/*
- * Releases both i_mutex and i_alloc_sem
- */
 static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
@@ -992,6 +1057,24 @@ direct_io_worker(int rw, struct kiocb *i
 	dio->bio_list = NULL;
 	dio->waiter = NULL;
 
+	if (dio->lock_type != DIO_NO_LOCKING) {
+		dio->fspages_start_off = offset >> PAGE_CACHE_SHIFT;
+		dio->fspages_end_off = dio->fspages_start_off;
+
+		/* If the file has mmap'd regions, the caller may be using an
+		 * mmap'd portion of the file as the buffer for this io.  That
+		 * will deadlock with placeholders because the placeholder code
+		 * forces the page fault handler to block.  The (ugly) solution
+		 * is to limit the span of inserted placeholders to the same
+		 * increment we use for get_user_pages.
+		 */
+		if (inode->i_mapping->nrpages ||
+		    mapping_mapped(inode->i_mapping))
+			dio->fspages_span = DIO_PAGES;
+		else
+			dio->fspages_span = ULONG_MAX;
+	}
+
 	/*
 	 * In case of non-aligned buffers, we may need 2 more
 	 * pages since we need to zero out first and last block.
@@ -1074,14 +1157,6 @@ direct_io_worker(int rw, struct kiocb *i
 	dio_cleanup(dio);
 
 	/*
-	 * All block lookups have been performed. For READ requests
-	 * we can let i_mutex go now that its achieved its purpose
-	 * of protecting us from looking up uninitialized blocks.
-	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
-		mutex_unlock(&dio->inode->i_mutex);
-
-	/*
 	 * The only time we want to leave bios in flight is when a successful
 	 * partial aio read or full aio write have been setup.  In that case
 	 * bio completion will call aio_complete.  The only time it's safe to
@@ -1130,8 +1205,6 @@ direct_io_worker(int rw, struct kiocb *i
  * DIO_LOCKING (simple locking for regular files)
  * For writes we are called under i_mutex and return with i_mutex held, even
  * though it is internally dropped.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
- * returning.
  *
  * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
  *	uninitialised data, allowing parallel direct readers and writers)
@@ -1156,8 +1229,7 @@ __blockdev_direct_IO(int rw, struct kioc
 	ssize_t retval = -EINVAL;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
 
 	if (rw & WRITE)
 		rw = WRITE_SYNC;
@@ -1186,49 +1258,26 @@ __blockdev_direct_IO(int rw, struct kioc
 				goto out;
 		}
 	}
-
 	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
 	retval = -ENOMEM;
 	if (!dio)
 		goto out;
 
+
 	/*
 	 * For block device access DIO_NO_LOCKING is used,
 	 *	neither readers nor writers do any locking at all
 	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
+	 *	No locks are taken
 	 * For regular files using DIO_OWN_LOCKING,
 	 *	neither readers nor writers take any locks here
 	 */
 	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
-		/* watch out for a 0 len io from a tricksy fs */
-		if (rw == READ && end > offset) {
-			struct address_space *mapping;
-
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				kfree(dio);
-				goto out;
-			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
-		}
-
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+
+	if (dio->lock_type == DIO_NO_LOCKING && end > offset) {
+		retval = filemap_write_and_wait_range(mapping, offset, end - 1);
+		if (retval)
+			goto out;
 	}
 
 	/*
@@ -1242,15 +1291,7 @@ __blockdev_direct_IO(int rw, struct kioc
 
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
-
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
-
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
 	return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);


-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html