[PATCH 5/8] dio: refactor __blockdev_direct_IO()

This patch moves code around so that __blockdev_direct_IO() becomes a function
which only iterates over its iovec argument while calling helper functions.
This will let us add functions in the future which call the same helper
functions while iterating over other ways of specifying memory for the IO.
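
As an aside, a future non-iovec entry point might look something like the
sketch below.  dio_prepare(), dio_lock_and_flush(), do_direct_IO() and
dio_post_submission() are real (the first, second and fourth are introduced
by this patch); the function itself and its page-array argument are invented
here purely to illustrate the intended reuse:

	/*
	 * Hypothetical sketch, not part of this patch: a caller that
	 * specifies memory as an array of pages instead of an iovec.
	 */
	static ssize_t blockdev_direct_IO_pages(int rw, struct kiocb *iocb,
			struct inode *inode, struct page **pages,
			int nr_pages, loff_t offset, loff_t end,
			get_block_t get_block, dio_iodone_t end_io)
	{
		unsigned blkbits = inode->i_blkbits;
		struct dio *dio;
		ssize_t ret;
		int i;

		dio = dio_prepare(rw, iocb, inode, blkbits, offset, end,
				  get_block, end_io, DIO_NO_LOCKING);
		if (!dio)
			return -ENOMEM;

		ret = dio_lock_and_flush(rw, inode, DIO_NO_LOCKING,
					 offset, end);
		if (ret) {
			kfree(dio);
			return ret;
		}

		for (i = 0; i < nr_pages; i++) {
			/*
			 * Fill in the dio's page state from pages[i]
			 * and call do_direct_IO(), much as the iovec
			 * loop in __blockdev_direct_IO() does for each
			 * segment.
			 */
		}

		return dio_post_submission(dio, offset, end, ret);
	}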

direct_io_worker() was only called once, from __blockdev_direct_IO(), but it
took the iovec argument.  Instead of trying to pass down other ways of
specifying memory, we move its body up into __blockdev_direct_IO() and then
pull the initialization and teardown steps of the resulting huge function out
into helpers.  What's left is a small function that mostly concerns itself
with calling helpers on the iovec.
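
With the helpers in place, the new __blockdev_direct_IO() boils down to
roughly this outline (alignment checks and error handling elided; see the
diff below for the real thing):

	dio = dio_prepare(rw, iocb, inode, blkbits, offset, end,
			  get_block, end_io, dio_lock_type);
	dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
	for (seg = 0; seg < nr_segs; seg++) {
		/* point the dio's page and block state at iov[seg] */
		ret = do_direct_IO(dio);
	}
	ret = dio_post_submission(dio, offset, end, ret);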

The dio_unaligned() helper is a little strange, but it reflects the flow of
the original code.  If we were to clean it up, we'd run the risk of
introducing bugs.
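
One subtlety in the new per-segment check: it passes user_addr|bytes as
dio_unaligned()'s val argument.  ORing the two values together works because
a low bit set in either the address or the length survives the OR and trips
the mask test.  A worked example, assuming 512-byte logical blocks:

	unsigned blkbits = 9;			/* 512-byte blocks */
	unsigned mask = (1 << blkbits) - 1;	/* 0x1ff */
	unsigned long user_addr = 0x1000;	/* aligned, low 9 bits clear */
	size_t bytes = 0x300;			/* 768 bytes, bit 8 set */

	/* (user_addr | bytes) & mask == 0x100: non-zero, so unaligned */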

Comments have not been updated to reflect the code change yet.

Signed-off-by: Zach Brown <zach.brown@xxxxxxxxxx>
---
 fs/direct-io.c |  313 +++++++++++++++++++++++++++++--------------------------
 1 files changed, 165 insertions(+), 148 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 8b10b87..0a2ba8e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -925,21 +925,19 @@ out:
 	return ret;
 }
 
-/*
- * Releases both i_mutex and i_alloc_sem
- */
-static ssize_t
-direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
-	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-	struct dio *dio)
+static struct dio *dio_prepare(int rw, struct kiocb *iocb, struct inode *inode,
+			       unsigned blkbits, loff_t offset, loff_t end,
+			       get_block_t get_block, dio_iodone_t end_io,
+			       int dio_lock_type)
 {
-	unsigned long user_addr; 
-	unsigned long flags;
-	int seg;
-	ssize_t ret = 0;
-	ssize_t ret2;
-	size_t bytes;
+	struct dio *dio;
+
+	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
+	if (!dio)
+		return NULL;
+
+	if (rw & WRITE)
+		rw = WRITE_ODIRECT;
 
 	dio->inode = inode;
 	dio->rw = rw;
@@ -947,6 +945,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->blkfactor = inode->i_blkbits - blkbits;
 	dio->block_in_file = offset >> blkbits;
 
+	/*
+	 * In case of non-aligned buffers, we may need 2 more
+	 * pages since we need to zero out first and last block.
+	 */
+	if (unlikely(dio->blkfactor))
+		dio->pages_in_io = 2;
+
 	dio->get_block = get_block;
 	dio->end_io = end_io;
 	dio->final_block_in_bio = -1;
@@ -958,54 +963,88 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	spin_lock_init(&dio->bio_lock);
 	dio->refcount = 1;
 
+	dio->lock_type = dio_lock_type;
+
 	/*
-	 * In case of non-aligned buffers, we may need 2 more
-	 * pages since we need to zero out first and last block.
+	 * For file extending writes updating i_size before data
+	 * writeouts complete can expose uninitialized blocks. So
+	 * even for AIO, we need to wait for i/o to complete before
+	 * returning in this case.
 	 */
-	if (unlikely(dio->blkfactor))
-		dio->pages_in_io = 2;
+	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
+		(end > i_size_read(inode)));
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->pages_in_io +=
-			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
-				- user_addr/PAGE_SIZE);
+	return dio;
+}
+
+/*
+ * dio will work in inode blocksize until the time that it sees IO that isn't
+ * aligned to the inode blocksize.  From that point on all IO works in
+ * the block device's block size.
+ */
+static int dio_unaligned(unsigned long val, unsigned *blkbits,
+			 struct block_device *bdev)
+{
+	unsigned mask = (1 << *blkbits) - 1;
+
+	if (val & mask) {
+		if (bdev)
+			*blkbits = blksize_bits(bdev_logical_block_size(bdev));
+		mask = (1 << *blkbits) - 1;
+		return !!(val & mask);
 	}
 
-	for (seg = 0; seg < nr_segs; seg++) {
-		user_addr = (unsigned long)iov[seg].iov_base;
-		dio->size += bytes = iov[seg].iov_len;
+	return 0;
+}
 
-		/* Index into the first page of the first block */
-		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
-		dio->final_block_in_request = dio->block_in_file +
-						(bytes >> blkbits);
-		/* Page fetching state */
-		dio->head = 0;
-		dio->tail = 0;
-		dio->curr_page = 0;
+/*
+ * For block device access DIO_NO_LOCKING is used,
+ *	neither readers nor writers do any locking at all
+ * For regular files using DIO_LOCKING,
+ *	readers need to grab i_mutex and i_alloc_sem
+ *	writers need to grab i_alloc_sem only (i_mutex is already held)
+ * For regular files using DIO_OWN_LOCKING,
+ *	neither readers nor writers take any locks here
+ */
+static int dio_lock_and_flush(int rw, struct inode *inode, int dio_lock_type,
+		              loff_t offset, loff_t end)
+{
+	int ret;
 
-		dio->total_pages = 0;
-		if (user_addr & (PAGE_SIZE-1)) {
-			dio->total_pages++;
-			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
-		}
-		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
-		dio->curr_user_address = user_addr;
-	
-		ret = do_direct_IO(dio);
+	if (dio_lock_type == DIO_NO_LOCKING)
+		return 0;
 
-		dio->result += iov[seg].iov_len -
-			((dio->final_block_in_request - dio->block_in_file) <<
-					blkbits);
+	/* watch out for a 0 len io from a tricksy fs */
+	if (rw == READ && end > offset) {
+		if (dio_lock_type != DIO_OWN_LOCKING)
+			mutex_lock(&inode->i_mutex);
 
+		ret = filemap_write_and_wait_range(inode->i_mapping, offset,
+						   end - 1);
 		if (ret) {
-			dio_cleanup(dio);
-			break;
+			if (dio_lock_type != DIO_OWN_LOCKING)
+				mutex_unlock(&inode->i_mutex);
+			return ret;
 		}
-	} /* end iovec loop */
 
-	if (ret == -ENOTBLK && (rw & WRITE)) {
+		if (dio_lock_type == DIO_OWN_LOCKING)
+			mutex_unlock(&inode->i_mutex);
+	}
+
+	if (dio_lock_type == DIO_LOCKING)
+		/* lockdep: not the owner will release it */
+		down_read_non_owner(&inode->i_alloc_sem);
+
+	return 0;
+}
+
+static ssize_t dio_post_submission(struct dio *dio, loff_t offset, loff_t end,
+				   int ret)
+{
+	unsigned long flags;
+	int ret2;
+
+	if (ret == -ENOTBLK && (dio->rw & WRITE)) {
 		/*
 		 * The remaining part of the request will be
 		 * be handled by buffered I/O when we return
@@ -1029,7 +1068,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		dio_bio_submit(dio);
 
 	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
+	blk_run_address_space(dio->inode->i_mapping);
 
 	/*
 	 * It is possible that, we return short IO due to end of file.
@@ -1042,7 +1081,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 * we can let i_mutex go now that its achieved its purpose
 	 * of protecting us from looking up uninitialized blocks.
 	 */
-	if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+	if ((dio->rw == READ) && (dio->lock_type == DIO_LOCKING))
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
@@ -1054,7 +1093,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (dio->is_async && ret == 0 && dio->result &&
-	    ((rw & READ) || (dio->result == dio->size)))
+	    ((dio->rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
 	if (ret != -EIOCBQUEUED)
@@ -1081,6 +1120,22 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	} else
 		BUG_ON(ret != -EIOCBQUEUED);
 
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
+	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+	 * their own means.
+	 */
+	if (unlikely(ret < 0 && (dio->rw & WRITE))) {
+		loff_t isize = i_size_read(dio->inode);
+
+		if (end > isize && dio->lock_type == DIO_LOCKING)
+			vmtruncate(dio->inode, isize);
+	}
+
+	if (dio->lock_type == DIO_OWN_LOCKING && dio->rw == READ)
+		mutex_lock(&dio->inode->i_mutex);
+
 	return ret;
 }
 
@@ -1112,122 +1167,84 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int dio_lock_type)
 {
 	int seg;
-	size_t size;
-	unsigned long addr;
+	size_t bytes;
+	unsigned long user_addr;
 	unsigned blkbits = inode->i_blkbits;
-	unsigned bdev_blkbits = 0;
-	unsigned blocksize_mask = (1 << blkbits) - 1;
-	ssize_t retval = -EINVAL;
+	ssize_t ret;
 	loff_t end = offset;
 	struct dio *dio;
-	int release_i_mutex = 0;
-	int acquire_i_mutex = 0;
-
-	if (rw & WRITE)
-		rw = WRITE_ODIRECT;
-
-	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
 
-	if (offset & blocksize_mask) {
-		if (bdev)
-			 blkbits = bdev_blkbits;
-		blocksize_mask = (1 << blkbits) - 1;
-		if (offset & blocksize_mask)
-			goto out;
+	if (dio_unaligned(offset, &blkbits, bdev)) {
+		ret = -EINVAL;
+		goto out;
 	}
 
 	/* Check the memory alignment.  Blocks cannot straddle pages */
 	for (seg = 0; seg < nr_segs; seg++) {
-		addr = (unsigned long)iov[seg].iov_base;
-		size = iov[seg].iov_len;
-		end += size;
-		if ((addr & blocksize_mask) || (size & blocksize_mask))  {
-			if (bdev)
-				 blkbits = bdev_blkbits;
-			blocksize_mask = (1 << blkbits) - 1;
-			if ((addr & blocksize_mask) || (size & blocksize_mask))  
-				goto out;
+		user_addr = (unsigned long)iov[seg].iov_base;
+		bytes = iov[seg].iov_len;
+		end += bytes;
+		if (dio_unaligned(user_addr|bytes, &blkbits, bdev)) {
+			ret = -EINVAL;
+			goto out;
 		}
 	}
 
-	dio = kzalloc(sizeof(*dio), GFP_KERNEL);
-	retval = -ENOMEM;
-	if (!dio)
+	dio = dio_prepare(rw, iocb, inode, blkbits, offset, end, get_block,
+			  end_io, dio_lock_type);
+	if (!dio) {
+		ret = -ENOMEM;
 		goto out;
+	}
 
-	/*
-	 * For block device access DIO_NO_LOCKING is used,
-	 *	neither readers nor writers do any locking at all
-	 * For regular files using DIO_LOCKING,
-	 *	readers need to grab i_mutex and i_alloc_sem
-	 *	writers need to grab i_alloc_sem only (i_mutex is already held)
-	 * For regular files using DIO_OWN_LOCKING,
-	 *	neither readers nor writers take any locks here
-	 */
-	dio->lock_type = dio_lock_type;
-	if (dio_lock_type != DIO_NO_LOCKING) {
-		/* watch out for a 0 len io from a tricksy fs */
-		if (rw == READ && end > offset) {
-			struct address_space *mapping;
-
-			mapping = iocb->ki_filp->f_mapping;
-			if (dio_lock_type != DIO_OWN_LOCKING) {
-				mutex_lock(&inode->i_mutex);
-				release_i_mutex = 1;
-			}
-
-			retval = filemap_write_and_wait_range(mapping, offset,
-							      end - 1);
-			if (retval) {
-				kfree(dio);
-				goto out;
-			}
-
-			if (dio_lock_type == DIO_OWN_LOCKING) {
-				mutex_unlock(&inode->i_mutex);
-				acquire_i_mutex = 1;
-			}
-		}
+	ret = dio_lock_and_flush(rw, inode, dio_lock_type, offset, end);
+	if (ret) {
+		kfree(dio);
+		goto out;
+	}
 
-		if (dio_lock_type == DIO_LOCKING)
-			/* lockdep: not the owner will release it */
-			down_read_non_owner(&inode->i_alloc_sem);
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		dio->pages_in_io +=
+			((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE
+				- user_addr/PAGE_SIZE);
 	}
 
-	/*
-	 * For file extending writes updating i_size before data
-	 * writeouts complete can expose uninitialized blocks. So
-	 * even for AIO, we need to wait for i/o to complete before
-	 * returning in this case.
-	 */
-	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
-		(end > i_size_read(inode)));
+	for (seg = 0; seg < nr_segs; seg++) {
+		user_addr = (unsigned long)iov[seg].iov_base;
+		dio->size += bytes = iov[seg].iov_len;
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset,
-				nr_segs, blkbits, get_block, end_io, dio);
+		/* Index into the first page of the first block */
+		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
+		dio->final_block_in_request = dio->block_in_file +
+						(bytes >> blkbits);
+		/* Page fetching state */
+		dio->head = 0;
+		dio->tail = 0;
+		dio->curr_page = 0;
 
-	/*
-	 * In case of error extending write may have instantiated a few
-	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
-	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
-	 * it's own meaner.
-	 */
-	if (unlikely(retval < 0 && (rw & WRITE))) {
-		loff_t isize = i_size_read(inode);
+		dio->total_pages = 0;
+		if (user_addr & (PAGE_SIZE-1)) {
+			dio->total_pages++;
+			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
+		}
+		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+		dio->curr_user_address = user_addr;
+	
+		ret = do_direct_IO(dio);
 
-		if (end > isize && dio_lock_type == DIO_LOCKING)
-			vmtruncate(inode, isize);
-	}
+		dio->result += iov[seg].iov_len -
+			((dio->final_block_in_request - dio->block_in_file) <<
+					blkbits);
 
-	if (rw == READ && dio_lock_type == DIO_LOCKING)
-		release_i_mutex = 0;
+		if (ret) {
+			dio_cleanup(dio);
+			break;
+		}
+	}
 
+	ret = dio_post_submission(dio, offset, end, ret);
 out:
-	if (release_i_mutex)
-		mutex_unlock(&inode->i_mutex);
-	else if (acquire_i_mutex)
-		mutex_lock(&inode->i_mutex);
-	return retval;
+	return ret;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
-- 
1.6.2.5
