Re: [PATCH 4/5] ext4: introduce direct IO write code path using iomap infrastructure

Christoph Hellwig <hch@xxxxxxxxxxxxx> · Mon, 12 Aug 2019 10:34:03 -0700

> +	if (error) {
> +		if (offset + size > i_size_read(inode))
> +			ext4_truncate_failed_write(inode);
> +
> +		/*
> +		 * The inode may have been placed onto the orphan list
> +		 * as a result of an extension. However, an error may
> +		 * have been encountered prior to being able to
> +		 * complete the write operation. Perform any necessary
> +		 * clean up in this case.
> +		 */
> +		if (!list_empty(&EXT4_I(inode)->i_orphan)) {
> +			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
> +			if (IS_ERR(handle)) {
> +				if (inode->i_nlink)
> +					ext4_orphan_del(NULL, inode);
> +				return PTR_ERR(handle);
> +			}
> +
> +			if (inode->i_nlink)
> +				ext4_orphan_del(handle, inode);
> +			ext4_journal_stop(handle);
> +		}
> +		return error;

I'd split this branch into a separate function just to keep the
end_io handler tidy.

> +	if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
> +		inode_dio_wait(inode);
> +
> +	if (ret >= 0 && iov_iter_count(from)) {
> +		overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +		return ext4_buffered_write_iter(iocb, from);
> +	}
> +out:
> +	overwrite ? inode_unlock_shared(inode) : inode_unlock(inode);
> +	return ret;

the ? : expression here is weird.

I'd write this as:

	if (overwrite)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from))
		return ext4_buffered_write_iter(iocb, from);
	return ret;

and handle the only place we jump to the current out label manually,
as that always does an exclusive unlock anyway.

> +		if (IS_DAX(inode)) {
> +			ret = ext4_map_blocks(handle, inode, &map,
> +					      EXT4_GET_BLOCKS_CREATE_ZERO);
> +		} else {
> +			/*
> +			 * DAX and direct IO are the only two
> +			 * operations currently supported with
> +			 * IOMAP_WRITE.
> +			 */
> +			WARN_ON(!(flags & IOMAP_DIRECT));
> +			if (round_down(offset, i_blocksize(inode)) >=
> +			    i_size_read(inode)) {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_CREATE);
> +			} else if (!ext4_test_inode_flag(inode,
> +							 EXT4_INODE_EXTENTS)) {
> +				/*
> +				 * We cannot fill holes in indirect
> +				 * tree based inodes as that could
> +				 * expose stale data in the case of a
> +				 * crash. Use magic error code to
> +				 * fallback to buffered IO.
> +				 */
> +				ret = ext4_map_blocks(handle, inode, &map, 0);
> +				if (ret == 0)
> +					ret = -ENOTBLK;
> +			} else {
> +				ret = ext4_map_blocks(handle, inode, &map,
> +						      EXT4_GET_BLOCKS_IO_CREATE_EXT);
> +			}
> +		}

I think this could be simplified down to something like:

		int flags = 0;

		...

		/*
		 * DAX and direct IO are the only two operations currently
		 * supported with IOMAP_WRITE.
		 */
		WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));

		if (IS_DAX(inode))
			flags = EXT4_GET_BLOCKS_CREATE_ZERO;
		else if (round_down(offset, i_blocksize(inode)) >=
				i_size_read(inode)) {
			flags = EXT4_GET_BLOCKS_CREATE;
		else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
			flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;

		/*
		 * We cannot fill holes in indirect tree based inodes as that
		 * could expose stale data in the case of a crash.  Use the
		 * magic error code to fallback to buffered IO.
		 */
		if (!flags && !ret)
			ret = -ENOTBLK;

> @@ -3601,6 +3631,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>  static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
>  			  ssize_t written, unsigned flags, struct iomap *iomap)
>  {
> +	if (flags & IOMAP_DIRECT && written == 0)
> +		return -ENOTBLK;

This probably wants a comment, too.  But do we actually ever end up
here?