Re: [PATCH] ceph: re-org copy_file_range and fix error handling paths

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 2020-02-17 at 12:36 +0000, Luis Henriques wrote:
> This patch re-organizes copy_file_range, trying to fix a few issues in
> error handling.  Here's the summary:
> 
> - Abort copy if initial do_splice_direct() returns fewer bytes than
>   requested.
> 
> - Move the 'size' initialization (with i_size_read()) further down in the
>   code, after the initial call to do_splice_direct().  This avoids issues
>   with a possibly stale value if a manual copy is done.
> 
> - Move the object copy loop into a separate function.  This makes it
>   easier to handle errors (e.g, dirtying caps and updating the MDS
>   metadata if only some objects have been copied before an error has
>   occurred).
> 
> - Added calls to ceph_oloc_destroy() to avoid leaking memory with src_oloc
>   and dst_oloc
> 
> - After the object copy loop, the new file size to be reported to the MDS
>   (if there's file size change) is now the actual file size, and not the
>   size after an eventual extra manual copy.
> 
> - Added a few dout() to show the number of bytes copied in the two manual
>   copies and in the object copy loop.
> 
> Signed-off-by: Luis Henriques <lhenriques@xxxxxxxx>
> ---
> Hi!
> 
> Initially I was going to have this patch split in a series, but then I
> decided not to do that as this big patch allows (IMO) to better see the
> whole picture.  But please let me know if you think otherwise and I can
> split it in a few smaller patches.
> 
> I tried to cover all the issues that have been pointed out by Ilya, but I
> may have missed something or, more likely, introduced new bugs ;-)
> 
> Cheers,
> --
> Luis
> 

Sorry for the delay in review!

>  fs/ceph/file.c | 169 ++++++++++++++++++++++++++++---------------------
>  1 file changed, 96 insertions(+), 73 deletions(-)
> 
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index c3b8e8e0bf17..4d90a275f9a5 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1931,6 +1931,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
>  	return 0;
>  }
>  
> +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
> +				    struct ceph_inode_info *dst_ci, u64 *dst_off,
> +				    struct ceph_fs_client *fsc,
> +				    size_t len, unsigned int flags)
> +{
> +	struct ceph_object_locator src_oloc, dst_oloc;
> +	struct ceph_object_id src_oid, dst_oid;
> +	size_t bytes = 0;
> +	u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
> +	u32 src_objlen, dst_objlen;
> +	u32 object_size = src_ci->i_layout.object_size;
> +	int ret;
> +
> +	src_oloc.pool = src_ci->i_layout.pool_id;
> +	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
> +	dst_oloc.pool = dst_ci->i_layout.pool_id;
> +	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
> +
> +	while (len >= object_size) {
> +		ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
> +					      object_size, &src_objnum,
> +					      &src_objoff, &src_objlen);
> +		ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
> +					      object_size, &dst_objnum,
> +					      &dst_objoff, &dst_objlen);
> +		ceph_oid_init(&src_oid);
> +		ceph_oid_printf(&src_oid, "%llx.%08llx",
> +				src_ci->i_vino.ino, src_objnum);
> +		ceph_oid_init(&dst_oid);
> +		ceph_oid_printf(&dst_oid, "%llx.%08llx",
> +				dst_ci->i_vino.ino, dst_objnum);
> +		/* Do an object remote copy */
> +		ret = ceph_osdc_copy_from(&fsc->client->osdc,
> +					  src_ci->i_vino.snap, 0,
> +					  &src_oid, &src_oloc,
> +					  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> +					  CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
> +					  &dst_oid, &dst_oloc,
> +					  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> +					  CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
> +					  dst_ci->i_truncate_seq,
> +					  dst_ci->i_truncate_size,
> +					  CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
> +		if (ret) {
> +			if (ret == -EOPNOTSUPP) {
> +				fsc->have_copy_from2 = false;
> +				pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
> +			}
> +			dout("ceph_osdc_copy_from returned %d\n", ret);
> +			if (!bytes)
> +				bytes = ret;
> +			goto out;
> +		}
> +		len -= object_size;
> +		bytes += object_size;
> +		*src_off += object_size;
> +		*dst_off += object_size;
> +	}
> +
> +out:
> +	ceph_oloc_destroy(&src_oloc);
> +	ceph_oloc_destroy(&dst_oloc);
> +	return bytes;
> +}
> +
>  static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  				      struct file *dst_file, loff_t dst_off,
>  				      size_t len, unsigned int flags)
> @@ -1941,14 +2006,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  	struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
>  	struct ceph_cap_flush *prealloc_cf;
>  	struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
> -	struct ceph_object_locator src_oloc, dst_oloc;
> -	struct ceph_object_id src_oid, dst_oid;
> -	loff_t endoff = 0, size;
> -	ssize_t ret = -EIO;
> +	loff_t size;
> +	ssize_t ret = -EIO, bytes;
>  	u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
> -	u32 src_objlen, dst_objlen, object_size;
> +	u32 src_objlen, dst_objlen;
>  	int src_got = 0, dst_got = 0, err, dirty;
> -	bool do_final_copy = false;
>  
>  	if (src_inode->i_sb != dst_inode->i_sb) {
>  		struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
> @@ -2026,22 +2088,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  	if (ret < 0)
>  		goto out_caps;
>  
> -	size = i_size_read(dst_inode);
> -	endoff = dst_off + len;
> -
>  	/* Drop dst file cached pages */
>  	ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
>  					    dst_off >> PAGE_SHIFT,
> -					    endoff >> PAGE_SHIFT);
> +					    (dst_off + len) >> PAGE_SHIFT);
>  	if (ret < 0) {
>  		dout("Failed to invalidate inode pages (%zd)\n", ret);
>  		ret = 0; /* XXX */
>  	}
> -	src_oloc.pool = src_ci->i_layout.pool_id;
> -	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
> -	dst_oloc.pool = dst_ci->i_layout.pool_id;
> -	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
> -
>  	ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
>  				      src_ci->i_layout.object_size,
>  				      &src_objnum, &src_objoff, &src_objlen);
> @@ -2060,6 +2114,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  	 * starting at the src_off
>  	 */
>  	if (src_objoff) {
> +		dout("Initial partial copy of %u bytes\n", src_objlen);
> +
>  		/*
>  		 * we need to temporarily drop all caps as we'll be calling
>  		 * {read,write}_iter, which will get caps again.
> @@ -2067,8 +2123,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  		put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
>  		ret = do_splice_direct(src_file, &src_off, dst_file,
>  				       &dst_off, src_objlen, flags);
> -		if (ret < 0) {
> -			dout("do_splice_direct returned %d\n", err);
> +		/* Abort on short copies or on error */
> +		if (ret < src_objlen) {
> +			dout("Failed partial copy (%zd)\n", ret);
>  			goto out;
>  		}
>  		len -= ret;
> @@ -2081,62 +2138,29 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  		if (err < 0)
>  			goto out_caps;
>  	}
> -	object_size = src_ci->i_layout.object_size;
> -	while (len >= object_size) {
> -		ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
> -					      object_size, &src_objnum,
> -					      &src_objoff, &src_objlen);
> -		ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
> -					      object_size, &dst_objnum,
> -					      &dst_objoff, &dst_objlen);
> -		ceph_oid_init(&src_oid);
> -		ceph_oid_printf(&src_oid, "%llx.%08llx",
> -				src_ci->i_vino.ino, src_objnum);
> -		ceph_oid_init(&dst_oid);
> -		ceph_oid_printf(&dst_oid, "%llx.%08llx",
> -				dst_ci->i_vino.ino, dst_objnum);
> -		/* Do an object remote copy */
> -		err = ceph_osdc_copy_from(
> -			&src_fsc->client->osdc,
> -			src_ci->i_vino.snap, 0,
> -			&src_oid, &src_oloc,
> -			CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> -			CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
> -			&dst_oid, &dst_oloc,
> -			CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
> -			CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
> -			dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
> -			CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
> -		if (err) {
> -			if (err == -EOPNOTSUPP) {
> -				src_fsc->have_copy_from2 = false;
> -				pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
> -			}
> -			dout("ceph_osdc_copy_from returned %d\n", err);
> -			if (!ret)
> -				ret = err;
> -			goto out_caps;
> -		}
> -		len -= object_size;
> -		src_off += object_size;
> -		dst_off += object_size;
> -		ret += object_size;
> -	}
>  
> -	if (len)
> -		/* We still need one final local copy */
> -		do_final_copy = true;
> +	size = i_size_read(dst_inode);
> +	bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
> +				     src_fsc, len, flags);
> +	if (bytes <= 0) {
> +		if (!ret)
> +			ret = bytes;
> +		goto out_caps;
> +	}

Suppose we did the front part with do_splice_direct (ret > 0), but then
ceph_do_objects_copy fails (bytes < 0). We "goto out_caps" but then...

[...]

> @@ -2152,15 +2176,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
>  out_caps:
>  	put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
>  
> -	if (do_final_copy) {
> -		err = do_splice_direct(src_file, &src_off, dst_file,
> -				       &dst_off, len, flags);
> -		if (err < 0) {
> -			dout("do_splice_direct returned %d\n", err);
> -			goto out;
> -		}
> -		len -= err;
> -		ret += err;
> +	if (len && (ret >= 0)) {

...len is still positive and we do_splice_direct again (probably at the
wrong offset?), instead of just returning a short copy. I think we
probably want to just stop any copying if it fails at any point along
the way, right?

> +		dout("Final partial copy of %zu bytes\n", len);
> +		bytes = do_splice_direct(src_file, &src_off, dst_file,
> +					 &dst_off, len, flags);
> +		if (bytes > 0)
> +			ret += bytes;
> +		else
> +			dout("Failed partial copy (%zd)\n", bytes);
>  	}
>  
>  out:

-- 
Jeff Layton <jlayton@xxxxxxxxxx>




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux