Luis Henriques <lhenriques@xxxxxxxx> writes: <snip> > +static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, > + struct file *dst_file, loff_t dst_off, > + size_t len, unsigned int flags) > +{ > + struct inode *src_inode = file_inode(src_file); > + struct inode *dst_inode = file_inode(dst_file); > + struct ceph_inode_info *src_ci = ceph_inode(src_inode); > + struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); > + struct ceph_cap_flush *prealloc_cf; > + struct ceph_object_locator src_oloc, dst_oloc; > + struct ceph_object_id src_oid, dst_oid; > + loff_t endoff = 0, size; > + ssize_t ret = -EIO; > + u64 src_objnum, dst_objnum, src_objoff, dst_objoff; > + u32 src_objlen, dst_objlen, object_size; > + int src_got = 0, dst_got = 0, err, dirty; > + bool do_final_copy = false; > + > + if (src_inode == dst_inode) > + return -EINVAL; > + if (ceph_snap(dst_inode) != CEPH_NOSNAP) > + return -EROFS; > + > + /* > + * Some of the checks below will return -EOPNOTSUPP, which will force a > + * fallback to the default VFS copy_file_range implementation. This is > + * desirable in several cases (for ex, the 'len' is smaller than the > + * size of the objects, or in cases where that would be more > + * efficient). > + */ > + > + if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || > + (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || > + (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) > + return -EOPNOTSUPP; > + > + if (len < src_ci->i_layout.object_size) > + return -EOPNOTSUPP; /* no remote copy will be done */ > + > + prealloc_cf = ceph_alloc_cap_flush(); > + if (!prealloc_cf) > + return -ENOMEM; > + > + /* Start by sync'ing the source file */ > + ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); > + if (ret < 0) > + goto out; > + > + /* > + * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other > + * clients may have dirty data in their caches. And OSDs know nothing > + * about caps, so they can't safely do the remote object copies. > + */ > + err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, > + dst_ci, (dst_off + len), &dst_got); > + if (err < 0) { > + dout("get_rd_wr_caps returned %d\n", err); > + ret = -EOPNOTSUPP; > + goto out; > + } > + > + ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); > + if (ret < 0) > + goto out_caps; > + > + size = i_size_read(dst_inode); > + endoff = dst_off + len; > + > + /* Drop dst file cached pages */ > + ret = invalidate_inode_pages2_range(dst_inode->i_mapping, > + dst_off >> PAGE_SHIFT, > + endoff >> PAGE_SHIFT); > + if (ret < 0) { > + dout("Failed to invalidate inode pages (%ld)\n", ret); > + ret = 0; /* XXX */ > + } I believe I found an issue with this code: if we try to copy into a file that was just written, the remote copied data will be overwritten by our buffered writes once they are flushed. When this happens we can see the above message in the kernel (with a -EBUSY error). Here's the 3 options I see to fixes this: 1. Add another call to file_write_and_wait_range(), with the dst_file. I believe this souldn't introduce a huge overhead for the cases where the file isn't buffered. 2. Add the call to file_write_and_wait_range() only if invalidate_inode_pages2_range fails. We'll need to drop/get caps for this. 3. All of the above. I'm not sure we need to do both, but I guess we'll need to at least fail the operation and return the error if invalidate_inode_pages2_range fails. Please let me know what do you think. And if you want me to send a patch to fix this or if you rather have me sending the whole thing again. Cheers, -- Luis