On Wed, Sep 12, 2018 at 1:47 AM Gregory Farnum <gfarnum@xxxxxxxxxx> wrote: > > On Sun, Sep 9, 2018 at 7:06 PM, Yan, Zheng <zyan@xxxxxxxxxx> wrote: > > > > > >> On Sep 8, 2018, at 00:03, Luis Henriques <lhenriques@xxxxxxxx> wrote: > >> > >> "Yan, Zheng" <ukernel@xxxxxxxxx> writes: > >> > >>> On Fri, Sep 7, 2018 at 12:08 AM Luis Henriques <lhenriques@xxxxxxxx> wrote: > >> <...> > >> > >>>> + if (objoff || (copy_len < src_ci->i_layout.object_size)) { > >>>> + /* Do not copy beyond this object */ > >>>> + if (copy_len > objlen) > >>>> + copy_len = objlen; > >>>> + err = do_splice_direct(src_file, &src_off, dst_file, > >>>> + &dst_off, copy_len, flags); > >>> > >>> We should release caps refs before calling do_splice_direct(). > >>> do_splice_direct calls read_iter and write_iter, which will get caps > >>> refs again. > >> > >> That's... annoying. I could release caps, call do_slice_direct and get > >> the caps again but this would introduce a race window. > >> > >> I could also re-write this loop so that it would: > >> > >> - do all the required do_splice_direct calls > >> - get the caps > >> - loop doing all the remote object copies > >> - release caps > >> > >> But the whole operation would still not be atomic. Do you have any > >> suggestion on how to make it atomic? > > > > The default implementation (using do_splice_direct) is not atomic. I think we just need to ensure write on single object is atomic. > > I didn't see anything specific about atomicity in the copy_file_range > man page, but the idea that it wouldn't be atomic definitely seems > weird to me. Are we sure this is okay? Is the kernel's default > implementation doing something non-atomic that would be atomic if we > were a local FS? > -Greg > The default implementation just calls filesystem's read/write callback. It's not atomic when copy size is larger than kernel internal buffer size. Regards Yan, Zheng > > > > Regards > > Yan, Zheng > > > >> > >> (Regarding all the other comments, I agree with them and I'll fix them > >> in a later version. Thanks a lot for the review!) > >> > >> Cheers, > >> -- > >> Luis > >> > >>> Besides, do_splice_direct() may copy less data than copy_len. I > >>> missed this in previous review, Sorry. > >>> > >>> > >>>> + if (err < 0) { > >>>> + ret = err; > >>> > >>> If we have copied same data, then error happened, we should return > >>> size of copied data. > >>> > >>>> + goto out_caps; > >>>> + } > >>>> + len -= copy_len; > >>>> + ret += copy_len; > >>>> + continue; > >>>> + } > >>>> + > >>>> + ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, > >>>> + copy_len, &objnum, &objoff, > >>>> + &objlen); > >>>> + ceph_oid_init(&dst_oid); > >>>> + ceph_oid_printf(&dst_oid, "%llx.%08llx", > >>>> + dst_ci->i_vino.ino, objnum); > >>>> + /* Again... do a manual copy if: > >>>> + * - destination file offset isn't object aligned, or > >>>> + * - copy length is smaller than object size > >>>> + * (although the object size should be the same for different > >>>> + * files in the same filesystem...) > >>>> + */ > >>>> + if (objoff || (copy_len < dst_ci->i_layout.object_size)) { > >>>> + if (copy_len > objlen) > >>>> + copy_len = objlen; > >>>> + err = do_splice_direct(src_file, &src_off, dst_file, > >>>> + &dst_off, copy_len, flags); > >>>> + if (err < 0) { > >>>> + ret = err; > >>>> + goto out_caps; > >>>> + } > >>>> + len -= copy_len; > >>>> + ceph_oid_destroy(&src_oid); > >>> > >>> how about dst_oid. I think we can set src_oid/dst_oid just before > >>> ceph_osdc_copy_from > >>>> + ret += copy_len; > >>>> + continue; > >>>> + } > >>>> + /* Finally... do an object remote copy */ > >>>> + err = ceph_osdc_copy_from(osdc, src_ci->i_vino.snap, > >>>> + 0, /* XXX src_ci->i_version ? */ > >>>> + &src_oid, &src_oloc, > >>>> + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED, > >>>> + dst_ci->i_vino.snap, &dst_oid, > >>>> + &dst_oloc, > >>>> + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED); > >>>> + if (err) { > >>>> + printk("copy_from returned an error: %d\n", err); /* XXX */ > >>>> + ret = err; > >>>> + goto out_caps; > >>>> + } > >>>> + len -= copy_len; > >>>> + src_off += copy_len; > >>>> + dst_off += copy_len; > >>>> + ret += copy_len; > >>>> + ceph_oid_destroy(&src_oid); > >>>> + ceph_oid_destroy(&dst_oid); > >>>> + } > >>>> + /* Let the MDS know about destination object size change */ > >>>> + if (endoff > size) { > >>> > >>> I think we should file time (call file_update_time()), and mark Fw > >>> dirty even endoff <= size > >>> > >>>> + int dirty; > >>>> + int caps_flags = CHECK_CAPS_AUTHONLY; > >>>> + > >>>> + if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) > >>>> + caps_flags |= CHECK_CAPS_NODELAY; > >>>> + if (ceph_inode_set_size(dst_inode, endoff)) > >>>> + caps_flags |= CHECK_CAPS_AUTHONLY; > >>>> + if (caps_flags) > >>>> + ceph_check_caps(dst_ci, caps_flags, NULL); > >>>> + spin_lock(&dst_ci->i_ceph_lock); > >>>> + dst_ci->i_inline_version = CEPH_INLINE_NONE; > >>>> + dirty = __ceph_mark_dirty_caps( > >>>> + dst_ci, > >>>> + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER, > >>> > >>> don't add CEPH_CAP_FILE_BUFFER > >>> > >>> Regards > >>> Yan, Zheng > >>> > >>>> + &prealloc_cf); > >>>> + spin_unlock(&dst_ci->i_ceph_lock); > >>>> + if (dirty) > >>>> + __mark_inode_dirty(dst_inode, dirty); > >>>> + } > >>>> +out_caps: > >>>> + ceph_put_cap_refs(src_ci, src_got); > >>>> +out_dst_caps: > >>>> + ceph_put_cap_refs(dst_ci, dst_got); > >>>> +out: > >>>> + ceph_free_cap_flush(prealloc_cf); > >>>> + > >>>> + return ret; > >>>> +} > >>>> + > >>>> const struct file_operations ceph_file_fops = { > >>>> .open = ceph_open, > >>>> .release = ceph_release, > >>>> @@ -1844,5 +2064,6 @@ const struct file_operations ceph_file_fops = { > >>>> .unlocked_ioctl = ceph_ioctl, > >>>> .compat_ioctl = ceph_ioctl, > >>>> .fallocate = ceph_fallocate, > >>>> + .copy_file_range = ceph_copy_file_range, > >>>> }; > >>>> > >>> > >