On Sun, Sep 9, 2018 at 7:06 PM, Yan, Zheng <zyan@xxxxxxxxxx> wrote:
>
>
>> On Sep 8, 2018, at 00:03, Luis Henriques <lhenriques@xxxxxxxx> wrote:
>>
>> "Yan, Zheng" <ukernel@xxxxxxxxx> writes:
>>
>>> On Fri, Sep 7, 2018 at 12:08 AM Luis Henriques <lhenriques@xxxxxxxx> wrote:
>> <...>
>>
>>>> +                if (objoff || (copy_len < src_ci->i_layout.object_size)) {
>>>> +                        /* Do not copy beyond this object */
>>>> +                        if (copy_len > objlen)
>>>> +                                copy_len = objlen;
>>>> +                        err = do_splice_direct(src_file, &src_off, dst_file,
>>>> +                                               &dst_off, copy_len, flags);
>>>
>>> We should release caps refs before calling do_splice_direct().
>>> do_splice_direct() calls read_iter and write_iter, which will get caps
>>> refs again.
>>
>> That's... annoying.  I could release the caps, call do_splice_direct()
>> and get the caps again, but this would introduce a race window.
>>
>> I could also re-write this loop so that it would:
>>
>>  - do all the required do_splice_direct calls
>>  - get the caps
>>  - loop doing all the remote object copies
>>  - release caps
>>
>> But the whole operation would still not be atomic.  Do you have any
>> suggestion on how to make it atomic?
>
> The default implementation (using do_splice_direct) is not atomic. I
> think we just need to ensure that the write to a single object is atomic.

I didn't see anything specific about atomicity in the copy_file_range
man page, but the idea that it wouldn't be atomic definitely seems weird
to me. Are we sure this is okay? Is the kernel's default implementation
doing something non-atomic that would be atomic if we were a local FS?
-Greg

>
> Regards
> Yan, Zheng
>
>>
>> (Regarding all the other comments, I agree with them and I'll fix them
>> in a later version.  Thanks a lot for the review!)
>>
>> Cheers,
>> --
>> Luis
>>
>>> Besides, do_splice_direct() may copy less data than copy_len. I
>>> missed this in the previous review, sorry.
>>>
>>>
>>>> +                        if (err < 0) {
>>>> +                                ret = err;
>>>
>>> If we have copied some data and then an error happens, we should
>>> return the size of the copied data.
>>>
>>>> +                                goto out_caps;
>>>> +                        }
>>>> +                        len -= copy_len;
>>>> +                        ret += copy_len;
>>>> +                        continue;
>>>> +                }
>>>> +
>>>> +                ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
>>>> +                                              copy_len, &objnum, &objoff,
>>>> +                                              &objlen);
>>>> +                ceph_oid_init(&dst_oid);
>>>> +                ceph_oid_printf(&dst_oid, "%llx.%08llx",
>>>> +                                dst_ci->i_vino.ino, objnum);
>>>> +                /* Again... do a manual copy if:
>>>> +                 *  - destination file offset isn't object aligned, or
>>>> +                 *  - copy length is smaller than object size
>>>> +                 *    (although the object size should be the same for different
>>>> +                 *     files in the same filesystem...)
>>>> +                 */
>>>> +                if (objoff || (copy_len < dst_ci->i_layout.object_size)) {
>>>> +                        if (copy_len > objlen)
>>>> +                                copy_len = objlen;
>>>> +                        err = do_splice_direct(src_file, &src_off, dst_file,
>>>> +                                               &dst_off, copy_len, flags);
>>>> +                        if (err < 0) {
>>>> +                                ret = err;
>>>> +                                goto out_caps;
>>>> +                        }
>>>> +                        len -= copy_len;
>>>> +                        ceph_oid_destroy(&src_oid);
>>>
>>> How about dst_oid?  I think we can set src_oid/dst_oid just before
>>> ceph_osdc_copy_from().
>>>> +                        ret += copy_len;
>>>> +                        continue;
>>>> +                }
>>>> +                /* Finally... do an object remote copy */
>>>> +                err = ceph_osdc_copy_from(osdc, src_ci->i_vino.snap,
>>>> +                                          0, /* XXX src_ci->i_version ? */
>>>> +                                          &src_oid, &src_oloc,
>>>> +                                          CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED,
>>>> +                                          dst_ci->i_vino.snap, &dst_oid,
>>>> +                                          &dst_oloc,
>>>> +                                          CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
>>>> +                if (err) {
>>>> +                        printk("copy_from returned an error: %d\n", err); /* XXX */
>>>> +                        ret = err;
>>>> +                        goto out_caps;
>>>> +                }
>>>> +                len -= copy_len;
>>>> +                src_off += copy_len;
>>>> +                dst_off += copy_len;
>>>> +                ret += copy_len;
>>>> +                ceph_oid_destroy(&src_oid);
>>>> +                ceph_oid_destroy(&dst_oid);
>>>> +        }
>>>> +        /* Let the MDS know about destination object size change */
>>>> +        if (endoff > size) {
>>>
>>> I think we should update the file time (call file_update_time()), and
>>> mark Fw dirty even when endoff <= size.
>>>
>>>> +                int dirty;
>>>> +                int caps_flags = CHECK_CAPS_AUTHONLY;
>>>> +
>>>> +                if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
>>>> +                        caps_flags |= CHECK_CAPS_NODELAY;
>>>> +                if (ceph_inode_set_size(dst_inode, endoff))
>>>> +                        caps_flags |= CHECK_CAPS_AUTHONLY;
>>>> +                if (caps_flags)
>>>> +                        ceph_check_caps(dst_ci, caps_flags, NULL);
>>>> +                spin_lock(&dst_ci->i_ceph_lock);
>>>> +                dst_ci->i_inline_version = CEPH_INLINE_NONE;
>>>> +                dirty = __ceph_mark_dirty_caps(
>>>> +                        dst_ci,
>>>> +                        CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER,
>>>
>>> Don't add CEPH_CAP_FILE_BUFFER.
>>>
>>> Regards
>>> Yan, Zheng
>>>
>>>> +                        &prealloc_cf);
>>>> +                spin_unlock(&dst_ci->i_ceph_lock);
>>>> +                if (dirty)
>>>> +                        __mark_inode_dirty(dst_inode, dirty);
>>>> +        }
>>>> +out_caps:
>>>> +        ceph_put_cap_refs(src_ci, src_got);
>>>> +out_dst_caps:
>>>> +        ceph_put_cap_refs(dst_ci, dst_got);
>>>> +out:
>>>> +        ceph_free_cap_flush(prealloc_cf);
>>>> +
>>>> +        return ret;
>>>> +}
>>>> +
>>>>  const struct file_operations ceph_file_fops = {
>>>>          .open = ceph_open,
>>>>          .release = ceph_release,
>>>> @@ -1844,5 +2064,6 @@ const struct file_operations ceph_file_fops = {
>>>>          .unlocked_ioctl = ceph_ioctl,
>>>>          .compat_ioctl = ceph_ioctl,
>>>>          .fallocate = ceph_fallocate,
>>>> +        .copy_file_range = ceph_copy_file_range,
>>>>  };
>>>>
>>>
>
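
A side note on the short-copy point raised above: at the syscall boundary,
copy_file_range(2) is already allowed to copy fewer bytes than requested, so
userspace callers are expected to loop on the return value and therefore
always know how many bytes made it across before any failure. Below is a
minimal, illustrative userspace sketch of that calling convention; it is not
part of the patch, and the file arguments and error handling are made up for
the example.

/*
 * Illustrative only: loop on copy_file_range(2), tolerate short copies,
 * and report how many bytes were copied before any error.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        off_t len, copied = 0;
        int src, dst;

        if (argc != 3) {
                fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
                return 1;
        }
        src = open(argv[1], O_RDONLY);
        dst = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (src < 0 || dst < 0) {
                perror("open");
                return 1;
        }
        len = lseek(src, 0, SEEK_END);
        lseek(src, 0, SEEK_SET);

        while (len > 0) {
                /* May copy less than 'len'; that is not an error. */
                ssize_t ret = copy_file_range(src, NULL, dst, NULL, len, 0);
                if (ret < 0) {
                        /* Partial progress is still visible to the caller. */
                        perror("copy_file_range");
                        fprintf(stderr, "copied %lld bytes before failing\n",
                                (long long)copied);
                        return 1;
                }
                if (ret == 0)
                        break;  /* EOF on the source */
                len -= ret;
                copied += ret;
        }
        printf("copied %lld bytes\n", (long long)copied);
        return 0;
}

The in-kernel ->copy_file_range() implementation feeds this same contract,
which is why the review asks the fallback path to return the number of bytes
already copied (rather than the error code) when a later step fails.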