Re: [PATCH v3 3/3] ceph: support copy_file_range file operation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> On Sep 8, 2018, at 00:03, Luis Henriques <lhenriques@xxxxxxxx> wrote:
> 
> "Yan, Zheng" <ukernel@xxxxxxxxx> writes:
> 
>> On Fri, Sep 7, 2018 at 12:08 AM Luis Henriques <lhenriques@xxxxxxxx> wrote:
> <...>
> 
>>> +               if (objoff || (copy_len < src_ci->i_layout.object_size)) {
>>> +                       /* Do not copy beyond this object */
>>> +                       if (copy_len > objlen)
>>> +                               copy_len = objlen;
>>> +                       err = do_splice_direct(src_file, &src_off, dst_file,
>>> +                                              &dst_off, copy_len, flags);
>> 
>> We should release caps refs before calling do_splice_direct().
>> do_splice_direct calls read_iter and write_iter, which will get caps
>> refs again.
> 
> That's... annoying.  I could release caps, call do_slice_direct and get
> the caps again but this would introduce a race window.
> 
> I could also re-write this loop so that it would:
> 
> - do all the required do_splice_direct calls
> - get the caps
> - loop doing all the remote object copies
> - release caps
> 
> But the whole operation would still not be atomic.  Do you have any
> suggestion on how to make it atomic?

The default implementation (using do_splice_direct) is not atomic. I think we just need to ensure write on single object is atomic.

Regards
Yan, Zheng

> 
> (Regarding all the other comments, I agree with them and I'll fix them
> in a later version.  Thanks a lot for the review!)
> 
> Cheers,
> -- 
> Luis
> 
>> Besides, do_splice_direct() may copy less data than copy_len.  I
>> missed this in previous review, Sorry.
>> 
>> 
>>> +                       if (err < 0) {
>>> +                               ret = err;
>> 
>> If we have copied same data, then error happened, we should return
>> size of copied data.
>> 
>>> +                               goto out_caps;
>>> +                       }
>>> +                       len -= copy_len;
>>> +                       ret += copy_len;
>>> +                       continue;
>>> +               }
>>> +
>>> +               ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
>>> +                                             copy_len, &objnum, &objoff,
>>> +                                             &objlen);
>>> +               ceph_oid_init(&dst_oid);
>>> +               ceph_oid_printf(&dst_oid, "%llx.%08llx",
>>> +                               dst_ci->i_vino.ino, objnum);
>>> +               /* Again... do a manual copy if:
>>> +                *  - destination file offset isn't object aligned, or
>>> +                *  - copy length is smaller than object size
>>> +                *    (although the object size should be the same for different
>>> +                *     files in the same filesystem...)
>>> +                */
>>> +               if (objoff || (copy_len < dst_ci->i_layout.object_size)) {
>>> +                       if (copy_len > objlen)
>>> +                               copy_len = objlen;
>>> +                       err = do_splice_direct(src_file, &src_off, dst_file,
>>> +                                             &dst_off, copy_len, flags);
>>> +                       if (err < 0) {
>>> +                               ret = err;
>>> +                               goto out_caps;
>>> +                       }
>>> +                       len -= copy_len;
>>> +                       ceph_oid_destroy(&src_oid);
>> 
>> how about dst_oid.  I think we can set src_oid/dst_oid just before
>> ceph_osdc_copy_from
>>> +                       ret += copy_len;
>>> +                       continue;
>>> +               }
>>> +               /* Finally... do an object remote copy */
>>> +               err = ceph_osdc_copy_from(osdc, src_ci->i_vino.snap,
>>> +                                         0, /* XXX src_ci->i_version ? */
>>> +                                         &src_oid, &src_oloc,
>>> +                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED,
>>> +                                         dst_ci->i_vino.snap, &dst_oid,
>>> +                                         &dst_oloc,
>>> +                                         CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL|CEPH_OSD_OP_FLAG_FADVISE_WILLNEED);
>>> +               if (err) {
>>> +                       printk("copy_from returned an error: %d\n", err); /* XXX */
>>> +                       ret = err;
>>> +                       goto out_caps;
>>> +               }
>>> +               len -= copy_len;
>>> +               src_off += copy_len;
>>> +               dst_off += copy_len;
>>> +               ret += copy_len;
>>> +               ceph_oid_destroy(&src_oid);
>>> +               ceph_oid_destroy(&dst_oid);
>>> +       }
>>> +       /* Let the MDS know about destination object size change */
>>> +       if (endoff > size) {
>> 
>> I think we should file time (call file_update_time()), and mark Fw
>> dirty even endoff <= size
>> 
>>> +               int dirty;
>>> +               int caps_flags = CHECK_CAPS_AUTHONLY;
>>> +
>>> +               if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff))
>>> +                       caps_flags |= CHECK_CAPS_NODELAY;
>>> +               if (ceph_inode_set_size(dst_inode, endoff))
>>> +                       caps_flags |= CHECK_CAPS_AUTHONLY;
>>> +               if (caps_flags)
>>> +                       ceph_check_caps(dst_ci, caps_flags, NULL);
>>> +               spin_lock(&dst_ci->i_ceph_lock);
>>> +               dst_ci->i_inline_version = CEPH_INLINE_NONE;
>>> +               dirty = __ceph_mark_dirty_caps(
>>> +                       dst_ci,
>>> +                       CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER,
>> 
>> don't add CEPH_CAP_FILE_BUFFER
>> 
>> Regards
>> Yan, Zheng
>> 
>>> +                       &prealloc_cf);
>>> +               spin_unlock(&dst_ci->i_ceph_lock);
>>> +               if (dirty)
>>> +                       __mark_inode_dirty(dst_inode, dirty);
>>> +       }
>>> +out_caps:
>>> +       ceph_put_cap_refs(src_ci, src_got);
>>> +out_dst_caps:
>>> +       ceph_put_cap_refs(dst_ci, dst_got);
>>> +out:
>>> +       ceph_free_cap_flush(prealloc_cf);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>> const struct file_operations ceph_file_fops = {
>>>        .open = ceph_open,
>>>        .release = ceph_release,
>>> @@ -1844,5 +2064,6 @@ const struct file_operations ceph_file_fops = {
>>>        .unlocked_ioctl = ceph_ioctl,
>>>        .compat_ioctl   = ceph_ioctl,
>>>        .fallocate      = ceph_fallocate,
>>> +       .copy_file_range = ceph_copy_file_range,
>>> };
>>> 
>> 




[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux