Re: [RFC PATCH v5 2/4] ceph: support the RADOS copy-from operation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Oct 9, 2018 at 12:47 PM Luis Henriques <lhenriques@xxxxxxxx> wrote:
>
> Add support for performing remote object copies using the 'copy-from'
> operation.
>
> Signed-off-by: Luis Henriques <lhenriques@xxxxxxxx>
> ---
>  include/linux/ceph/osd_client.h | 17 ++++++++
>  include/linux/ceph/rados.h      | 19 +++++++++
>  net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++
>  3 files changed, 108 insertions(+)

Change the title to "libceph: ...".

>
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index 02096da01845..f25b4aaaab09 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -136,6 +136,13 @@ struct ceph_osd_req_op {
>                         u64 expected_object_size;
>                         u64 expected_write_size;
>                 } alloc_hint;
> +               struct {
> +                       u64 snapid;
> +                       u64 src_version;
> +                       u8 flags;
> +                       u32 src_fadvise_flags;
> +                       struct ceph_osd_data osd_data;
> +               } copy_from;
>         };
>  };
>
> @@ -511,6 +518,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
>                                 struct timespec64 *mtime,
>                                 struct page **pages, int nr_pages);
>
> +extern int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
> +                              u64 src_snapid, u64 src_version,
> +                              struct ceph_object_id *src_oid,
> +                              struct ceph_object_locator *src_oloc,
> +                              u32 src_fadvise_flags,
> +                              u64 dst_snapid,
> +                              struct ceph_object_id *dst_oid,
> +                              struct ceph_object_locator *dst_oloc,
> +                              u8 dst_fadvise_flags);

Drop extern here.

> +
>  /* watch/notify */
>  struct ceph_osd_linger_request *
>  ceph_osdc_watch(struct ceph_osd_client *osdc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index f1988387c5ad..d47540986eff 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -410,6 +410,14 @@ enum {
>  enum {
>         CEPH_OSD_OP_FLAG_EXCL = 1,      /* EXCL object create */
>         CEPH_OSD_OP_FLAG_FAILOK = 2,    /* continue despite failure */
> +       CEPH_OSD_OP_FLAG_FADVISE_RANDOM     = 0x4, /* the op is random */
> +       CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
> +       CEPH_OSD_OP_FLAG_FADVISE_WILLNEED   = 0x10,/* data will be accessed in
> +                                                     the near future */
> +       CEPH_OSD_OP_FLAG_FADVISE_DONTNEED   = 0x20,/* data will not be accessed
> +                                                     in the near future */
> +       CEPH_OSD_OP_FLAG_FADVISE_NOCACHE    = 0x40,/* data will be accessed only
> +                                                     once by this client */
>  };
>
>  #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
> @@ -497,6 +505,17 @@ struct ceph_osd_op {
>                         __le64 expected_object_size;
>                         __le64 expected_write_size;
>                 } __attribute__ ((packed)) alloc_hint;
> +               struct {
> +                       __le64 snapid;
> +                       __le64 src_version;
> +                       __u8 flags;

Add /* CEPH_OSD_COPY_FROM_FLAG_* */ and pull in that enum.

> +                       /*
> +                        * __le32 flags: CEPH_OSD_OP_FLAG_FADVISE_: mean the
> +                        * fadvise flags for dest object src_fadvise_flags mean
> +                        * the fadvise flags for src object
> +                        */
> +                       __le32 src_fadvise_flags;

This comment is super confusing.  It makes it look like a commented out
field, but it actually refers to op.flags.  This seems to have confused
you into thinking that dst_fadvise_flags go into the above __u8 flags.

Change it to

/*
 * CEPH_OSD_OP_FLAG_FADVISE_*
 *
 * These are fadvise flags for src object, fadvise flags for dest
 * object are in ceph_osd_op::flags.
 */

I'll send a PR to get this clarified on the userspace side as well.

> +               } __attribute__ ((packed)) copy_from;
>         };
>         __le32 payload_len;
>  } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 60934bd8796c..9fd19ff556af 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -955,6 +955,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
>         case CEPH_OSD_OP_CREATE:
>         case CEPH_OSD_OP_DELETE:
>                 break;
> +       case CEPH_OSD_OP_COPY_FROM:
> +               dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
> +               dst->copy_from.src_version =
> +                       cpu_to_le64(src->copy_from.src_version);
> +               dst->copy_from.flags = src->copy_from.flags;
> +               dst->copy_from.src_fadvise_flags =
> +                       cpu_to_le32(src->copy_from.src_fadvise_flags);
> +               break;
>         default:
>                 pr_err("unsupported osd opcode %s\n",
>                         ceph_osd_op_name(src->op));
> @@ -1908,6 +1916,10 @@ static void setup_request_data(struct ceph_osd_request *req,
>                         ceph_osdc_msg_data_add(req->r_reply,
>                                                &op->notify.response_data);
>                         break;
> +               case CEPH_OSD_OP_COPY_FROM:
> +                       ceph_osdc_msg_data_add(msg,
> +                                              &op->copy_from.osd_data);
> +                       break;

This is a request data item.  Move it up, near CEPH_OSD_OP_NOTIFY_ACK.

>                 }
>
>                 data_len += op->indata_len;
> @@ -5168,6 +5180,66 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
>  }
>  EXPORT_SYMBOL(ceph_osdc_writepages);
>
> +int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
> +                       u64 src_snapid, u64 src_version,
> +                       struct ceph_object_id *src_oid,
> +                       struct ceph_object_locator *src_oloc,
> +                       u32 src_fadvise_flags,
> +                       u64 dst_snapid,

I don't see any snap context around, why does it need dst_snapid?
Also, libceph will WARN if you try to write to a snapshot.

> +                       struct ceph_object_id *dst_oid,
> +                       struct ceph_object_locator *dst_oloc,
> +                       u8 dst_fadvise_flags)

We should probably add u8 copy_from_flags for generality.

> +{
> +       struct ceph_osd_request *req = NULL;
> +       struct ceph_options *opts = osdc->client->options;
> +       struct ceph_osd_req_op *op;
> +       struct page **pages;
> +       void *p, *end;
> +       int ret;
> +
> +       pages = ceph_alloc_page_vector(1, GFP_NOIO);
> +       if (IS_ERR(pages))
> +               return PTR_ERR(pages);
> +       req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
> +       if (!req) {
> +               ret = -ENOMEM;
> +               goto out;
> +       }
> +       req->r_flags = CEPH_OSD_FLAG_WRITE;
> +       op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, 0);
> +       op->copy_from.snapid = src_snapid;
> +       op->copy_from.src_version = src_version;
> +       op->copy_from.flags = dst_fadvise_flags;

This should be either copy_from_flags or 0.

I'd factor this code out, similar to osd_req_op_notify_ack_init().
This function would then only be concerned with req itself.

> +       op->copy_from.src_fadvise_flags = src_fadvise_flags;
> +
> +       ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc);
> +       ceph_oid_copy(&req->r_t.base_oid, dst_oid);
> +
> +       p = page_address(pages[0]);
> +       end = p + PAGE_SIZE;
> +       ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
> +       encode_oloc(&p, end, src_oloc);
> +       op->indata_len = PAGE_SIZE - (end - p);
> +
> +       ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
> +                                op->indata_len, 0,
> +                                false, true);
> +       req->r_snapid = dst_snapid;
> +       req->r_data_offset = 0; /* XXX dst_off? */
> +
> +       ret = ceph_osdc_alloc_messages(req, GFP_NOFS);

You are using GFP_NOIO for pages and req and GFP_NOFS here.
Does this operation need either?  If it's only going to be called in
normal syscall context, GFP_KERNEL should be good enough.

> +       if (ret)
> +               goto out;
> +       ceph_osdc_start_request(osdc, req, false);
> +       ret = wait_request_timeout(req, opts->mount_timeout);

I don't think the timeout is needed here.  CephFS uses mount_timeout
for MDS operations, OSD requests aren't timed out in general.

Thanks,

                Ilya



[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux