On Fri, 21 Feb 2014, Ilya Dryomov wrote: > This is primarily for rbd's benefit and is supposed to combat > fragmentation: > > "... knowing that rbd images have a 4m size, librbd can pass a hint > that will let the osd do the xfs allocation size ioctl on new files so > that they are allocated in 1m or 4m chunks. We've seen cases where > users with rbd workloads have very high levels of fragmentation in xfs > and this would mitigate that and probably have a pretty nice > performance benefit." > > SETALLOCHINT is considered advisory, so our backwards compatibility > mechanism here is to set FAILOK flag for all SETALLOCHINT ops. > > Signed-off-by: Ilya Dryomov <ilya.dryomov@xxxxxxxxxxx> > --- > include/linux/ceph/osd_client.h | 9 +++++++++ > include/linux/ceph/rados.h | 8 ++++++++ > net/ceph/osd_client.c | 30 ++++++++++++++++++++++++++++++ > 3 files changed, 47 insertions(+) > > diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h > index e94f5da251d6..6bfcb0eca8ab 100644 > --- a/include/linux/ceph/osd_client.h > +++ b/include/linux/ceph/osd_client.h > @@ -103,6 +103,11 @@ struct ceph_osd_req_op { > u32 timeout; > __u8 flag; > } watch; > + struct { > + u64 expected_size; > + u64 expected_write_size; > + __u8 expected_size_probability; > + } hint; s/hint/alloc_hint/ ? 
> }; > }; > > @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, > extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, > unsigned int which, u16 opcode, > u64 cookie, u64 version, int flag); > +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req, > + unsigned int which, u16 opcode, > + u64 expected_size, u64 expected_write_size, > + u8 expected_size_probability); > > extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, > struct ceph_snap_context *snapc, > diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h > index 8f9bf4570215..b8e2dd11f186 100644 > --- a/include/linux/ceph/rados.h > +++ b/include/linux/ceph/rados.h > @@ -227,6 +227,9 @@ enum { > CEPH_OSD_OP_OMAPRMKEYS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24, > CEPH_OSD_OP_OMAP_CMP = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25, > > + /* hints */ > + CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35, > + > /** multi **/ > CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1, > CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2, > @@ -416,6 +419,11 @@ struct ceph_osd_op { > __le64 offset, length; > __le64 src_offset; > } __attribute__ ((packed)) clonerange; > + struct { > + __le64 expected_size; > + __le64 expected_write_size; > + __u8 expected_size_probability; > + } __attribute__ ((packed)) hint; s/hint/alloc_hint/, I think. Just made the same comment on the user space side. 
> }; > __le32 payload_len; > } __attribute__ ((packed)); > diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c > index 5d7fd0b8c1c8..4090f6e8db3a 100644 > --- a/net/ceph/osd_client.c > +++ b/net/ceph/osd_client.c > @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode) > case CEPH_OSD_OP_OMAPCLEAR: > case CEPH_OSD_OP_OMAPRMKEYS: > case CEPH_OSD_OP_OMAP_CMP: > + case CEPH_OSD_OP_SETALLOCHINT: > case CEPH_OSD_OP_CLONERANGE: > case CEPH_OSD_OP_ASSERT_SRC_VERSION: > case CEPH_OSD_OP_SRC_CMPXATTR: > @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req, > } > EXPORT_SYMBOL(osd_req_op_watch_init); > > +void osd_req_op_hint_init(struct ceph_osd_request *osd_req, > + unsigned int which, u16 opcode, > + u64 expected_size, u64 expected_write_size, > + u8 expected_size_probability) > +{ > + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); > + > + BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT); I would just drop the opcode argument altogether. And s/hint/alloc_hint/ in the function name... I wouldn't expect that any other type of hint would have these same arguments. > + > + op->hint.expected_size = expected_size; > + op->hint.expected_write_size = expected_write_size; > + op->hint.expected_size_probability = expected_size_probability; > + > + /* > + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed > + * not worth a feature bit. Set FAILOK per-op flag to make > + * sure older osds don't trip over an unsupported opcode. 
> + */ > + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; > +} > +EXPORT_SYMBOL(osd_req_op_hint_init); > + > static void ceph_osdc_msg_data_add(struct ceph_msg *msg, > struct ceph_osd_data *osd_data) > { > @@ -681,6 +704,13 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, > dst->watch.ver = cpu_to_le64(src->watch.ver); > dst->watch.flag = src->watch.flag; > break; > + case CEPH_OSD_OP_SETALLOCHINT: > + dst->hint.expected_size = cpu_to_le64(src->hint.expected_size); > + dst->hint.expected_write_size = > + cpu_to_le64(src->hint.expected_write_size); > + dst->hint.expected_size_probability = > + src->hint.expected_size_probability; > + break; > default: > pr_err("unsupported osd opcode %s\n", > ceph_osd_op_name(src->op)); > -- > 1.7.10.4 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html