On Wed, Feb 10, 2016 at 12:56 PM, Yan, Zheng <zyan@xxxxxxxxxx> wrote: > >> On Feb 10, 2016, at 19:24, Ilya Dryomov <idryomov@xxxxxxxxx> wrote: >> >> Turn r_ops into a flexible array member to enable large, consisting of >> up to 16 ops, OSD requests. The use case is scattered writeback in >> cephfs and, as far as the kernel client is concerned, 16 is just a made >> up number. >> >> r_ops had size 3 for copyup+hint+write, but copyup is really a special >> case - it can only happen once. ceph_osd_request_cache is therefore >> stuffed with num_ops=2 requests, anything bigger than that is allocated >> with kmalloc(). req_mempool is backed by ceph_osd_request_cache, which >> means either num_ops=1 or num_ops=2 for use_mempool=true - all existing >> users (ceph_writepages_start(), ceph_osdc_writepages()) are fine with >> that. >> >> Signed-off-by: Ilya Dryomov <idryomov@xxxxxxxxx> >> --- >> drivers/block/rbd.c | 2 -- >> include/linux/ceph/osd_client.h | 6 ++++-- >> net/ceph/osd_client.c | 32 +++++++++++++++++++------------- >> 3 files changed, 23 insertions(+), 17 deletions(-) >> >> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c >> index 94f31bde73e8..08018710f363 100644 >> --- a/drivers/block/rbd.c >> +++ b/drivers/block/rbd.c >> @@ -1847,8 +1847,6 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, >> if (osd_req->r_result < 0) >> obj_request->result = osd_req->r_result; >> >> - rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); >> - >> /* >> * We support a 64-bit length, but ultimately it has to be >> * passed to the block layer, which just supports a 32-bit >> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> index c6d1d603bacf..aada6a1383a4 100644 >> --- a/include/linux/ceph/osd_client.h >> +++ b/include/linux/ceph/osd_client.h >> @@ -43,7 +43,8 @@ struct ceph_osd { >> }; >> >> >> -#define CEPH_OSD_MAX_OP 3 >> +#define CEPH_OSD_SLAB_OPS 2 >> +#define CEPH_OSD_MAX_OPS 16 >> >> enum ceph_osd_data_type { >> CEPH_OSD_DATA_TYPE_NONE = 0, >> @@ -139,7 +140,6 @@ struct ceph_osd_request { >> >> /* request osd ops array */ >> unsigned int r_num_ops; >> - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; >> >> /* these are updated on each send */ >> __le32 *r_request_osdmap_epoch; >> @@ -175,6 +175,8 @@ struct ceph_osd_request { >> unsigned long r_stamp; /* send OR check time */ >> >> struct ceph_snap_context *r_snapc; /* snap context for writes */ >> + >> + struct ceph_osd_req_op r_ops[]; >> }; >> >> struct ceph_request_redirect { >> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c >> index 8bf4f74064e5..37a0fc5273d1 100644 >> --- a/net/ceph/osd_client.c >> +++ b/net/ceph/osd_client.c >> @@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref) >> ceph_put_snap_context(req->r_snapc); >> if (req->r_mempool) >> mempool_free(req, req->r_osdc->req_mempool); >> - else >> + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) >> kmem_cache_free(ceph_osd_request_cache, req); >> - >> + else >> + kfree(req); >> } >> >> void ceph_osdc_get_request(struct ceph_osd_request *req) >> @@ -369,9 +370,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, >> struct ceph_msg *msg; >> size_t msg_size; >> >> - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); >> - BUG_ON(num_ops > CEPH_OSD_MAX_OP); >> - >> msg_size = 4 + 4 + 8 + 8 + 4+8; >> msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ >> msg_size += 1 + 8 + 4 + 4; /* pg_t */ >> @@ -383,14 +381,21 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, >> msg_size += 4; >> >> if (use_mempool) { >> + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); >> req = mempool_alloc(osdc->req_mempool, gfp_flags); >> - memset(req, 0, sizeof(*req)); >> + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { >> + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); >> } else { >> - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); >> + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); >> + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), >> + gfp_flags); >> } >> - if (req == NULL) >> + if (unlikely(!req)) >> return NULL; >> >> + /* req only, each op is zeroed in _osd_req_op_init() */ >> + memset(req, 0, sizeof(*req)); >> + >> req->r_osdc = osdc; >> req->r_mempool = use_mempool; >> req->r_num_ops = num_ops; >> @@ -1809,7 +1814,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) >> >> ceph_decode_need(&p, end, 4, bad_put); >> numops = ceph_decode_32(&p); >> - if (numops > CEPH_OSD_MAX_OP) >> + if (numops > CEPH_OSD_MAX_OPS) >> goto bad_put; >> if (numops != req->r_num_ops) >> goto bad_put; >> @@ -2773,11 +2778,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages); >> >> int ceph_osdc_setup(void) >> { >> + size_t size = sizeof(struct ceph_osd_request) + >> + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); >> + >> BUG_ON(ceph_osd_request_cache); >> - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", >> - sizeof (struct ceph_osd_request), >> - __alignof__(struct ceph_osd_request), >> - 0, NULL); >> + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, >> + 0, 0, NULL); >> >> return ceph_osd_request_cache ? 0 : -ENOMEM; >> } > > This approach is better than mine. But the code that calculates r_request/r_reply messages size get lost in your patch r_request calculation is there, r_reply was missing because I wanted to take another look at both r_request and r_reply - it can be a separate commit though. I'll pull wip-alloc-request into testing later today. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html