This resolves: http://tracker.ceph.com/issues/190 Signed-off-by: Guangliang Zhao <lucienchao@xxxxxxxxx> --- drivers/block/rbd.c | 105 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ca1fd14..67ea156 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -212,6 +212,7 @@ enum obj_request_type { enum obj_operation_type { OBJ_OPT_WRITE, OBJ_OPT_READ, + OBJ_OPT_DISCARD, }; /* @@ -221,6 +222,7 @@ enum obj_operation_type { static const char* obj_opt[] = { "write", "read", + "discard", }; enum obj_req_flags { @@ -228,6 +230,7 @@ enum obj_req_flags { OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ + IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ }; struct rbd_obj_request { @@ -1518,6 +1521,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; } +/* + * Set the discard flag when the img_request is a discard request + */ +static void img_request_discard_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_DISCARD, &img_request->flags); + smp_mb(); +} + +static bool img_request_discard_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; +} + static void img_request_child_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_CHILD, &img_request->flags); @@ -1640,6 +1658,18 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } +static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short discard. 
Set + * it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + obj_request_done_set(obj_request); +} + /* * For a simple stat call there's nothing to do. We'll do more if * this is part of a write sequence for a layered image. @@ -1687,6 +1717,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: rbd_osd_stat_callback(obj_request); break; + case CEPH_OSD_OP_DELETE: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + rbd_osd_discard_callback(obj_request); + break; case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: @@ -1729,6 +1764,20 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) snapc, CEPH_NOSNAP, &mtime); } +static void rbd_osd_req_format_discard(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct timespec mtime = CURRENT_TIME; + u64 snap_id; + + rbd_assert(osd_req != NULL); + + snap_id = img_request ? 
img_request->snap_id : CEPH_NOSNAP; + ceph_osdc_build_request(osd_req, obj_request->offset, + NULL, snap_id, &mtime); +} + static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, enum obj_operation_type type, @@ -1738,12 +1787,15 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; bool write_request = (type == OBJ_OPT_WRITE) != 0; + bool discard_request = (type == OBJ_OPT_DISCARD) != 0; if (obj_request_img_data_test(obj_request)) { struct rbd_img_request *img_request = obj_request->img_request; rbd_assert(write_request == img_request_write_test(img_request)); + rbd_assert(discard_request == + img_request_discard_test(img_request)); if (type == OBJ_OPT_WRITE) snapc = img_request->snapc; @@ -1756,7 +1808,7 @@ static struct ceph_osd_request *rbd_osd_req_create( if (!osd_req) return NULL; /* ENOMEM */ - if (type == OBJ_OPT_WRITE) + if (type == OBJ_OPT_WRITE || type == OBJ_OPT_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; else osd_req->r_flags = CEPH_OSD_FLAG_READ; @@ -1982,7 +2034,10 @@ static struct rbd_img_request *rbd_img_request_create( img_request->offset = offset; img_request->length = length; img_request->flags = 0; - if (type == OBJ_OPT_WRITE) { + if (type == OBJ_OPT_DISCARD) { + img_request_discard_set(img_request); + img_request->snap_id = rbd_dev->spec->snap_id; + } else if (type == OBJ_OPT_WRITE) { img_request_write_set(img_request); img_request->snapc = rbd_dev->header.snapc; } else { @@ -2083,8 +2138,13 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev = img_request->rbd_dev; enum obj_operation_type type; - type = img_request_write_test(img_request) ? 
OBJ_OPT_WRITE : - OBJ_OPT_READ; + if (img_request_discard_test(img_request)) + type = OBJ_OPT_DISCARD; + else if (img_request_write_test(img_request)) + type = OBJ_OPT_WRITE; + else + type = OBJ_OPT_READ; + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n", obj_opt[type], obj_request->length, obj_request->img_offset, obj_request->offset); @@ -2170,6 +2230,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, unsigned int bio_offset = 0; struct page **pages = NULL; enum obj_operation_type op_type; + u64 object_size = (u64) 1 << rbd_dev->header.obj_order; u64 img_offset; u64 resid; u16 opcode; @@ -2185,8 +2246,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, bio_list = data_desc; rbd_assert(img_offset == bio_list->bi_iter.bi_sector << SECTOR_SHIFT); - } else { - rbd_assert(type == OBJ_REQUEST_PAGES); + } else if (type == OBJ_REQUEST_PAGES) { pages = data_desc; } @@ -2225,7 +2285,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, GFP_ATOMIC); if (!obj_request->bio_list) goto out_partial; - } else { + } else if (type == OBJ_REQUEST_PAGES) { unsigned int page_count; obj_request->pages = pages; @@ -2236,7 +2296,15 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - if (img_request_write_test(img_request)) { + if (img_request_discard_test(img_request)) { + op_type = OBJ_OPT_DISCARD; + if (!offset && length == object_size) + opcode = CEPH_OSD_OP_DELETE; + else if (offset + length == object_size) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } else if (img_request_write_test(img_request)) { op_type = OBJ_OPT_WRITE; opcode = CEPH_OSD_OP_WRITE; } else { @@ -2255,13 +2323,15 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_req, 0, obj_request->bio_list, length); - else + else if (type == OBJ_REQUEST_PAGES) osd_req_op_extent_osd_data_pages(osd_req, 0, 
obj_request->pages, length, offset & ~PAGE_MASK, false, false); if (op_type == OBJ_OPT_WRITE) rbd_osd_req_format_write(obj_request); + else if (op_type == OBJ_OPT_DISCARD) + rbd_osd_req_format_discard(obj_request); else rbd_osd_req_format_read(obj_request); @@ -3093,7 +3163,9 @@ static void rbd_request_fn(struct request_queue *q) spin_unlock_irq(q->queue_lock); - if (rq->cmd_flags & REQ_WRITE) + if (rq->cmd_flags & REQ_DISCARD) + type = OBJ_OPT_DISCARD; + else if (rq->cmd_flags & REQ_WRITE) type = OBJ_OPT_WRITE; else type = OBJ_OPT_READ; @@ -3143,8 +3215,12 @@ static void rbd_request_fn(struct request_queue *q) img_request->rq = rq; - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, - rq->bio); + if (type == OBJ_OPT_DISCARD) + result = rbd_img_request_fill(img_request, + OBJ_REQUEST_NODATA, NULL); + else + result = rbd_img_request_fill(img_request, + OBJ_REQUEST_BIO, rq->bio); if (!result) result = rbd_img_request_submit(img_request); if (result) @@ -3452,6 +3528,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); + /* enable the discard support */ + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + q->limits.discard_granularity = segment_size; + q->limits.discard_alignment = segment_size; + blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html