On Fri, Oct 12, 2018 at 02:06:28PM -0700, Daniel Verkamp wrote: > From: Changpeng Liu <changpeng.liu@xxxxxxxxx> > > In commit 88c85538, "virtio-blk: add discard and write zeroes features > to specification" (https://github.com/oasis-tcs/virtio-spec), the virtio > block specification has been extended to add VIRTIO_BLK_T_DISCARD and > VIRTIO_BLK_T_WRITE_ZEROES commands. This patch enables support for > discard and write zeroes in the virtio-blk driver when the device > advertises the corresponding features, VIRTIO_BLK_F_DISCARD and > VIRTIO_BLK_F_WRITE_ZEROES. > > Signed-off-by: Changpeng Liu <changpeng.liu@xxxxxxxxx> > Signed-off-by: Daniel Verkamp <dverkamp@xxxxxxxxxxxx> > --- > dverkamp: I've picked up this patch and made a few minor changes (as > listed below); most notably, I changed the kmalloc back to GFP_ATOMIC, > since it can be called from a context where sleeping is not allowed. > To prevent large allocations, I've also clamped the maximum number of > discard segments to 256; this results in a 4K allocation and should be > plenty of descriptors for most use cases. > > I also removed most of the description from the commit message, since it > was duplicating the comments from virtio_blk.h and quoting parts of the > spec without adding any extra information. I have tested this iteration > of the patch using crosvm with modifications to enable the new features: > https://chromium.googlesource.com/chromiumos/platform/crosvm/ > > CHANGELOG: > v8: [dverkamp] replace shifts by 9 with SECTOR_SHIFT constant > v7: [dverkamp] use GFP_ATOMIC for allocation that may not sleep; clarify > descriptor flags field; comment wording cleanups. > v6: don't set T_OUT bit to discard and write zeroes commands. > v5: use new block layer API: blk_queue_flag_set. > v4: several optimizations based on MST's comments, remove bit field > usage for command descriptor. > v3: define the virtio-blk protocol to add discard and write zeroes > support, first version implementation based on proposed specification. > v2: add write zeroes command support. > v1: initial proposal implementation for discard command. > --- > drivers/block/virtio_blk.c | 95 ++++++++++++++++++++++++++++++++- > include/uapi/linux/virtio_blk.h | 54 +++++++++++++++++++ > 2 files changed, 147 insertions(+), 2 deletions(-) The implementation is quite straightforward, just some minor points, see inline comment. > > diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c > index 23752dc99b00..04a7ae602e2f 100644 > --- a/drivers/block/virtio_blk.c > +++ b/drivers/block/virtio_blk.c > @@ -18,6 +18,7 @@ > > #define PART_BITS 4 > #define VQ_NAME_LEN 16 > +#define MAX_DISCARD_SEGMENTS 256 > > static int major; > static DEFINE_IDA(vd_index_ida); > @@ -172,10 +173,50 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr, > return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); > } > > + > +static inline int virtblk_setup_discard_write_zeroes(struct request *req, > + bool unmap) > +{ > + unsigned short segments = blk_rq_nr_discard_segments(req); > + unsigned short n = 0; > + struct virtio_blk_discard_write_zeroes *range; > + struct bio *bio; > + u32 flags = 0; > + > + if (unmap) > + flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP; > + > + range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC); > + if (!range) > + return -ENOMEM; > + > + __rq_for_each_bio(bio, req) { > + u64 sector = bio->bi_iter.bi_sector; > + u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; > + > + range[n].flags = cpu_to_le32(flags); > + range[n].num_sectors = cpu_to_le32(num_sectors); > + range[n].sector = cpu_to_le64(sector); > + n++; > + } > + > + req->special_vec.bv_page = virt_to_page(range); > + req->special_vec.bv_offset = offset_in_page(range); > + req->special_vec.bv_len = sizeof(*range) * segments; > + req->rq_flags |= RQF_SPECIAL_PAYLOAD; > + > + return 0; > +} > + > static inline void virtblk_request_done(struct request *req) > { > struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); > > + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { > + kfree(page_address(req->special_vec.bv_page) + > + req->special_vec.bv_offset); > + } > + > switch (req_op(req)) { > case REQ_OP_SCSI_IN: > case REQ_OP_SCSI_OUT: > @@ -225,6 +266,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > int qid = hctx->queue_num; > int err; > bool notify = false; > + bool unmap = false; > u32 type; > > BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); > @@ -237,6 +279,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > case REQ_OP_FLUSH: > type = VIRTIO_BLK_T_FLUSH; > break; > + case REQ_OP_DISCARD: > + type = VIRTIO_BLK_T_DISCARD; > + break; > + case REQ_OP_WRITE_ZEROES: > + type = VIRTIO_BLK_T_WRITE_ZEROES; > + unmap = !(req->cmd_flags & REQ_NOUNMAP); > + break; > case REQ_OP_SCSI_IN: > case REQ_OP_SCSI_OUT: > type = VIRTIO_BLK_T_SCSI_CMD; > @@ -256,6 +305,12 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, > > blk_mq_start_request(req); > > + if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES) { > + err = virtblk_setup_discard_write_zeroes(req, unmap); > + if (err) > + return BLK_STS_RESOURCE; > + } > + > num = blk_rq_map_sg(hctx->queue, req, vbr->sg); > if (num) { > if (rq_data_dir(req) == WRITE) > @@ -777,6 +832,42 @@ static int virtblk_probe(struct virtio_device *vdev) > if (!err && opt_io_size) > blk_queue_io_opt(q, blk_size * opt_io_size); > > + if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { > + q->limits.discard_granularity = blk_size; > + > + virtio_cread(vdev, struct virtio_blk_config, > + discard_sector_alignment, &v); > + if (v) > + q->limits.discard_alignment = v << SECTOR_SHIFT; > + else > + q->limits.discard_alignment = 0; It may be better to use "v ? v << SECTOR_SHIFT : 0". > + > + virtio_cread(vdev, struct virtio_blk_config, > + max_discard_sectors, &v); > + if (v) > + blk_queue_max_discard_sectors(q, v); > + else > + blk_queue_max_discard_sectors(q, UINT_MAX); Same with above. > + > + virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, > + &v); > + if (v && v <= MAX_DISCARD_SEGMENTS) > + blk_queue_max_discard_segments(q, v); > + else > + blk_queue_max_discard_segments(q, MAX_DISCARD_SEGMENTS); It may be better to use min_not_zero(). > + > + blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); > + } > + > + if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { > + virtio_cread(vdev, struct virtio_blk_config, > + max_write_zeroes_sectors, &v); > + if (v) > + blk_queue_max_write_zeroes_sectors(q, v); > + else > + blk_queue_max_write_zeroes_sectors(q, UINT_MAX); > + } Same with above. > + > virtblk_update_capacity(vblk, false); > virtio_device_ready(vdev); > > @@ -885,14 +976,14 @@ static unsigned int features_legacy[] = { > VIRTIO_BLK_F_SCSI, > #endif > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, > - VIRTIO_BLK_F_MQ, > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, > } > ; > static unsigned int features[] = { > VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, > VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, > VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, > - VIRTIO_BLK_F_MQ, > + VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, > }; > > static struct virtio_driver virtio_blk = { > diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h > index 9ebe4d968dd5..682afbfe3aa4 100644 > --- a/include/uapi/linux/virtio_blk.h > +++ b/include/uapi/linux/virtio_blk.h > @@ -38,6 +38,8 @@ > #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ > #define VIRTIO_BLK_F_TOPOLOGY 10 /* Topology information is available */ > #define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ > +#define VIRTIO_BLK_F_DISCARD 13 /* DISCARD is supported */ > +#define VIRTIO_BLK_F_WRITE_ZEROES 14 /* WRITE ZEROES is supported */ > > /* Legacy feature bits */ > #ifndef VIRTIO_BLK_NO_LEGACY > @@ -86,6 +88,39 @@ struct virtio_blk_config { > > /* number of vqs, only available when VIRTIO_BLK_F_MQ is set */ > __u16 num_queues; > + > + /* the next 3 entries are guarded by VIRTIO_BLK_F_DISCARD */ > + /* > + * The maximum discard sectors (in 512-byte sectors) for > + * one segment. > + */ > + __u32 max_discard_sectors; > + /* > + * The maximum number of discard segments in a > + * discard command. > + */ > + __u32 max_discard_seg; > + /* Discard commands must be aligned to this number of sectors. */ > + __u32 discard_sector_alignment; > + > + /* the next 3 entries are guarded by VIRTIO_BLK_F_WRITE_ZEROES */ > + /* > + * The maximum number of write zeroes sectors (in 512-byte sectors) in > + * one segment. > + */ > + __u32 max_write_zeroes_sectors; > + /* > + * The maximum number of segments in a write zeroes > + * command. > + */ > + __u32 max_write_zeroes_seg; > + /* > + * Set if a VIRTIO_BLK_T_WRITE_ZEROES request may result in the > + * deallocation of one or more of the sectors. > + */ > + __u8 write_zeroes_may_unmap; > + > + __u8 unused1[3]; > } __attribute__((packed)); > > /* > @@ -114,6 +149,12 @@ struct virtio_blk_config { > /* Get device ID command */ > #define VIRTIO_BLK_T_GET_ID 8 > > +/* Discard command */ > +#define VIRTIO_BLK_T_DISCARD 11 > + > +/* Write zeroes command */ > +#define VIRTIO_BLK_T_WRITE_ZEROES 13 > + > #ifndef VIRTIO_BLK_NO_LEGACY > /* Barrier before this op. */ > #define VIRTIO_BLK_T_BARRIER 0x80000000 > @@ -133,6 +174,19 @@ struct virtio_blk_outhdr { > __virtio64 sector; > }; > > +/* Unmap this range (only valid for write zeroes command) */ > +#define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP 0x00000001 > + > +/* Discard/write zeroes range for each request. */ > +struct virtio_blk_discard_write_zeroes { > + /* discard/write zeroes start sector */ > + __virtio64 sector; > + /* number of discard/write zeroes sectors */ > + __virtio32 num_sectors; > + /* flags for this range */ > + __virtio32 flags; > +}; > + > #ifndef VIRTIO_BLK_NO_LEGACY > struct virtio_scsi_inhdr { > __virtio32 errors; > -- > 2.19.0.605.g01d371f741-goog > Thanks, Ming