Some block devices benefit from a hint that they can forget the contents of certain sectors. Add basic support for this to the block core, along with a 'blkdev_issue_discard()' helper function which issues such requests. Although blkdev_issue_discard() can take an end_io function, it's acceptable to leave that as NULL and in that case the allocated bio will be automatically freed. Most of the time, it's expected that callers won't care about when, or even _if_, the request completes. It's only a hint to the device anyway. By definition, the file system doesn't _care_ about these sectors any more. Signed-off-by: David Woodhouse <David.Woodhouse@xxxxxxxxx> --- block/blk-barrier.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-core.c | 25 ++++++++++++++++++----- block/blk-settings.c | 16 +++++++++++++++ block/elevator.c | 2 +- include/linux/bio.h | 9 ++++++++ include/linux/blkdev.h | 10 +++++++++ 6 files changed, 104 insertions(+), 7 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index a09ead1..29e60ff 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @end_io: end_io function (or %NULL) + * + * Description: + * Issue a discard request for the sectors in question. Caller can pass an + * end_io function (which must call bio_put()), if they really want to see + * the outcome. Most callers probably won't, and can just pass %NULL. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + unsigned nr_sects, bio_end_io_t end_io) +{ + struct request_queue *q; + struct bio *bio; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + /* Many callers won't care at all about the outcome. After all, + this is just a hint to the underlying device. They'll just + ignore errors completely. So the end_io function can be just + a call to bio_put() */ + if (!end_io) + end_io = (void *)&bio_put; + + if (!q->discard_fn) + return -EOPNOTSUPP; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = end_io; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio->bi_size = nr_sects << 9; + submit_bio(1 << BIO_RW_DISCARD, bio); + return 0; +} +EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-core.c b/block/blk-core.c index 4889eb8..0cb993a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ if (unlikely(bio_barrier(bio))) req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); + if (unlikely(bio_discard(bio))) { + req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE); + req->q->discard_fn(req->q, req); + } if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; @@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; - int el_ret, nr_sectors, barrier, err; + int el_ret, nr_sectors, barrier, discard, err; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); int rw_flags; @@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio) goto end_io; } + discard = bio_discard(bio); + if (unlikely(discard) && !q->discard_fn) { + err = -EOPNOTSUPP; + goto end_io; + } + spin_lock_irq(q->queue_lock); - if (unlikely(barrier) || elv_queue_empty(q)) + if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1411,6 +1421,10 @@ end_io: err = -EOPNOTSUPP; goto end_io; } + if (bio_discard(bio) && !q->discard_fn) { + err = -EOPNOTSUPP; + goto end_io; + } ret = q->make_request_fn(q, bio); } while (ret); @@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ - if (!bio_empty_barrier(bio)) { + if (!bio_dataless(bio)) { BIO_BUG_ON(!bio->bi_size); - BIO_BUG_ON(!bio->bi_io_vec); if (rw & WRITE) { count_vm_events(PGPGOUT, count); @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, struct request_queue *q = rq->q; unsigned long flags = 0UL; - if (blk_fs_request(rq) || blk_pc_request(rq)) { + if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) { if (__end_that_request_first(rq, error, nr_bytes)) return 1; @@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request); **/ int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - if (blk_fs_request(rq) || blk_pc_request(rq)) { + if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) { if (__end_that_request_first(rq, error, nr_bytes)) return 1; } diff --git a/block/blk-settings.c b/block/blk-settings.c index dfc7701..6991af0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) EXPORT_SYMBOL(blk_queue_prep_rq); /** + * blk_queue_discard - set a discard_sectors function for queue + * @q: queue + * @dfn: discard function + * + * It's possible for a queue to register a discard callback which is used + * to queue a request to discard (or "trim") sectors whose content is no + * longer required, if the underlying device supports such an action. + * + */ +void blk_queue_discard(struct request_queue *q, discard_fn *dfn) +{ + q->discard_fn = dfn; +} +EXPORT_SYMBOL(blk_queue_discard); + +/** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue * @mbfn: merge_bvec_fn diff --git a/block/elevator.c b/block/elevator.c index ed6f8f3..bb26424 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, if (q->ordcolor) rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) { /* * toggle ordered color */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 0933a14..67cb2ee 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -149,6 +149,8 @@ struct bio { * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately + * bit 5 -- + * bit 6 -- discard sectors (trim) */ #define BIO_RW 0 #define BIO_RW_AHEAD 1 @@ -156,6 +158,7 @@ struct bio { #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 #define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 /* * upper 16 bits of bi_rw define the io priority of this bio @@ -185,10 +188,16 @@ struct bio { #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size) +#define bio_dataless(bio) (!(bio)->bi_io_vec) + static inline unsigned int bio_cur_sectors(struct bio *bio) { + if (unlikely(bio_discard(bio))) + return bio->bi_size >> 9; + if (bio->bi_vcnt) return bio_iovec(bio)->bv_len >> 9; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e61f22b..9e3adc4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -55,6 +55,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_FLUSH, /* flush request */ + REQ_TYPE_DISCARD, /* discard sectors request */ REQ_TYPE_SPECIAL, /* driver defined type */ REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ /* @@ -89,6 +90,7 @@ enum { enum rq_flag_bits { __REQ_RW, /* not set, read. set, write */ __REQ_FAILFAST, /* no low level driver retries */ + __REQ_DISCARD, /* request to discard sectors */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ __REQ_HARDBARRIER, /* may not be passed by drive either */ @@ -111,6 +113,7 @@ enum rq_flag_bits { }; #define REQ_RW (1 << __REQ_RW) +#define REQ_DISCARD (1 << __REQ_DISCARD) #define REQ_FAILFAST (1 << __REQ_FAILFAST) #define REQ_SORTED (1 << __REQ_SORTED) #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) @@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); +typedef int (discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -298,6 +302,7 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; + discard_fn *discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -536,10 +541,12 @@ enum { #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) +#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) +#define blk_dataless_rq(rq) (!(rq)->buffer) #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) @@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); +extern void blk_queue_discard(struct request_queue *, discard_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); extern int blk_do_ordered(struct request_queue *, struct request **); @@ -829,6 +837,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); +extern int blkdev_issue_discard(struct block_device *, sector_t sector, + unsigned nr_sects, bio_end_io_t *); /* * command filter functions -- 1.5.5.1 -- David Woodhouse Open Source Technology Centre David.Woodhouse@xxxxxxxxx Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html