On Wed, Aug 06 2008, David Woodhouse wrote: > Some block devices benefit from a hint that they can forget the contents > of certain sectors. Add basic support for this to the block core, along > with a 'blkdev_issue_discard()' helper function which issues such > requests. > > Although blkdev_issue_discard() can take an end_io function, it's > acceptable to leave that as NULL and in that case the allocated bio will > be automatically freed. Most of the time, it's expected that callers > won't care about when, or even _if_, the request completes. It's only a > hint to the device anyway. By definition, the file system doesn't _care_ > about these sectors any more. I've queued this up for some testing, I'll add the merge support as well. > Signed-off-by: David Woodhouse <David.Woodhouse@xxxxxxxxx> > --- > block/blk-barrier.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ > block/blk-core.c | 25 ++++++++++++++++++----- > block/blk-settings.c | 16 +++++++++++++++ > block/elevator.c | 2 +- > include/linux/bio.h | 9 ++++++++ > include/linux/blkdev.h | 10 +++++++++ > 6 files changed, 104 insertions(+), 7 deletions(-) > > diff --git a/block/blk-barrier.c b/block/blk-barrier.c > index a09ead1..29e60ff 100644 > --- a/block/blk-barrier.c > +++ b/block/blk-barrier.c Not sure why you are placing it here, it should probably just go into blk-core.c > @@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) > return ret; > } > EXPORT_SYMBOL(blkdev_issue_flush); > + > +/** > + * blkdev_issue_discard - queue a discard > + * @bdev: blockdev to issue discard for > + * @sector: start sector > + * @nr_sects: number of sectors to discard > + * @end_io: end_io function (or %NULL) > + * > + * Description: > + * Issue a discard request for the sectors in question. Caller can pass an > + * end_io function (which must call bio_put()), if they really want to see > + * the outcome. Most callers probably won't, and can just pass %NULL. > + */ > +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, > + unsigned nr_sects, bio_end_io_t end_io) > +{ > + struct request_queue *q; > + struct bio *bio; > + int ret; > + > + if (bdev->bd_disk == NULL) > + return -ENXIO; > + > + q = bdev_get_queue(bdev); > + if (!q) > + return -ENXIO; > + > + /* Many callers won't care at all about the outcome. After all, > + this is just a hint to the underlying device. They'll just > + ignore errors completely. So the end_io function can be just > + a call to bio_put() */ > + if (!end_io) > + end_io = (void *)&bio_put; Please don't do that, that's pretty ugly. > + > + if (!q->discard_fn) > + return -EOPNOTSUPP; > + > + bio = bio_alloc(GFP_KERNEL, 0); > + if (!bio) > + return -ENOMEM; > + > + bio->bi_end_io = end_io; > + bio->bi_bdev = bdev; > + bio->bi_sector = sector; > + bio->bi_size = nr_sects << 9; > + submit_bio(1 << BIO_RW_DISCARD, bio); > + return 0; > +} > +EXPORT_SYMBOL(blkdev_issue_discard); > diff --git a/block/blk-core.c b/block/blk-core.c > index 4889eb8..0cb993a 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio) > */ > if (unlikely(bio_barrier(bio))) > req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); > + if (unlikely(bio_discard(bio))) { > + req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE); > + req->q->discard_fn(req->q, req); > + } What is the point of putting that request init into a new queue strategy? Even if you wanted to do it in the driver, the driver should just do it in its prep function. > if (bio_sync(bio)) > req->cmd_flags |= REQ_RW_SYNC; > @@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) > static int __make_request(struct request_queue *q, struct bio *bio) > { > struct request *req; > - int el_ret, nr_sectors, barrier, err; > + int el_ret, nr_sectors, barrier, discard, err; > const unsigned short prio = bio_prio(bio); > const int sync = bio_sync(bio); > int rw_flags; > @@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio) > goto end_io; > } > > + discard = bio_discard(bio); > + if (unlikely(discard) && !q->discard_fn) { > + err = -EOPNOTSUPP; > + goto end_io; > + } > + > spin_lock_irq(q->queue_lock); > > - if (unlikely(barrier) || elv_queue_empty(q)) > + if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q)) > goto get_rq; > > el_ret = elv_merge(q, &req, bio); > @@ -1411,6 +1421,10 @@ end_io: > err = -EOPNOTSUPP; > goto end_io; > } > + if (bio_discard(bio) && !q->discard_fn) { > + err = -EOPNOTSUPP; > + goto end_io; > + } > > ret = q->make_request_fn(q, bio); > } while (ret); > @@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio) > * If it's a regular read/write or a barrier with data attached, > * go through the normal accounting stuff before submission. > */ > - if (!bio_empty_barrier(bio)) { > + if (!bio_dataless(bio)) { > > BIO_BUG_ON(!bio->bi_size); > - BIO_BUG_ON(!bio->bi_io_vec); > > if (rw & WRITE) { > count_vm_events(PGPGOUT, count); Zach suggested bio_has_data() instead, which I agree is a lot more readable. > @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, > struct request_queue *q = rq->q; > unsigned long flags = 0UL; > > - if (blk_fs_request(rq) || blk_pc_request(rq)) { > + if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) { > if (__end_that_request_first(rq, error, nr_bytes)) > return 1; > Striking my last comment on this, this really just checks whether we have a bio to complete or not. So a check for that would probably be better, but lets leave that for later. For now this is fine. > diff --git a/block/blk-settings.c b/block/blk-settings.c > index dfc7701..6991af0 100644 > --- a/block/blk-settings.c > +++ b/block/blk-settings.c > @@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) > EXPORT_SYMBOL(blk_queue_prep_rq); > > /** > + * blk_queue_discard - set a discard_sectors function for queue > + * @q: queue > + * @dfn: discard function > + * > + * It's possible for a queue to register a discard callback which is used > + * to queue a request to discard (or "trim") sectors whose content is no > + * longer required, if the underlying device supports such an action. > + * > + */ > +void blk_queue_discard(struct request_queue *q, discard_fn *dfn) > +{ > + q->discard_fn = dfn; > +} > +EXPORT_SYMBOL(blk_queue_discard); > + > +/** > * blk_queue_merge_bvec - set a merge_bvec function for queue > * @q: queue > * @mbfn: merge_bvec_fn > diff --git a/block/elevator.c b/block/elevator.c > index ed6f8f3..bb26424 100644 > --- a/block/elevator.c > +++ b/block/elevator.c > @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, > if (q->ordcolor) > rq->cmd_flags |= REQ_ORDERED_COLOR; > > - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { > + if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) { > /* > * toggle ordered color > */ Just make REQ_DISCARD set REQ_SOFTBARRIER? Do you care if this acts as a barrier or not? > diff --git a/include/linux/bio.h b/include/linux/bio.h > index 0933a14..67cb2ee 100644 > --- a/include/linux/bio.h > +++ b/include/linux/bio.h > @@ -149,6 +149,8 @@ struct bio { > * bit 2 -- barrier > * bit 3 -- fail fast, don't want low level driver retries > * bit 4 -- synchronous I/O hint: the block layer will unplug immediately > + * bit 5 -- > + * bit 6 -- discard sectors (trim) > */ > #define BIO_RW 0 > #define BIO_RW_AHEAD 1 > @@ -156,6 +158,7 @@ struct bio { > #define BIO_RW_FAILFAST 3 > #define BIO_RW_SYNC 4 > #define BIO_RW_META 5 > +#define BIO_RW_DISCARD 6 > > /* > * upper 16 bits of bi_rw define the io priority of this bio > @@ -185,10 +188,16 @@ struct bio { > #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) > #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) > #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) > +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) > #define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size) > > +#define bio_dataless(bio) (!(bio)->bi_io_vec) > + > static inline unsigned int bio_cur_sectors(struct bio *bio) > { > + if (unlikely(bio_discard(bio))) > + return bio->bi_size >> 9; > + > if (bio->bi_vcnt) > return bio_iovec(bio)->bv_len >> 9; Just make that if (bio->bi_vcnt) return bio_iovec(bio)->bv_len >> 9; else return bio->bi_size >> 9; and add a comment about it. > diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h > index e61f22b..9e3adc4 100644 > --- a/include/linux/blkdev.h > +++ b/include/linux/blkdev.h > @@ -55,6 +55,7 @@ enum rq_cmd_type_bits { > REQ_TYPE_PM_RESUME, /* resume request */ > REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ > REQ_TYPE_FLUSH, /* flush request */ > + REQ_TYPE_DISCARD, /* discard sectors request */ > REQ_TYPE_SPECIAL, /* driver defined type */ > REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ > /* > @@ -89,6 +90,7 @@ enum { > enum rq_flag_bits { > __REQ_RW, /* not set, read. set, write */ > __REQ_FAILFAST, /* no low level driver retries */ > + __REQ_DISCARD, /* request to discard sectors */ > __REQ_SORTED, /* elevator knows about this request */ > __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ > __REQ_HARDBARRIER, /* may not be passed by drive either */ > @@ -111,6 +113,7 @@ enum rq_flag_bits { > }; > > #define REQ_RW (1 << __REQ_RW) > +#define REQ_DISCARD (1 << __REQ_DISCARD) > #define REQ_FAILFAST (1 << __REQ_FAILFAST) > #define REQ_SORTED (1 << __REQ_SORTED) > #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) > @@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q); > typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); > typedef int (prep_rq_fn) (struct request_queue *, struct request *); > typedef void (unplug_fn) (struct request_queue *); > +typedef int (discard_fn) (struct request_queue *, struct request *); > > struct bio_vec; > struct bvec_merge_data { > @@ -298,6 +302,7 @@ struct request_queue > make_request_fn *make_request_fn; > prep_rq_fn *prep_rq_fn; > unplug_fn *unplug_fn; > + discard_fn *discard_fn; > merge_bvec_fn *merge_bvec_fn; > prepare_flush_fn *prepare_flush_fn; > softirq_done_fn *softirq_done_fn; > @@ -536,10 +541,12 @@ enum { > #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) > #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) > #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) > +#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) > #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) > #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) > /* rq->queuelist of dequeued request must be list_empty() */ > #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) > +#define blk_dataless_rq(rq) (!(rq)->buffer) > > #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) > > @@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); > extern void blk_queue_dma_alignment(struct request_queue *, int); > extern void blk_queue_update_dma_alignment(struct request_queue *, int); > extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); > +extern void blk_queue_discard(struct request_queue *, discard_fn *); > extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); > extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); > extern int blk_do_ordered(struct request_queue *, struct request **); > @@ -829,6 +837,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, > } > > extern int blkdev_issue_flush(struct block_device *, sector_t *); > +extern int blkdev_issue_discard(struct block_device *, sector_t sector, > + unsigned nr_sects, bio_end_io_t *); > > /* > * command filter functions > -- > 1.5.5.1 > > > -- > David Woodhouse Open Source Technology Centre > David.Woodhouse@xxxxxxxxx Intel Corporation > > > -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html