On Fri, Aug 08 2008, David Woodhouse wrote: > On Fri, 2008-08-08 at 13:18 +0200, Jens Axboe wrote: > > Actually, my memory is a bit shot, since I got rid of ->issue_flush_fn > > in the 2.6.24 cycle. So it's already gone > > It was q->prepare_flush_fn which the discard_fn is based on, and that's > still there in the for-2.6.28 tree you just pointed me at. > > http://git.kernel.dk/?p=linux-2.6-block.git;a=blob;f=block/blk-barrier.c;h=a09ead19f9c5702a1ad76d709c54969176fe9e94;hb=6dc2b733b2f9605b48fdb7692fce5a3eafe241e4#l149 Sigh indeed, ->issue_flush_fn() was the actual issuer, not the preparer. Let me send a new diff. This adds the ->prepare_discard_fn() to do the transformation, and also extends blkdev_issue_discard() to return error if the IO was never queued because of some device in the stack not supporting it. Until we have overlap detection, I think we should make the discard request an explicit barrier. Otherwise we could have problems with a discard being passed by a write request and such. diff --git a/block/blk-barrier.c b/block/blk-barrier.c index a09ead1..b2add39 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -315,3 +315,78 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +static void bio_discard_end_io(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + + bio_put(bio); +} + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @end_io: end_io function (or %NULL) + * + * Description: + * Issue a discard request for the sectors in question. Caller can pass an + * end_io function (which must call bio_put()), if they really want to see + * the outcome. Most callers probably won't, and can just pass %NULL. 
+ */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + unsigned nr_sects, bio_end_io_t end_io) +{ + struct request_queue *q; + struct bio *bio; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!q->prepare_discard_fn) + return -EOPNOTSUPP; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + /* + * Many callers won't care at all about the outcome. After all, + * this is just a hint to the underlying device. They'll just + * ignore errors completely. So the end_io function can be just + * a call to bio_put() + */ + if (end_io) + bio->bi_end_io = end_io; + else + bio->bi_end_io = bio_discard_end_io; + + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio->bi_size = nr_sects << 9; + bio_get(bio); + submit_bio(1 << BIO_RW_DISCARD, bio); + + /* + * Check if it was failed before being submitted + */ + ret = 0; + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-core.c b/block/blk-core.c index ff7ec49..ad519c9 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1068,7 +1068,10 @@ EXPORT_SYMBOL(blk_put_request); void init_request_from_bio(struct request *req, struct bio *bio) { - req->cmd_type = REQ_TYPE_FS; + if (bio_discard(bio)) + req->cmd_type = REQ_TYPE_DISCARD; + else + req->cmd_type = REQ_TYPE_FS; /* * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) @@ -1407,7 +1410,8 @@ end_io: if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_empty_barrier(bio) && !q->prepare_flush_fn) { + if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || + (bio_discard(bio) && !q->prepare_discard_fn)) { err = -EOPNOTSUPP; goto end_io; } diff --git a/block/blk-settings.c b/block/blk-settings.c index dfc7701..c783d84 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -32,6 
+32,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) } EXPORT_SYMBOL(blk_queue_prep_rq); + /** + * blk_queue_set_discard - set a discard_sectors function for queue + * @q: queue + * @dfn: discard function + * + * It's possible for a queue to register a discard callback which is used + * to transform a discard request into the appropriate type for the + * hardware. If none is registered, then discard requests are failed + * with EOPNOTSUPP. + * + */ +void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn) +{ + q->prepare_discard_fn = dfn; +} +EXPORT_SYMBOL(blk_queue_set_discard); + /** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue diff --git a/block/elevator.c b/block/elevator.c index ed6f8f3..e704086 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -75,6 +75,13 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) return 0; /* + * Don't merge file system and non-file system requests + */ + if ((!bio_has_data(bio) && bio_has_data(rq->bio)) || + (bio_has_data(bio) && !bio_has_data(rq->bio))) + return 0; + + /* * different data direction or already started, don't merge */ if (bio_data_dir(bio) != rq_data_dir(rq)) @@ -607,7 +614,6 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) break; case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq)); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { diff --git a/include/linux/bio.h b/include/linux/bio.h index dbeb66f..cc678d4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -149,6 +149,8 @@ struct bio { * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately + * bit 5 -- metadata request + * bit 6 -- discard sectors request */ #define BIO_RW 0 #define BIO_RW_AHEAD 1 @@ -156,6 +158,7 @@ struct bio { #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 #define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 /* * 
upper 16 bits of bi_rw define the io priority of this bio @@ -186,13 +189,18 @@ struct bio { #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio)) +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) static inline unsigned int bio_cur_sectors(struct bio *bio) { + /* + * Discard requests don't have a bio_vec list, yet they still + * refer to a range of data. Just use the given bio size in that case. + */ if (bio->bi_vcnt) return bio_iovec(bio)->bv_len >> 9; - - return 0; + else + return bio_sectors(bio); } static inline void *bio_data(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e61f22b..1df6889 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -56,6 +56,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_FLUSH, /* flush request */ REQ_TYPE_SPECIAL, /* driver defined type */ + REQ_TYPE_DISCARD, /* discard sectors request */ REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ /* * for ATA/ATAPI devices. 
this really doesn't belong here, ide should @@ -263,6 +264,7 @@ struct bvec_merge_data { typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, struct bio_vec *); typedef void (prepare_flush_fn) (struct request_queue *, struct request *); +typedef void (prepare_discard_fn) (struct request_queue *, struct request *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); @@ -300,6 +302,7 @@ struct request_queue unplug_fn *unplug_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; + prepare_discard_fn *prepare_discard_fn; softirq_done_fn *softirq_done_fn; dma_drain_needed_fn *dma_drain_needed; @@ -522,6 +525,7 @@ enum { #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL) #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE) +#define blk_discard_request(rq) ((rq)->cmd_type == REQ_TYPE_DISCARD) #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST) #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) @@ -788,6 +792,7 @@ extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); +extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); extern int blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); @@ -829,6 +834,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); +extern int blkdev_issue_discard(struct block_device *, sector_t, + unsigned int, bio_end_io_t *); /* * command 
filter functions -- Jens Axboe -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html