On Tue, 2008-08-05 at 13:42 +0200, Jens Axboe wrote:
<some stuff>

Incremental patch below; I'll commit it to a git tree in discrete
patches, and publish those.

You can just pass NULL as your end_io function to blkdev_issue_discard()
and it'll set it to bio_put() for you.

We'll probably want to make the elevator capable of merging discard
requests -- and even letting them cross real read/write requests for
other sectors, and just dropping them if there's a subsequent write
request covering the same region, etc.

diff -u b/block/blk-barrier.c b/block/blk-barrier.c
--- b/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -320,17 +321,16 @@
  * @bdev:	blockdev to issue discard for
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
- * @error_sector: error sector
+ * @end_io:	end_io function (or NULL)
  *
  * Description:
- *    Issue a discard request for the sectors in question. Caller can supply
- *    room for storing the error offset in case of a discard error, if they
- *    wish to.
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass NULL.
  */
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 unsigned nr_sects, sector_t *error_sector)
+			 unsigned nr_sects, bio_end_io_t end_io)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
 	int ret;
@@ -343,33 +343,24 @@
 		return -ENXIO;
 
+	/* Many callers won't care at all about the outcome. After all,
+	   this is just a hint to the underlying device. They'll just
+	   ignore errors completely. So the end_io function can be just
+	   a call to bio_put() */
+	if (!end_io)
+		end_io = (void *)&bio_put;
+
+	if (!q->discard_fn)
+		return -EOPNOTSUPP;
+
 	bio = bio_alloc(GFP_KERNEL, 0);
 	if (!bio)
 		return -ENOMEM;
 
-	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
+	bio->bi_end_io = end_io;
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
 	bio->bi_size = nr_sects << 9;
 
 	submit_bio(1 << BIO_RW_DISCARD, bio);
-
-	wait_for_completion(&wait);
-
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from rq->sector.
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
-
-	ret = 0;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
-		ret = -EIO;
-
-	bio_put(bio);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
diff -u b/block/blk-core.c b/block/blk-core.c
--- b/block/blk-core.c
+++ b/block/blk-core.c
@@ -1502,10 +1501,9 @@
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio) && !bio_discard(bio)) {
+	if (!bio_dataless(bio)) {
 		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
@@ -1576,12 +1574,13 @@
 		int nbytes;
 
 		/*
-		 * For an empty barrier request or sector discard request, the
-		 * low level driver must store a potential error location in
-		 * ->sector. We pass that back up in ->bi_sector.
+		 * For an empty barrier request, the low level driver must
+		 * store a potential error location in ->sector. We pass
+		 * that back up in ->bi_sector.
 		 */
-		if (blk_empty_barrier(req) || blk_discard_rq(req))
+		if (blk_empty_barrier(req))
 			bio->bi_sector = req->sector;
+
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
diff -u b/fs/fat/fatent.c b/fs/fat/fatent.c
--- b/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -528,15 +528,6 @@
 	return err;
 }
 
-static void fat_discard_cluster(struct super_block *sb, int cluster)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct block_device *bdev = sb->s_bdev;
-	unsigned long sector = fat_clus_to_blknr(sbi, cluster);
-
-	blkdev_issue_discard(bdev, sector, sbi->sec_per_clus, NULL);
-}
-
 int fat_free_clusters(struct inode *inode, int cluster)
 {
 	struct super_block *sb = inode->i_sb;
@@ -550,8 +541,11 @@
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
-		fat_discard_cluster(sb, cluster);
-
+		/* Issue discard for the sectors we no longer care about */
+		blkdev_issue_discard(sb->s_bdev,
+				     fat_clus_to_blknr(sbi, cluster),
+				     sbi->sec_per_clus, NULL);
+
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
diff -u b/include/linux/bio.h b/include/linux/bio.h
--- b/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -191,6 +191,8 @@
 #define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
+#define bio_dataless(bio)	(!(bio)->bi_io_vec)
+
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
 	if (unlikely(bio_discard(bio)))
diff -u b/include/linux/blkdev.h b/include/linux/blkdev.h
--- b/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
+#define blk_dataless_rq(rq)	(!(rq)->buffer)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -836,7 +837,7 @@
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, bio_end_io_t *);
 
 /*
  * command filter functions

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@xxxxxxxxx                              Intel Corporation
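
For reference, a minimal caller-side sketch against the new prototype.
This is illustrative only, not part of the patch: fs_trim_range(),
fs_trim_range_verbose() and fs_discard_end_io() are made-up names, and
it assumes the bio_end_io_t signature in this kernel, i.e.
void (*)(struct bio *, int).

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Fire-and-forget: pass NULL and blkdev_issue_discard() substitutes
 * bio_put() as the end_io, so completion errors are silently ignored. */
static void fs_trim_range(struct super_block *sb, sector_t start,
			  unsigned nr_sects)
{
	blkdev_issue_discard(sb->s_bdev, start, nr_sects, NULL);
}

/* A caller that wants to see the outcome supplies its own end_io,
 * which must drop the bio reference itself. */
static void fs_discard_end_io(struct bio *bio, int err)
{
	if (err)
		printk(KERN_DEBUG "discard failed: %d\n", err);
	bio_put(bio);
}

static int fs_trim_range_verbose(struct super_block *sb, sector_t start,
				 unsigned nr_sects)
{
	return blkdev_issue_discard(sb->s_bdev, start, nr_sects,
				    fs_discard_end_io);
}

Note that the synchronous return value now only covers submission
(-EOPNOTSUPP when the queue has no discard_fn, -ENOMEM if the bio can't
be allocated); actual I/O errors are only visible to callers that
supply their own end_io.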