On Sat, 2008-07-26 at 13:02 -0700, Andrew Morton wrote:
> I seem to be hearing a lot of silence over support for SSD devices. I
> have this vague worry that there will be a large rollout of SSD
> hardware and Linux will be found to have pants-around-ankles.
>
> For example, support for the T13 Trim command. There will be others,
> but I surely don't know what they are.

Here's a first attempt at that. It implements basic support for 'discard' requests in the block layer, implements that in FTL (the flash translation layer used on PCMCIA flash cards), and in FAT.

It doesn't yet do any merging of discard requests, and everything in block/ needs careful review -- I don't know that code well at all. But it seems to pass at least basic testing:

    modprobe mtdram
    modprobe mtdchar
    ftl_format /dev/mtd0
    modprobe ftl
    mkfs.msdos /dev/ftla
    mount /dev/ftla /mnt/test
    cp /etc/services /mnt/test
    rm /mnt/test/services

... and (with appropriate debugging added) you can see the sectors being thrown away. And 'od -t x1 /dev/ftla' will confirm that.

diff --git a/block/blk-barrier.c b/block/blk-barrier.c index a09ead1..e0ac1c8 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -258,7 +258,6 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - complete(bio->bi_private); } @@ -315,3 +314,62 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @error_sector: error sector + * + * Description: + * Issue a discard request for the sectors in question. Caller can supply + * room for storing the error offset in case of a discard error, if they + * wish to. 
+ */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + unsigned nr_sects, sector_t *error_sector) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + bio = bio_alloc(GFP_KERNEL, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = bio_end_empty_barrier; + bio->bi_private = &wait; + bio->bi_bdev = bdev; + bio->bi_sector = sector; + bio->bi_size = nr_sects << 9; + submit_bio(1 << BIO_RW_DISCARD, bio); + + wait_for_completion(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be copied + * from rq->sector. + */ + if (error_sector) + *error_sector = bio->bi_sector; + + ret = 0; + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-core.c b/block/blk-core.c index 4889eb8..dc57c33 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -626,6 +626,7 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) * first three bits are identical in rq->cmd_flags and bio->bi_rw, * see bio.h and blkdev.h */ + /* Er, Are they really? 
And REQ_ALLOCED isn't one of them anyway */ rq->cmd_flags = rw | REQ_ALLOCED; if (priv) { @@ -1081,7 +1082,10 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ if (unlikely(bio_barrier(bio))) req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); - + if (unlikely(bio_discard(bio))) { + req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE); + req->q->discard_fn(req->q, req); + } if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; if (bio_rw_meta(bio)) @@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; - int el_ret, nr_sectors, barrier, err; + int el_ret, nr_sectors, barrier, discard, err; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); int rw_flags; @@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio) goto end_io; } + discard = bio_discard(bio); + if (unlikely(discard) && !q->discard_fn) { + err = -EOPNOTSUPP; + goto end_io; + } + spin_lock_irq(q->queue_lock); - if (unlikely(barrier) || elv_queue_empty(q)) + if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1411,6 +1421,10 @@ end_io: err = -EOPNOTSUPP; goto end_io; } + if (bio_discard(bio) && !q->discard_fn) { + err = -EOPNOTSUPP; + goto end_io; + } ret = q->make_request_fn(q, bio); } while (ret); @@ -1488,7 +1502,7 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ - if (!bio_empty_barrier(bio)) { + if (!bio_empty_barrier(bio) && !bio_discard(bio)) { BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); @@ -1562,13 +1576,12 @@ static int __end_that_request_first(struct request *req, int error, int nbytes; /* - * For an empty barrier request, the low level driver must - * store a potential error location in ->sector. 
We pass - * that back up in ->bi_sector. + * For an empty barrier request or sector discard request, the + * low level driver must store a potential error location in + * ->sector. We pass that back up in ->bi_sector. */ - if (blk_empty_barrier(req)) + if (blk_empty_barrier(req) || blk_discard_rq(req)) bio->bi_sector = req->sector; - if (nr_bytes >= bio->bi_size) { req->bio = bio->bi_next; nbytes = bio->bi_size; @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, struct request_queue *q = rq->q; unsigned long flags = 0UL; - if (blk_fs_request(rq) || blk_pc_request(rq)) { + if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) { if (__end_that_request_first(rq, error, nr_bytes)) return 1; @@ -1944,7 +1957,7 @@ EXPORT_SYMBOL_GPL(blk_end_request); **/ int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - if (blk_fs_request(rq) || blk_pc_request(rq)) { + if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) { if (__end_that_request_first(rq, error, nr_bytes)) return 1; } @@ -2015,6 +2028,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ + /* Er, Are they really? */ rq->cmd_flags |= (bio->bi_rw & 3); rq->nr_phys_segments = bio_phys_segments(q, bio); diff --git a/block/blk-settings.c b/block/blk-settings.c index dfc7701..6991af0 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) EXPORT_SYMBOL(blk_queue_prep_rq); /** + * blk_queue_discard - set a discard_sectors function for queue + * @q: queue + * @dfn: discard function + * + * It's possible for a queue to register a discard callback which is used + * to queue a request to discard (or "trim") sectors whose content is no + * longer required, if the underlying device supports such an action. 
+ * + */ +void blk_queue_discard(struct request_queue *q, discard_fn *dfn) +{ + q->discard_fn = dfn; +} +EXPORT_SYMBOL(blk_queue_discard); + +/** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue * @mbfn: merge_bvec_fn diff --git a/block/elevator.c b/block/elevator.c index ed6f8f3..bb26424 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where, if (q->ordcolor) rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { + if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) { /* * toggle ordered color */ diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index f34f20c..0f8d886 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -1005,6 +1005,29 @@ static int ftl_writesect(struct mtd_blktrans_dev *dev, return ftl_write((void *)dev, buf, block, 1); } +static int ftl_discardsect(struct mtd_blktrans_dev *dev, + unsigned long sector, unsigned nr_sects) +{ + partition_t *part = (void *)dev; + uint32_t bsize = 1 << part->header.EraseUnitSize; + + printk("FTL erase sector %ld for %d sectors\n", + sector, nr_sects); + + while (nr_sects) { + uint32_t old_addr = part->VirtualBlockMap[sector]; + if (old_addr != 0xffffffff) { + part->VirtualBlockMap[sector] = 0xffffffff; + part->EUNInfo[old_addr/bsize].Deleted++; + if (set_bam_entry(part, old_addr, 0)) + return -EIO; + } + nr_sects--; + sector++; + } + + return 0; +} /*====================================================================*/ static void ftl_freepart(partition_t *part) @@ -1069,6 +1092,7 @@ static struct mtd_blktrans_ops ftl_tr = { .blksize = SECTOR_SIZE, .readsect = ftl_readsect, .writesect = ftl_writesect, + .discard = ftl_discardsect, .getgeo = ftl_getgeo, .add_mtd = ftl_add_mtd, .remove_dev = ftl_remove_dev, diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 9ff007c..0ef1d4b 100644 --- a/drivers/mtd/mtd_blkdevs.c 
+++ b/drivers/mtd/mtd_blkdevs.c @@ -32,6 +32,13 @@ struct mtd_blkcore_priv { spinlock_t queue_lock; }; +static int blktrans_discard_request(struct request_queue *q, + struct request *req) +{ + req->cmd_type = REQ_TYPE_DISCARD; + return 0; +} + static int do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) @@ -44,6 +51,9 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, buf = req->buffer; + if (req->cmd_type == REQ_TYPE_DISCARD) + return !tr->discard(dev, block, nsect); + if (!blk_fs_request(req)) return 0; @@ -367,6 +377,9 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) tr->blkcore_priv->rq->queuedata = tr; blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize); + if (tr->discard) + blk_queue_discard(tr->blkcore_priv->rq, blktrans_discard_request); + tr->blkshift = ffs(tr->blksize) - 1; tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 302e95c..249e9ec 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/msdos_fs.h> +#include <linux/blkdev.h> struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); @@ -527,6 +528,15 @@ out: return err; } +static void fat_discard_cluster(struct super_block *sb, int cluster) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct block_device *bdev = sb->s_bdev; + unsigned long sector = fat_clus_to_blknr(sbi, cluster); + + blkdev_issue_discard(bdev, sector, sbi->sec_per_clus, NULL); +} + int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; @@ -540,6 +550,8 @@ int fat_free_clusters(struct inode *inode, int cluster) fatent_init(&fatent); lock_fat(sbi); do { + fat_discard_cluster(sb, cluster); + cluster = fat_ent_read(inode, &fatent, cluster); if (cluster < 0) { err = cluster; diff --git a/include/linux/bio.h b/include/linux/bio.h index 
0933a14..5480702 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -149,6 +149,8 @@ struct bio { * bit 2 -- barrier * bit 3 -- fail fast, don't want low level driver retries * bit 4 -- synchronous I/O hint: the block layer will unplug immediately + * bit 5 -- + * bit 6 -- discard sectors (trim) */ #define BIO_RW 0 #define BIO_RW_AHEAD 1 @@ -156,6 +158,7 @@ struct bio { #define BIO_RW_FAILFAST 3 #define BIO_RW_SYNC 4 #define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 /* * upper 16 bits of bi_rw define the io priority of this bio @@ -185,10 +188,14 @@ struct bio { #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) +#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size) static inline unsigned int bio_cur_sectors(struct bio *bio) { + if (unlikely(bio_discard(bio))) + return bio->bi_size >> 9; + if (bio->bi_vcnt) return bio_iovec(bio)->bv_len >> 9; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e61f22b..6f00c37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -55,6 +55,7 @@ enum rq_cmd_type_bits { REQ_TYPE_PM_RESUME, /* resume request */ REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ REQ_TYPE_FLUSH, /* flush request */ + REQ_TYPE_DISCARD, /* discard sectors ('trim') request */ REQ_TYPE_SPECIAL, /* driver defined type */ REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ /* @@ -89,6 +90,7 @@ enum { enum rq_flag_bits { __REQ_RW, /* not set, read. 
set, write */ __REQ_FAILFAST, /* no low level driver retries */ + __REQ_DISCARD, /* request to discard sectors */ __REQ_SORTED, /* elevator knows about this request */ __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ __REQ_HARDBARRIER, /* may not be passed by drive either */ @@ -111,6 +113,7 @@ enum rq_flag_bits { }; #define REQ_RW (1 << __REQ_RW) +#define REQ_DISCARD (1 << __REQ_DISCARD) #define REQ_FAILFAST (1 << __REQ_FAILFAST) #define REQ_SORTED (1 << __REQ_SORTED) #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) @@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); +typedef int (discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -298,6 +302,7 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; + discard_fn *discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -536,6 +541,7 @@ enum { #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) +#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ @@ -786,6 +792,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); +extern void blk_queue_discard(struct request_queue *, 
discard_fn *); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); extern int blk_do_ordered(struct request_queue *, struct request **); @@ -829,6 +836,7 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, } extern int blkdev_issue_flush(struct block_device *, sector_t *); +extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, sector_t *); /* * command filter functions diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index 310e616..8b4aa05 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -41,6 +41,8 @@ struct mtd_blktrans_ops { unsigned long block, char *buffer); int (*writesect)(struct mtd_blktrans_dev *dev, unsigned long block, char *buffer); + int (*discard)(struct mtd_blktrans_dev *dev, + unsigned long block, unsigned nr_blocks); /* Block layer ioctls */ int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo); -- David Woodhouse Open Source Technology Centre David.Woodhouse@xxxxxxxxx Intel Corporation -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html