On Tue, 2008-08-05 at 13:42 +0200, Jens Axboe wrote:
<some stuff>

Incremental patch below; I'll commit it to a git tree in discrete
patches, and publish those.

You can just pass NULL as your end_io function to blkdev_issue_discard()
and it'll set it to bio_put() for you.

We'll probably want to make the elevator capable of merging discard
requests -- and even letting them cross real read/write requests for
other sectors, and just dropping them if there's a subsequent write
request covering the same region, etc.

diff -u b/block/blk-barrier.c b/block/blk-barrier.c
--- b/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -320,17 +321,16 @@
  * @bdev:	blockdev to issue discard for
  * @sector:	start sector
  * @nr_sects:	number of sectors to discard
- * @error_sector: error sector
+ * @end_io:	end_io function (or NULL)
  *
  * Description:
- *    Issue a discard request for the sectors in question. Caller can supply
- *    room for storing the error offset in case of a discard error, if they
- *    wish to.
+ *    Issue a discard request for the sectors in question. Caller can pass an
+ *    end_io function (which must call bio_put()), if they really want to see
+ *    the outcome. Most callers probably won't, and can just pass NULL.
  */
 int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-			 unsigned nr_sects, sector_t *error_sector)
+			 unsigned nr_sects, bio_end_io_t end_io)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct request_queue *q;
 	struct bio *bio;
 	int ret;
@@ -343,33 +343,24 @@
 		return -ENXIO;
 
+	/* Many callers won't care at all about the outcome. After all,
+	   this is just a hint to the underlying device. They'll just
+	   ignore errors completely. So the end_io function can be just
+	   a call to bio_put() */
+	if (!end_io)
+		end_io = (void *)&bio_put;
+
+	if (!q->discard_fn)
+		return -EOPNOTSUPP;
+
 	bio = bio_alloc(GFP_KERNEL, 0);
 	if (!bio)
 		return -ENOMEM;
 
-	bio->bi_end_io = bio_end_empty_barrier;
-	bio->bi_private = &wait;
+	bio->bi_end_io = end_io;
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
 	bio->bi_size = nr_sects << 9;
 
 	submit_bio(1 << BIO_RW_DISCARD, bio);
-
-	wait_for_completion(&wait);
-
-	/*
-	 * The driver must store the error location in ->bi_sector, if
-	 * it supports it. For non-stacked drivers, this should be copied
-	 * from rq->sector.
-	 */
-	if (error_sector)
-		*error_sector = bio->bi_sector;
-
-	ret = 0;
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	else if (!bio_flagged(bio, BIO_UPTODATE))
-		ret = -EIO;
-
-	bio_put(bio);
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(blkdev_issue_discard);
diff -u b/block/blk-core.c b/block/blk-core.c
--- b/block/blk-core.c
+++ b/block/blk-core.c
@@ -1502,10 +1501,9 @@
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
 	 */
-	if (!bio_empty_barrier(bio) && !bio_discard(bio)) {
+	if (!bio_dataless(bio)) {
 		BIO_BUG_ON(!bio->bi_size);
-		BIO_BUG_ON(!bio->bi_io_vec);
 
 		if (rw & WRITE) {
 			count_vm_events(PGPGOUT, count);
@@ -1576,12 +1574,13 @@
 		int nbytes;
 
 		/*
-		 * For an empty barrier request or sector discard request, the
-		 * low level driver must store a potential error location in
-		 * ->sector. We pass that back up in ->bi_sector.
+		 * For an empty barrier request, the low level driver must
+		 * store a potential error location in ->sector. We pass
+		 * that back up in ->bi_sector.
 		 */
-		if (blk_empty_barrier(req) || blk_discard_rq(req))
+		if (blk_empty_barrier(req))
 			bio->bi_sector = req->sector;
+
 		if (nr_bytes >= bio->bi_size) {
 			req->bio = bio->bi_next;
 			nbytes = bio->bi_size;
diff -u b/fs/fat/fatent.c b/fs/fat/fatent.c
--- b/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -528,15 +528,6 @@
 	return err;
 }
 
-static void fat_discard_cluster(struct super_block *sb, int cluster)
-{
-	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct block_device *bdev = sb->s_bdev;
-	unsigned long sector = fat_clus_to_blknr(sbi, cluster);
-
-	blkdev_issue_discard(bdev, sector, sbi->sec_per_clus, NULL);
-}
-
 int fat_free_clusters(struct inode *inode, int cluster)
 {
 	struct super_block *sb = inode->i_sb;
@@ -550,8 +541,11 @@
 	fatent_init(&fatent);
 	lock_fat(sbi);
 	do {
-		fat_discard_cluster(sb, cluster);
-
+		/* Issue discard for the sectors we no longer care about */
+		blkdev_issue_discard(sb->s_bdev,
+				     fat_clus_to_blknr(sbi, cluster),
+				     sbi->sec_per_clus, NULL);
+
 		cluster = fat_ent_read(inode, &fatent, cluster);
 		if (cluster < 0) {
 			err = cluster;
diff -u b/include/linux/bio.h b/include/linux/bio.h
--- b/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -191,6 +191,8 @@
 #define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
 
+#define bio_dataless(bio)	(!(bio)->bi_io_vec)
+
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
 	if (unlikely(bio_discard(bio)))
diff -u b/include/linux/blkdev.h b/include/linux/blkdev.h
--- b/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -546,6 +546,7 @@
 #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
+#define blk_dataless_rq(rq)	(!(rq)->buffer)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 
@@ -836,7 +837,7 @@
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
-extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t, unsigned, bio_end_io_t *);
 
 /*
  * command filter functions

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@xxxxxxxxx                              Intel Corporation
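
For reference, a minimal caller-side sketch against the new prototype.
This is illustrative only, not part of the patch: fs_trim_range(),
fs_trim_range_verbose() and fs_discard_end_io() are made-up names, and
it assumes the bio_end_io_t signature in this kernel, i.e.
void (*)(struct bio *, int).

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

/* Fire-and-forget: pass NULL and blkdev_issue_discard() substitutes
 * bio_put() as the end_io, so completion errors are silently ignored. */
static void fs_trim_range(struct super_block *sb, sector_t start,
			  unsigned nr_sects)
{
	blkdev_issue_discard(sb->s_bdev, start, nr_sects, NULL);
}

/* A caller that wants to see the outcome supplies its own end_io,
 * which must drop the bio reference itself. */
static void fs_discard_end_io(struct bio *bio, int err)
{
	if (err)
		printk(KERN_DEBUG "discard failed: %d\n", err);
	bio_put(bio);
}

static int fs_trim_range_verbose(struct super_block *sb, sector_t start,
				 unsigned nr_sects)
{
	return blkdev_issue_discard(sb->s_bdev, start, nr_sects,
				    fs_discard_end_io);
}

Note that the synchronous return value now only covers submission
(-EOPNOTSUPP when the queue has no discard_fn, -ENOMEM if the bio can't
be allocated); actual I/O errors are only visible to callers that
supply their own end_io.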