Re: [PATCH 1/5] [BLOCK] Add 'discard' request handling

Jens Axboe <jens.axboe@xxxxxxxxxx> · Thu, 7 Aug 2008 20:41:56 +0200

On Wed, Aug 06 2008, David Woodhouse wrote:
> Some block devices benefit from a hint that they can forget the contents
> of certain sectors. Add basic support for this to the block core, along
> with a 'blkdev_issue_discard()' helper function which issues such
> requests.
> 
> Although blkdev_issue_discard() can take an end_io function, it's
> acceptable to leave that as NULL and in that case the allocated bio will
> be automatically freed. Most of the time, it's expected that callers
> won't care about when, or even _if_, the request completes. It's only a
> hint to the device anyway. By definition, the file system doesn't _care_
> about these sectors any more.

I've queued this up for some testing, I'll add the merge support as
well.

> Signed-off-by: David Woodhouse <David.Woodhouse@xxxxxxxxx>
> ---
>  block/blk-barrier.c    |   49 ++++++++++++++++++++++++++++++++++++++++++++++++
>  block/blk-core.c       |   25 ++++++++++++++++++-----
>  block/blk-settings.c   |   16 +++++++++++++++
>  block/elevator.c       |    2 +-
>  include/linux/bio.h    |    9 ++++++++
>  include/linux/blkdev.h |   10 +++++++++
>  6 files changed, 104 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-barrier.c b/block/blk-barrier.c
> index a09ead1..29e60ff 100644
> --- a/block/blk-barrier.c
> +++ b/block/blk-barrier.c

Not sure why you are placing it here, it should probably just go into
blk-core.c

> @@ -315,3 +315,52 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
>  	return ret;
>  }
>  EXPORT_SYMBOL(blkdev_issue_flush);
> +
> +/**
> + * blkdev_issue_discard - queue a discard
> + * @bdev:	blockdev to issue discard for
> + * @sector:	start sector
> + * @nr_sects:	number of sectors to discard
> + * @end_io:	end_io function (or %NULL)
> + *
> + * Description:
> + *    Issue a discard request for the sectors in question. Caller can pass an
> + *    end_io function (which must call bio_put()), if they really want to see
> + *    the outcome. Most callers probably won't, and can just pass %NULL.
> + */
> +int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
> +			 unsigned nr_sects, bio_end_io_t end_io)
> +{
> +	struct request_queue *q;
> +	struct bio *bio;
> +	int ret;
> +
> +	if (bdev->bd_disk == NULL)
> +		return -ENXIO;
> +
> +	q = bdev_get_queue(bdev);
> +	if (!q)
> +		return -ENXIO;
> +
> +	/* Many callers won't care at all about the outcome. After all,
> +	   this is just a hint to the underlying device. They'll just
> +	   ignore errors completely. So the end_io function can be just
> +	   a call to bio_put() */
> +	if (!end_io)
> +		end_io = (void *)&bio_put;

Please don't do that, that's pretty ugly.

> +
> +	if (!q->discard_fn)
> +		return -EOPNOTSUPP;
> +
> +	bio = bio_alloc(GFP_KERNEL, 0);
> +	if (!bio)
> +		return -ENOMEM;
> +
> +	bio->bi_end_io = end_io;
> +	bio->bi_bdev = bdev;
> +	bio->bi_sector = sector;
> +	bio->bi_size = nr_sects << 9;
> +	submit_bio(1 << BIO_RW_DISCARD, bio);
> +	return 0;
> +}
> +EXPORT_SYMBOL(blkdev_issue_discard);
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 4889eb8..0cb993a 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1081,6 +1081,10 @@ void init_request_from_bio(struct request *req, struct bio *bio)
>  	 */
>  	if (unlikely(bio_barrier(bio)))
>  		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
> +	if (unlikely(bio_discard(bio))) {
> +		req->cmd_flags |= (REQ_SOFTBARRIER | REQ_DISCARD | REQ_NOMERGE);
> +		req->q->discard_fn(req->q, req);
> +	}

What is the point of putting that request init into a new queue
strategy? Even if you wanted to do it in the driver, the driver should
just do it in its prep function.

>  	if (bio_sync(bio))
>  		req->cmd_flags |= REQ_RW_SYNC;
> @@ -1097,7 +1101,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
>  static int __make_request(struct request_queue *q, struct bio *bio)
>  {
>  	struct request *req;
> -	int el_ret, nr_sectors, barrier, err;
> +	int el_ret, nr_sectors, barrier, discard, err;
>  	const unsigned short prio = bio_prio(bio);
>  	const int sync = bio_sync(bio);
>  	int rw_flags;
> @@ -1117,9 +1121,15 @@ static int __make_request(struct request_queue *q, struct bio *bio)
>  		goto end_io;
>  	}
>  
> +	discard = bio_discard(bio);
> +	if (unlikely(discard) && !q->discard_fn) {
> +		err = -EOPNOTSUPP;
> +		goto end_io;
> +	}
> +
>  	spin_lock_irq(q->queue_lock);
>  
> -	if (unlikely(barrier) || elv_queue_empty(q))
> +	if (unlikely(barrier) || unlikely(discard) || elv_queue_empty(q))
>  		goto get_rq;
>  
>  	el_ret = elv_merge(q, &req, bio);
> @@ -1411,6 +1421,10 @@ end_io:
>  			err = -EOPNOTSUPP;
>  			goto end_io;
>  		}
> +		if (bio_discard(bio) && !q->discard_fn) {
> +			err = -EOPNOTSUPP;
> +			goto end_io;
> +		}
>  
>  		ret = q->make_request_fn(q, bio);
>  	} while (ret);
> @@ -1488,10 +1502,9 @@ void submit_bio(int rw, struct bio *bio)
>  	 * If it's a regular read/write or a barrier with data attached,
>  	 * go through the normal accounting stuff before submission.
>  	 */
> -	if (!bio_empty_barrier(bio)) {
> +	if (!bio_dataless(bio)) {
>  
>  		BIO_BUG_ON(!bio->bi_size);
> -		BIO_BUG_ON(!bio->bi_io_vec);
>  
>  		if (rw & WRITE) {
>  			count_vm_events(PGPGOUT, count);

Zach suggested bio_has_data() instead, which I agree is a lot more
readable.

> @@ -1886,7 +1899,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
>  	struct request_queue *q = rq->q;
>  	unsigned long flags = 0UL;
>  
> -	if (blk_fs_request(rq) || blk_pc_request(rq)) {
> +	if (blk_fs_request(rq) || blk_pc_request(rq) || blk_discard_rq(rq)) {
>  		if (__end_that_request_first(rq, error, nr_bytes))
>  			return 1;
>  

Striking my last comment on this, this really just checks whether we
have a bio to complete or not. So a check for that would probably be
better, but lets leave that for later. For now this is fine.

> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index dfc7701..6991af0 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -33,6 +33,22 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
>  EXPORT_SYMBOL(blk_queue_prep_rq);
>  
>  /**
> + * blk_queue_discard - set a discard_sectors function for queue
> + * @q:		queue
> + * @dfn:	discard function
> + *
> + * It's possible for a queue to register a discard callback which is used
> + * to queue a request to discard (or "trim") sectors whose content is no
> + * longer required, if the underlying device supports such an action.
> + *
> + */
> +void blk_queue_discard(struct request_queue *q, discard_fn *dfn)
> +{
> +	q->discard_fn = dfn;
> +}
> +EXPORT_SYMBOL(blk_queue_discard);
> +
> +/**
>   * blk_queue_merge_bvec - set a merge_bvec function for queue
>   * @q:		queue
>   * @mbfn:	merge_bvec_fn
> diff --git a/block/elevator.c b/block/elevator.c
> index ed6f8f3..bb26424 100644
> --- a/block/elevator.c
> +++ b/block/elevator.c
> @@ -675,7 +675,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
>  	if (q->ordcolor)
>  		rq->cmd_flags |= REQ_ORDERED_COLOR;
>  
> -	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
> +	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_DISCARD)) {
>  		/*
>  		 * toggle ordered color
>  		 */

Just make REQ_DISCARD set REQ_SOFTBARRIER? Do you care if this acts as a
barrier or not?

> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 0933a14..67cb2ee 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -149,6 +149,8 @@ struct bio {
>   * bit 2 -- barrier
>   * bit 3 -- fail fast, don't want low level driver retries
>   * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
> + * bit 5 --
> + * bit 6 -- discard sectors (trim)
>   */
>  #define BIO_RW		0
>  #define BIO_RW_AHEAD	1
> @@ -156,6 +158,7 @@ struct bio {
>  #define BIO_RW_FAILFAST	3
>  #define BIO_RW_SYNC	4
>  #define BIO_RW_META	5
> +#define BIO_RW_DISCARD	6
>  
>  /*
>   * upper 16 bits of bi_rw define the io priority of this bio
> @@ -185,10 +188,16 @@ struct bio {
>  #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
>  #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
>  #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
> +#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
>  #define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)
>  
> +#define bio_dataless(bio)	(!(bio)->bi_io_vec)
> +
>  static inline unsigned int bio_cur_sectors(struct bio *bio)
>  {
> +	if (unlikely(bio_discard(bio)))
> +		return bio->bi_size >> 9;
> +
>  	if (bio->bi_vcnt)
>  		return bio_iovec(bio)->bv_len >> 9;

Just make that

        if (bio->bi_vcnt)
  		return bio_iovec(bio)->bv_len >> 9;
        else
                return bio->bi_size >> 9;

and add a comment about it.

> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index e61f22b..9e3adc4 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -55,6 +55,7 @@ enum rq_cmd_type_bits {
>  	REQ_TYPE_PM_RESUME,		/* resume request */
>  	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
>  	REQ_TYPE_FLUSH,			/* flush request */
> +	REQ_TYPE_DISCARD,		/* discard sectors request */
>  	REQ_TYPE_SPECIAL,		/* driver defined type */
>  	REQ_TYPE_LINUX_BLOCK,		/* generic block layer message */
>  	/*
> @@ -89,6 +90,7 @@ enum {
>  enum rq_flag_bits {
>  	__REQ_RW,		/* not set, read. set, write */
>  	__REQ_FAILFAST,		/* no low level driver retries */
> +	__REQ_DISCARD,		/* request to discard sectors */
>  	__REQ_SORTED,		/* elevator knows about this request */
>  	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
>  	__REQ_HARDBARRIER,	/* may not be passed by drive either */
> @@ -111,6 +113,7 @@ enum rq_flag_bits {
>  };
>  
>  #define REQ_RW		(1 << __REQ_RW)
> +#define REQ_DISCARD	(1 << __REQ_DISCARD)
>  #define REQ_FAILFAST	(1 << __REQ_FAILFAST)
>  #define REQ_SORTED	(1 << __REQ_SORTED)
>  #define REQ_SOFTBARRIER	(1 << __REQ_SOFTBARRIER)
> @@ -252,6 +255,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
>  typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
>  typedef int (prep_rq_fn) (struct request_queue *, struct request *);
>  typedef void (unplug_fn) (struct request_queue *);
> +typedef int (discard_fn) (struct request_queue *, struct request *);
>  
>  struct bio_vec;
>  struct bvec_merge_data {
> @@ -298,6 +302,7 @@ struct request_queue
>  	make_request_fn		*make_request_fn;
>  	prep_rq_fn		*prep_rq_fn;
>  	unplug_fn		*unplug_fn;
> +	discard_fn		*discard_fn;
>  	merge_bvec_fn		*merge_bvec_fn;
>  	prepare_flush_fn	*prepare_flush_fn;
>  	softirq_done_fn		*softirq_done_fn;
> @@ -536,10 +541,12 @@ enum {
>  #define blk_sorted_rq(rq)	((rq)->cmd_flags & REQ_SORTED)
>  #define blk_barrier_rq(rq)	((rq)->cmd_flags & REQ_HARDBARRIER)
>  #define blk_fua_rq(rq)		((rq)->cmd_flags & REQ_FUA)
> +#define blk_discard_rq(rq)	((rq)->cmd_flags & REQ_DISCARD)
>  #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
>  #define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
>  /* rq->queuelist of dequeued request must be list_empty() */
>  #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
> +#define blk_dataless_rq(rq)	(!(rq)->buffer)
>  
>  #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
>  
> @@ -786,6 +793,7 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
>  extern void blk_queue_dma_alignment(struct request_queue *, int);
>  extern void blk_queue_update_dma_alignment(struct request_queue *, int);
>  extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
> +extern void blk_queue_discard(struct request_queue *, discard_fn *);
>  extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
>  extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
>  extern int blk_do_ordered(struct request_queue *, struct request **);
> @@ -829,6 +837,8 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
>  }
>  
>  extern int blkdev_issue_flush(struct block_device *, sector_t *);
> +extern int blkdev_issue_discard(struct block_device *, sector_t sector,
> +				unsigned nr_sects, bio_end_io_t *);
>  
>  /*
>  * command filter functions
> -- 
> 1.5.5.1
> 
> 
> -- 
> David Woodhouse                            Open Source Technology Centre
> David.Woodhouse@xxxxxxxxx                              Intel Corporation
> 
> 
> 

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html