Re: [PATCH 2/2] loop: Better discard support for block devices

Ming Lei <ming.lei@xxxxxxxxxx> · Thu, 6 Dec 2018 08:22:11 +0800

On Wed, Dec 05, 2018 at 11:35:57AM -0800, Evan Green wrote:
> Hi Ming,
> 
> On Tue, Dec 4, 2018 at 5:11 PM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
> >
> > On Tue, Dec 04, 2018 at 02:19:46PM -0800, Evan Green wrote:
> > > Hi Ming,
> > >
> > > On Tue, Nov 27, 2018 at 5:26 PM Ming Lei <ming.lei@xxxxxxxxxx> wrote:
> > > >
> > > > On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> > > > > If the backing device for a loop device is a block device,
> > > > > then mirror the discard properties of the underlying block
> > > > > device into the loop device. While in there, differentiate
> > > > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > > > different for block devices, but which the loop device had
> > > > > just been lumping together.
> > > > >
> > > > > Signed-off-by: Evan Green <evgreen@xxxxxxxxxxxx>
> > > > > ---
> > > > >
> > > > >  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
> > > > >  1 file changed, 41 insertions(+), 20 deletions(-)
> > > > >
> > > > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > > > > index 28990fc94841a..176e65101c4ef 100644
> > > > > --- a/drivers/block/loop.c
> > > > > +++ b/drivers/block/loop.c
> > > > > @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
> > > > >       return ret;
> > > > >  }
> > > > >
> > > > > -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> > > > > +static int lo_discard(struct loop_device *lo, struct request *rq,
> > > > > +             int mode, loff_t pos)
> > > > >  {
> > > > > -     /*
> > > > > -      * We use punch hole to reclaim the free space used by the
> > > > > -      * image a.k.a. discard. However we do not support discard if
> > > > > -      * encryption is enabled, because it may give an attacker
> > > > > -      * useful information.
> > > > > -      */
> > > > >       struct file *file = lo->lo_backing_file;
> > > > > -     int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> > > > > +     struct request_queue *q = lo->lo_queue;
> > > > >       int ret;
> > > > >
> > > > > -     if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > > > +     if (!blk_queue_discard(q)) {
> > > > >               ret = -EOPNOTSUPP;
> > > > >               goto out;
> > > > >       }
> > > > > @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> > > > >       case REQ_OP_FLUSH:
> > > > >               return lo_req_flush(lo, rq);
> > > > >       case REQ_OP_DISCARD:
> > > > > +             return lo_discard(lo, rq,
> > > > > +                     FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> > > > > +
> > > > >       case REQ_OP_WRITE_ZEROES:
> > > > > -             return lo_discard(lo, rq, pos);
> > > > > +             return lo_discard(lo, rq,
> > > > > +                     FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> > > > > +
> > > > >       case REQ_OP_WRITE:
> > > > >               if (lo->transfer)
> > > > >                       return lo_write_transfer(lo, rq, pos);
> > > > > @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
> > > > >       struct file *file = lo->lo_backing_file;
> > > > >       struct inode *inode = file->f_mapping->host;
> > > > >       struct request_queue *q = lo->lo_queue;
> > > > > +     struct request_queue *backingq;
> > > > > +
> > > > > +     /*
> > > > > +      * If the backing device is a block device, mirror its discard
> > > > > +      * capabilities.
> > > > > +      */
> > > > > +     if (S_ISBLK(inode->i_mode)) {
> > > > > +             backingq = bdev_get_queue(inode->i_bdev);
> > > > > +             blk_queue_max_discard_sectors(q,
> > > > > +                     backingq->limits.max_discard_sectors);
> > > > > +
> > > > > +             blk_queue_max_write_zeroes_sectors(q,
> > > > > +                     backingq->limits.max_write_zeroes_sectors);
> > > > > +
> > > > > +             q->limits.discard_granularity =
> > > > > +                     backingq->limits.discard_granularity;
> > > > > +
> > > > > +             q->limits.discard_alignment =
> > > > > +                     backingq->limits.discard_alignment;
> > > >
> > > > I think it isn't necessary to mirror the backing queue's discard/write_zeroes
> > > > capabilities, given that either the fs or the underlying queue can deal with them well.
> > > >
> > > > >
> > > > >       /*
> > > > >        * We use punch hole to reclaim the free space used by the
> > > > > @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
> > > > >        * encryption is enabled, because it may give an attacker
> > > > >        * useful information.
> > > > >        */
> > > > > -     if ((!file->f_op->fallocate) ||
> > > > > -         lo->lo_encrypt_key_size) {
> > > > > +     } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > > >               q->limits.discard_granularity = 0;
> > > > >               q->limits.discard_alignment = 0;
> > > > >               blk_queue_max_discard_sectors(q, 0);
> > > > >               blk_queue_max_write_zeroes_sectors(q, 0);
> > > > > -             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > > > -             return;
> > > > > -     }
> > > > >
> > > > > -     q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > > -     q->limits.discard_alignment = 0;
> > > > > +     } else {
> > > > > +             q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > > +             q->limits.discard_alignment = 0;
> > > > > +
> > > > > +             blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > > +             blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > > +     }
> > > > >
> > > > > -     blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > > -     blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > > -     blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > > +     if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> > > > > +             blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > > +     else
> > > > > +             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > > >  }
> > > >
> > > > Looks like it should work just by mirroring the backing queue's discard
> > > > capability to the loop queue in case the loop is backed by a
> > > > block device, doesn't it? Meanwhile, the unified discard limits &
> > > > write_zeroes limits can be kept.
> > >
> > > I tested this out, and you're right that I could just flip the
> > > QUEUE_FLAG_DISCARD based on whether it's a block device, and leave
> >
> > What I meant actually is to do the following discard config:
> >
> >         bool discard;
> >         if (S_ISBLK(inode->i_mode)) {
> >                 struct request_queue *backingq = bdev_get_queue(inode->i_bdev);
> >                 discard = blk_queue_discard(backingq);
> >         } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size)
> >                 discard = false;
> >         else
> >                 discard = true;
> >
> >         if (discard) {
> >                 blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> >                 blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> >                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> >         } else {
> >                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> >         }
> 
> Ah, I see. But I think it's useful to reflect max_discard_sectors,
> max_write_zeroes_sectors, discard_granularity, and discard_alignment
> from the block device to the loop device. With the exception of
> discard_alignment, these parameters are visible via sysfs, so usermode
> can actually use these to make more intelligent use of fallocate.

Could you share with us what the intelligent use of fallocate is?

The block layer code of blk_bio_discard_split() can deal with all the
magic limits.

The unified discard limits are simpler from the implementation view of loop,
and also from the userspace view.

> Without this part of it, I still see issues with GNU cp.

Could you investigate the cause of the GNU cp issue?

Thanks,
Ming