Re: [for-416 PATCH v2] bcache: writeback: collapse contiguous IO better

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Tang Junhui---

On 12/28/2017 06:00 PM, tang.junhui@xxxxxxxxxx wrote:
> LGTM, I would like it much more if MAX_WRITEBACKS_IN_PASS(5) defines a 
> little bigger value.

Yes-- we can always turn it up further depending on experience.  5 was
chosen to provide a clear benefit without changing things too much.

> Reviewed-by: Tang Junhui <tang.junhui@xxxxxxxxxx>

Thank you very much.  Applied to my staging tree.

Mike

> 
>> Previously, there was some logic that attempted to immediately issue
>> writeback of backing-contiguous blocks when the writeback rate was
>> fast.
>>
>> The previous logic did not have any limits on the aggregate size it
>> would issue, nor the number of keys it would combine at once.  It
>> would also discard the chance to do a contiguous write when the
>> writeback rate was low-- e.g. at "background" writeback of target
>> rate = 8, it would not combine two adjacent 4k writes and would
>> instead seek the disk twice.
>>
>> This patch imposes limits and explicitly understands the size of
>> contiguous I/O during issue.  It also will combine contiguous I/O
>> in all circumstances, not just when writeback is requested to be
>> relatively fast.
>>
>> It is a win on its own, but also lays the groundwork for skip writes to
>> short keys to make the I/O more sequential/contiguous.  It also gets
>> ready to start using blk_*_plug, and to allow issuing of non-contig
>> I/O in parallel if requested by the user (to make use of disk
>> throughput benefits available from higher queue depths).
>>
>> This patch fixes a previous version where the contiguous information
>> was not calculated properly, thanks to suggestions by Tang Junhui.
>>
>> Signed-off-by: Michael Lyle <mlyle@xxxxxxxx>
>> ---
>> drivers/md/bcache/bcache.h    |   6 ---
>> drivers/md/bcache/writeback.c | 120 +++++++++++++++++++++++++++++-------------
>> drivers/md/bcache/writeback.h |   3 ++
>> 3 files changed, 85 insertions(+), 44 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index 843877e017e1..1784e50eb857 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -323,12 +323,6 @@ struct cached_dev {
>>     struct bch_ratelimit    writeback_rate;
>>     struct delayed_work    writeback_rate_update;
>>
>> -    /*
>> -     * Internal to the writeback code, so read_dirty() can keep track of
>> -     * where it's at.
>> -     */
>> -    sector_t        last_read;
>> -
>>     /* Limit number of writeback bios in flight */
>>     struct semaphore    in_flight;
>>     struct task_struct    *writeback_thread;
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index f3d680c907ae..6975003454d9 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -251,7 +251,9 @@ static void read_dirty_submit(struct closure *cl)
>> static void read_dirty(struct cached_dev *dc)
>> {
>>     unsigned delay = 0;
>> -    struct keybuf_key *w;
>> +    struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
>> +    size_t size;
>> +    int nk, i;
>>     struct dirty_io *io;
>>     struct closure cl;
>>
>> @@ -262,45 +264,87 @@ static void read_dirty(struct cached_dev *dc)
>>      * mempools.
>>      */
>>
>> -    while (!kthread_should_stop()) {
>> -
>> -        w = bch_keybuf_next(&dc->writeback_keys);
>> -        if (!w)
>> -            break;
>> -
>> -        BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
>> -
>> -        if (KEY_START(&w->key) != dc->last_read ||
>> -            jiffies_to_msecs(delay) > 50)
>> -            while (!kthread_should_stop() && delay)
>> -                delay = schedule_timeout_interruptible(delay);
>> -
>> -        dc->last_read    = KEY_OFFSET(&w->key);
>> -
>> -        io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
>> -                 * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> -                 GFP_KERNEL);
>> -        if (!io)
>> -            goto err;
>> -
>> -        w->private    = io;
>> -        io->dc        = dc;
>> -
>> -        dirty_init(w);
>> -        bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> -        io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> -        bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> -        io->bio.bi_end_io    = read_dirty_endio;
>> -
>> -        if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> -            goto err_free;
>> -
>> -        trace_bcache_writeback(&w->key);
>> +    next = bch_keybuf_next(&dc->writeback_keys);
>> +
>> +    while (!kthread_should_stop() && next) {
>> +        size = 0;
>> +        nk = 0;
>> +
>> +        do {
>> +            BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
>> +
>> +            /*
>> +             * Don't combine too many operations, even if they
>> +             * are all small.
>> +             */
>> +            if (nk >= MAX_WRITEBACKS_IN_PASS)
>> +                break;
>> +
>> +            /*
>> +             * If the current operation is very large, don't
>> +             * further combine operations.
>> +             */
>> +            if (size >= MAX_WRITESIZE_IN_PASS)
>> +                break;
>> +
>> +            /*
>> +             * Operations are only eligible to be combined
>> +             * if they are contiguous.
>> +             *
>> +             * TODO: add a heuristic willing to fire a
>> +             * certain amount of non-contiguous IO per pass,
>> +             * so that we can benefit from backing device
>> +             * command queueing.
>> +             */
>> +            if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
>> +                        &START_KEY(&next->key)))
>> +                break;
>> +
>> +            size += KEY_SIZE(&next->key);
>> +            keys[nk++] = next;
>> +        } while ((next = bch_keybuf_next(&dc->writeback_keys)));
>> +
>> +        /* Now we have gathered a set of 1..5 keys to write back. */
>> +        for (i = 0; i < nk; i++) {
>> +            w = keys[i];
>> +
>> +            io = kzalloc(sizeof(struct dirty_io) +
>> +                     sizeof(struct bio_vec) *
>> +                     DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
>> +                     GFP_KERNEL);
>> +            if (!io)
>> +                goto err;
>> +
>> +            w->private    = io;
>> +            io->dc        = dc;
>> +
>> +            dirty_init(w);
>> +            bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
>> +            io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
>> +            bio_set_dev(&io->bio,
>> +                    PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
>> +            io->bio.bi_end_io    = read_dirty_endio;
>> +
>> +            if (bio_alloc_pages(&io->bio, GFP_KERNEL))
>> +                goto err_free;
>> +
>> +            trace_bcache_writeback(&w->key);
>> +
>> +            down(&dc->in_flight);
>> +
>> +            /* We've acquired a semaphore for the maximum
>> +             * simultaneous number of writebacks; from here
>> +             * everything happens asynchronously.
>> +             */
>> +            closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> +        }
>>
>> -        down(&dc->in_flight);
>> -        closure_call(&io->cl, read_dirty_submit, NULL, &cl);
>> +        delay = writeback_delay(dc, size);
>>
>> -        delay = writeback_delay(dc, KEY_SIZE(&w->key));
>> +        while (!kthread_should_stop() && delay) {
>> +            schedule_timeout_interruptible(delay);
>> +            delay = writeback_delay(dc, 0);
>> +        }
>>     }
>>
>>     if (0) {
>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
>> index a9e3ffb4b03c..6d26927267f8 100644
>> --- a/drivers/md/bcache/writeback.h
>> +++ b/drivers/md/bcache/writeback.h
>> @@ -5,6 +5,9 @@
>> #define CUTOFF_WRITEBACK    40
>> #define CUTOFF_WRITEBACK_SYNC    70
>>
>> +#define MAX_WRITEBACKS_IN_PASS  5
>> +#define MAX_WRITESIZE_IN_PASS   5000    /* *512b */
>> +
>> static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>> {
>>     uint64_t i, ret = 0;
>> -- 
>> 2.14.1
> 
> 
> Thanks,
> Tang
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 




[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux