Re: [PATCH 2/5] bcache: implement PI controller for writeback rate

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Coly--

Excellent, thank you for all your help.  I will send out another
version of these patches after a little more time for review.  I have
made the change you've suggested.

Mike

On Wed, Sep 27, 2017 at 1:35 AM, Coly Li <i@xxxxxxx> wrote:
> On 2017/9/27 上午3:24, Michael Lyle wrote:
>> bcache uses a control system to attempt to keep the amount of dirty data
>> in cache at a user-configured level, while not responding excessively to
>> transients and variations in write rate.  Previously, the system was a
>> PD controller; but the output from it was integrated, turning the
>> Proportional term into an Integral term, and turning the Derivative term
>> into a crude Proportional term.  Performance of the controller has been
>> uneven in production, and it has tended to respond slowly, oscillate,
>> and overshoot.
>>
>> This patch set replaces the current control system with an explicit PI
>> controller and tuning that should be correct for most hardware.  By
>> default, it attempts to write at a rate that would retire 1/40th of the
>> current excess blocks per second.  An integral term in turn works to
>> remove steady state errors.
>>
>> IMO, this yields benefits in simplicity (removing weighted average
>> filtering, etc) and system performance.
>>
>> Another small change is a tunable parameter is introduced to allow the
>> user to specify a minimum rate at which dirty blocks are retired.
>>
>> There is a slight difference from earlier versions of the patch in
>> integral handling to prevent excessive negative integral windup.
>>
>> Signed-off-by: Michael Lyle <mlyle@xxxxxxxx>
>
> Reviewed-by: Coly Li <colyli@xxxxxxx>
>
> Last comment, could you please to use current code comments style in
> bcache, like this,
>
> /*
>  * The allocation code needs gc_mark in struct bucket to be correct, but
>  * it's not while a gc is in progress. Protected by bucket_lock.
>  */
>
> Or like this,
>
> /*
>  * For any bio we don't skip we subtract the number of sectors from
>  * rescale; when it hits 0 we rescale all the bucket priorities.
>  */
>
> Rested part are OK to me. Thanks for your effort to implement a simpler
> PI controller.
>
> Coly Li
>
>> ---
>>  drivers/md/bcache/bcache.h    |  9 ++---
>>  drivers/md/bcache/sysfs.c     | 18 +++++----
>>  drivers/md/bcache/writeback.c | 89 ++++++++++++++++++++++++-------------------
>>  3 files changed, 64 insertions(+), 52 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index 2ed9bd231d84..eb83be693d60 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -265,9 +265,6 @@ struct bcache_device {
>>       atomic_t                *stripe_sectors_dirty;
>>       unsigned long           *full_dirty_stripes;
>>
>> -     unsigned long           sectors_dirty_last;
>> -     long                    sectors_dirty_derivative;
>> -
>>       struct bio_set          *bio_split;
>>
>>       unsigned                data_csum:1;
>> @@ -362,12 +359,14 @@ struct cached_dev {
>>
>>       uint64_t                writeback_rate_target;
>>       int64_t                 writeback_rate_proportional;
>> -     int64_t                 writeback_rate_derivative;
>> +     int64_t                 writeback_rate_integral;
>> +     int64_t                 writeback_rate_integral_scaled;
>>       int64_t                 writeback_rate_change;
>>
>>       unsigned                writeback_rate_update_seconds;
>> -     unsigned                writeback_rate_d_term;
>> +     unsigned                writeback_rate_i_term_inverse;
>>       unsigned                writeback_rate_p_term_inverse;
>> +     unsigned                writeback_rate_minimum;
>>  };
>>
>>  enum alloc_reserve {
>> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
>> index 104c57cd666c..eb493814759c 100644
>> --- a/drivers/md/bcache/sysfs.c
>> +++ b/drivers/md/bcache/sysfs.c
>> @@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
>>  rw_attribute(writeback_rate);
>>
>>  rw_attribute(writeback_rate_update_seconds);
>> -rw_attribute(writeback_rate_d_term);
>> +rw_attribute(writeback_rate_i_term_inverse);
>>  rw_attribute(writeback_rate_p_term_inverse);
>> +rw_attribute(writeback_rate_minimum);
>>  read_attribute(writeback_rate_debug);
>>
>>  read_attribute(stripe_size);
>> @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
>>       sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
>>
>>       var_print(writeback_rate_update_seconds);
>> -     var_print(writeback_rate_d_term);
>> +     var_print(writeback_rate_i_term_inverse);
>>       var_print(writeback_rate_p_term_inverse);
>> +     var_print(writeback_rate_minimum);
>>
>>       if (attr == &sysfs_writeback_rate_debug) {
>>               char rate[20];
>>               char dirty[20];
>>               char target[20];
>>               char proportional[20];
>> -             char derivative[20];
>> +             char integral[20];
>>               char change[20];
>>               s64 next_io;
>>
>> @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
>>               bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
>>               bch_hprint(target,      dc->writeback_rate_target << 9);
>>               bch_hprint(proportional,dc->writeback_rate_proportional << 9);
>> -             bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
>> +             bch_hprint(integral,    dc->writeback_rate_integral_scaled << 9);
>>               bch_hprint(change,      dc->writeback_rate_change << 9);
>>
>>               next_io = div64_s64(dc->writeback_rate.next - local_clock(),
>> @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
>>                              "dirty:\t\t%s\n"
>>                              "target:\t\t%s\n"
>>                              "proportional:\t%s\n"
>> -                            "derivative:\t%s\n"
>> +                            "integral:\t%s\n"
>>                              "change:\t\t%s/sec\n"
>>                              "next io:\t%llims\n",
>>                              rate, dirty, target, proportional,
>> -                            derivative, change, next_io);
>> +                            integral, change, next_io);
>>       }
>>
>>       sysfs_hprint(dirty_data,
>> @@ -213,7 +215,7 @@ STORE(__cached_dev)
>>                           dc->writeback_rate.rate, 1, INT_MAX);
>>
>>       d_strtoul_nonzero(writeback_rate_update_seconds);
>> -     d_strtoul(writeback_rate_d_term);
>> +     d_strtoul(writeback_rate_i_term_inverse);
>>       d_strtoul_nonzero(writeback_rate_p_term_inverse);
>>
>>       d_strtoi_h(sequential_cutoff);
>> @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = {
>>       &sysfs_writeback_percent,
>>       &sysfs_writeback_rate,
>>       &sysfs_writeback_rate_update_seconds,
>> -     &sysfs_writeback_rate_d_term,
>> +     &sysfs_writeback_rate_i_term_inverse,
>>       &sysfs_writeback_rate_p_term_inverse,
>>       &sysfs_writeback_rate_debug,
>>       &sysfs_dirty_data,
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index eea49bf36401..5a8f5655151b 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -25,48 +25,60 @@ static void __update_writeback_rate(struct cached_dev *dc)
>>                               bcache_flash_devs_sectors_dirty(c);
>>       uint64_t cache_dirty_target =
>>               div_u64(cache_sectors * dc->writeback_percent, 100);
>> -
>>       int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
>>                                  c->cached_dev_sectors);
>>
>> -     /* PD controller */
>> -
>> +     /* PI controller:
>> +      * Figures out the amount that should be written per second.
>> +      *
>> +      * First, the error (number of sectors that are dirty beyond our
>> +      * target) is calculated.  The error is accumulated (numerically
>> +      * integrated).
>> +      *
>> +      * Then, the proportional value and integral value are scaled
>> +      * based on configured values.  These are stored as inverses to
>> +      * avoid fixed point math and to make configuration easy-- e.g.
>> +      * the default value of 40 for writeback_rate_p_term_inverse
>> +      * attempts to write at a rate that would retire all the dirty
>> +      * blocks in 40 seconds.
>> +      *
>> +      * The writeback_rate_i_inverse value of 10000 means that 1/10000th
>> +      * of the error is accumulated in the integral term per second.
>> +      * This acts as a slow, long-term average that is not subject to
>> +      * variations in usage like the p term.
>> +      */
>>       int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
>> -     int64_t derivative = dirty - dc->disk.sectors_dirty_last;
>> -     int64_t proportional = dirty - target;
>> -     int64_t change;
>> -
>> -     dc->disk.sectors_dirty_last = dirty;
>> -
>> -     /* Scale to sectors per second */
>> -
>> -     proportional *= dc->writeback_rate_update_seconds;
>> -     proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
>> -
>> -     derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
>> -
>> -     derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
>> -                           (dc->writeback_rate_d_term /
>> -                            dc->writeback_rate_update_seconds) ?: 1, 0);
>> -
>> -     derivative *= dc->writeback_rate_d_term;
>> -     derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
>> -
>> -     change = proportional + derivative;
>> +     int64_t error = dirty - target;
>> +     int64_t proportional_scaled =
>> +             div_s64(error, dc->writeback_rate_p_term_inverse);
>> +     int64_t integral_scaled, new_rate;
>> +
>> +     if ((error < 0 && dc->writeback_rate_integral > 0) ||
>> +         (error > 0 && time_before64(local_clock(),
>> +                      dc->writeback_rate.next + NSEC_PER_MSEC))) {
>> +             /* Only decrease the integral term if it's more than
>> +              * zero.  Only increase the integral term if the device
>> +              * is keeping up.  (Don't wind up the integral
>> +              * ineffectively in either case).
>> +              *
>> +              * It's necessary to scale this by
>> +              * writeback_rate_update_seconds to keep the integral
>> +              * term dimensioned properly.
>> +              */
>> +             dc->writeback_rate_integral += error *
>> +                     dc->writeback_rate_update_seconds;
>> +     }
>>
>> -     /* Don't increase writeback rate if the device isn't keeping up */
>> -     if (change > 0 &&
>> -         time_after64(local_clock(),
>> -                      dc->writeback_rate.next + NSEC_PER_MSEC))
>> -             change = 0;
>> +     integral_scaled = div_s64(dc->writeback_rate_integral,
>> +                     dc->writeback_rate_i_term_inverse);
>>
>> -     dc->writeback_rate.rate =
>> -             clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
>> -                     1, NSEC_PER_MSEC);
>> +     new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled),
>> +                     dc->writeback_rate_minimum, NSEC_PER_MSEC);
>>
>> -     dc->writeback_rate_proportional = proportional;
>> -     dc->writeback_rate_derivative = derivative;
>> -     dc->writeback_rate_change = change;
>> +     dc->writeback_rate_proportional = proportional_scaled;
>> +     dc->writeback_rate_integral_scaled = integral_scaled;
>> +     dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
>> +     dc->writeback_rate.rate = new_rate;
>>       dc->writeback_rate_target = target;
>>  }
>>
>> @@ -498,8 +510,6 @@ void bch_sectors_dirty_init(struct bcache_device *d)
>>
>>       bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
>>                          sectors_dirty_init_fn, 0);
>> -
>> -     d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
>>  }
>>
>>  void bch_cached_dev_writeback_init(struct cached_dev *dc)
>> @@ -513,10 +523,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>>       dc->writeback_percent           = 10;
>>       dc->writeback_delay             = 30;
>>       dc->writeback_rate.rate         = 1024;
>> +     dc->writeback_rate_minimum      = 1;
>>
>>       dc->writeback_rate_update_seconds = 5;
>> -     dc->writeback_rate_d_term       = 30;
>> -     dc->writeback_rate_p_term_inverse = 6000;
>> +     dc->writeback_rate_p_term_inverse = 40;
>> +     dc->writeback_rate_i_term_inverse = 10000;
>>
>>       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
>>  }
>>
--
To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux ARM Kernel]     [Linux Filesystem Development]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux