Re: [PATCH] bcache: implement PI controller for writeback rate

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hey everyone---

I'd appreciate any feedback you all can lend on this changeset.  I
know it's a bit of an awkward time with the opening of the merge
window to have a new functional change show up.  I also would
appreciate any comments on process / how to go about submitting work,
as I have not been active in the Linux kernel community in quite some
time.

This change makes a pretty substantial difference in the smoothness of
IO rates on my cached VM environment.  I see a couple of problems with
further review: I have an incorrect comment about the default p term
value, and there is a small bit of conflict with the patches that have
just gone out.  I can fix both of these quickly in a subsequent
revision.

It's also helpful for intermittent workloads to be able to write at a
somewhat higher rate out to disk.  Spending a few percent of disk
bandwidth on flushing dirty data-- to leave room to deal with new
bursts of workload-- is very helpful.

Thanks!

Mike


On Wed, Sep 6, 2017 at 12:56 AM, Michael Lyle <mlyle@xxxxxxxx> wrote:
> bcache uses a control system to attempt to keep the amount of dirty data
> in cache at a user-configured level, while not responding excessively to
> transients and variations in write rate.  Previously, the system was a
> PD controller; but the output from it was integrated, turning the
> Proportional term into an Integral term, and turning the Derivative term
> into a crude Proportional term.  Performance of the controller has been
> uneven in production, and it has tended to respond slowly, oscillate,
> and overshoot.
>
> This patch set replaces the current control system with an explicit PI
> controller and tuning that should be correct for most hardware.  By
> default, it attempts to write at a rate that would retire 1/40th of the
> current excess blocks per second.  An integral term in turn works to
> remove steady state errors.
>
> IMO, this yields benefits in simplicity (removing weighted average
> filtering, etc) and system performance.
>
> Another small change is a tunable parameter is introduced to allow the
> user to specify a minimum rate at which dirty blocks are retired.
> Ideally one would set this writeback_rate_minimum to a small percentage
> of disk bandwidth, allowing the dirty data to be slowly cleaned out when
> the system is inactive.  The old behavior would try and retire 1
> sector/second, and the new default is 5 sectors/second.
>
> Signed-off-by: Michael Lyle <mlyle@xxxxxxxx>
> ---
>  drivers/md/bcache/bcache.h    |  9 +++--
>  drivers/md/bcache/sysfs.c     | 19 +++++-----
>  drivers/md/bcache/writeback.c | 84 +++++++++++++++++++++++--------------------
>  3 files changed, 60 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index dee542fff68e..f1cdf92e7399 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -265,9 +265,6 @@ struct bcache_device {
>         atomic_t                *stripe_sectors_dirty;
>         unsigned long           *full_dirty_stripes;
>
> -       unsigned long           sectors_dirty_last;
> -       long                    sectors_dirty_derivative;
> -
>         struct bio_set          *bio_split;
>
>         unsigned                data_csum:1;
> @@ -361,12 +358,14 @@ struct cached_dev {
>
>         uint64_t                writeback_rate_target;
>         int64_t                 writeback_rate_proportional;
> -       int64_t                 writeback_rate_derivative;
> +       int64_t                 writeback_rate_integral;
> +       int64_t                 writeback_rate_integral_scaled;
>         int64_t                 writeback_rate_change;
>
>         unsigned                writeback_rate_update_seconds;
> -       unsigned                writeback_rate_d_term;
> +       unsigned                writeback_rate_i_term_inverse;
>         unsigned                writeback_rate_p_term_inverse;
> +       unsigned                writeback_rate_minimum;
>  };
>
>  enum alloc_reserve {
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index f90f13616980..66a716d5f111 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -81,8 +81,9 @@ rw_attribute(writeback_delay);
>  rw_attribute(writeback_rate);
>
>  rw_attribute(writeback_rate_update_seconds);
> -rw_attribute(writeback_rate_d_term);
> +rw_attribute(writeback_rate_i_term_inverse);
>  rw_attribute(writeback_rate_p_term_inverse);
> +rw_attribute(writeback_rate_minimum);
>  read_attribute(writeback_rate_debug);
>
>  read_attribute(stripe_size);
> @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev)
>         sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
>
>         var_print(writeback_rate_update_seconds);
> -       var_print(writeback_rate_d_term);
> +       var_print(writeback_rate_i_term_inverse);
>         var_print(writeback_rate_p_term_inverse);
> +       var_print(writeback_rate_minimum);
>
>         if (attr == &sysfs_writeback_rate_debug) {
>                 char rate[20];
>                 char dirty[20];
>                 char target[20];
>                 char proportional[20];
> -               char derivative[20];
> +               char integral[20];
>                 char change[20];
>                 s64 next_io;
>
> @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev)
>                 bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
>                 bch_hprint(target,      dc->writeback_rate_target << 9);
>                 bch_hprint(proportional,dc->writeback_rate_proportional << 9);
> -               bch_hprint(derivative,  dc->writeback_rate_derivative << 9);
> +               bch_hprint(integral,    dc->writeback_rate_integral_scaled << 9);
>                 bch_hprint(change,      dc->writeback_rate_change << 9);
>
>                 next_io = div64_s64(dc->writeback_rate.next - local_clock(),
> @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev)
>                                "dirty:\t\t%s\n"
>                                "target:\t\t%s\n"
>                                "proportional:\t%s\n"
> -                              "derivative:\t%s\n"
> +                              "integral:\t%s\n"
>                                "change:\t\t%s/sec\n"
>                                "next io:\t%llims\n",
>                                rate, dirty, target, proportional,
> -                              derivative, change, next_io);
> +                              integral, change, next_io);
>         }
>
>         sysfs_hprint(dirty_data,
> @@ -213,7 +215,7 @@ STORE(__cached_dev)
>                             dc->writeback_rate.rate, 1, INT_MAX);
>
>         d_strtoul_nonzero(writeback_rate_update_seconds);
> -       d_strtoul(writeback_rate_d_term);
> +       d_strtoul(writeback_rate_i_term_inverse);
>         d_strtoul_nonzero(writeback_rate_p_term_inverse);
>
>         d_strtoi_h(sequential_cutoff);
> @@ -319,8 +321,9 @@ static struct attribute *bch_cached_dev_files[] = {
>         &sysfs_writeback_percent,
>         &sysfs_writeback_rate,
>         &sysfs_writeback_rate_update_seconds,
> -       &sysfs_writeback_rate_d_term,
> +       &sysfs_writeback_rate_i_term_inverse,
>         &sysfs_writeback_rate_p_term_inverse,
> +       &sysfs_writeback_rate_minimum,
>         &sysfs_writeback_rate_debug,
>         &sysfs_dirty_data,
>         &sysfs_stripe_size,
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 42c66e76f05e..76e71e8ef356 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -24,48 +24,55 @@ static void __update_writeback_rate(struct cached_dev *dc)
>         uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
>         uint64_t cache_dirty_target =
>                 div_u64(cache_sectors * dc->writeback_percent, 100);
> -
>         int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
>                                    c->cached_dev_sectors);
>
> -       /* PD controller */
> -
> +       /* PI controller:
> +        * Figures out the amount that should be written per second.
> +        *
> +        * First, the error (number of sectors that are dirty beyond our
> +        * target) is calculated.  The error is accumulated (numerically
> +        * integrated).
> +        *
> +        * Then, the proportional value and integral value are scaled
> +        * based on configured values.  These are stored as inverses to
> +        * avoid fixed point math and to make configuration easy-- e.g.
> +        * the default value of 100 for writeback_rate_p_term_inverse
> +        * attempts to write at a rate that would retire all the dirty
> +        * blocks in 100 seconds.
> +        */
>         int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
> -       int64_t derivative = dirty - dc->disk.sectors_dirty_last;
> -       int64_t proportional = dirty - target;
> -       int64_t change;
> -
> -       dc->disk.sectors_dirty_last = dirty;
> -
> -       /* Scale to sectors per second */
> -
> -       proportional *= dc->writeback_rate_update_seconds;
> -       proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);
> -
> -       derivative = div_s64(derivative, dc->writeback_rate_update_seconds);
> -
> -       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
> -                             (dc->writeback_rate_d_term /
> -                              dc->writeback_rate_update_seconds) ?: 1, 0);
> -
> -       derivative *= dc->writeback_rate_d_term;
> -       derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);
> -
> -       change = proportional + derivative;
> +       int64_t error = dirty - target;
> +       int64_t proportional_scaled =
> +               div_s64(error, dc->writeback_rate_p_term_inverse);
> +       int64_t integral_scaled, new_rate;
> +
> +       if ((error < 0 && dc->writeback_rate_integral > 0) ||
> +           (error > 0 && time_before64(local_clock(),
> +                        dc->writeback_rate.next + NSEC_PER_MSEC))) {
> +               /* Only decrease the integral term if it's more than
> +                * zero.  Only increase the integral term if the device
> +                * is keeping up.  (Don't wind up the integral
> +                * ineffectively in either case).
> +                *
> +                * It's necessary to scale this by
> +                * writeback_rate_update_seconds to keep the integral
> +                * term dimensioned properly.
> +                */
> +               dc->writeback_rate_integral += error *
> +                       dc->writeback_rate_update_seconds;
> +       }
>
> -       /* Don't increase writeback rate if the device isn't keeping up */
> -       if (change > 0 &&
> -           time_after64(local_clock(),
> -                        dc->writeback_rate.next + NSEC_PER_MSEC))
> -               change = 0;
> +       integral_scaled = div_s64(dc->writeback_rate_integral,
> +                       dc->writeback_rate_i_term_inverse);
>
> -       dc->writeback_rate.rate =
> -               clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
> -                       1, NSEC_PER_MSEC);
> +       new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled),
> +                       dc->writeback_rate_minimum, NSEC_PER_MSEC);
>
> -       dc->writeback_rate_proportional = proportional;
> -       dc->writeback_rate_derivative = derivative;
> -       dc->writeback_rate_change = change;
> +       dc->writeback_rate_proportional = proportional_scaled;
> +       dc->writeback_rate_integral_scaled = integral_scaled;
> +       dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
> +       dc->writeback_rate.rate = new_rate;
>         dc->writeback_rate_target = target;
>  }
>
> @@ -491,8 +498,6 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
>
>         bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
>                            sectors_dirty_init_fn, 0);
> -
> -       dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
>  }
>
>  void bch_cached_dev_writeback_init(struct cached_dev *dc)
> @@ -506,10 +511,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>         dc->writeback_percent           = 10;
>         dc->writeback_delay             = 30;
>         dc->writeback_rate.rate         = 1024;
> +       dc->writeback_rate_minimum      = 5;
>
>         dc->writeback_rate_update_seconds = 5;
> -       dc->writeback_rate_d_term       = 30;
> -       dc->writeback_rate_p_term_inverse = 6000;
> +       dc->writeback_rate_p_term_inverse = 40;
> +       dc->writeback_rate_i_term_inverse = 10000;
>
>         INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
>  }
> --
> 2.11.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux ARM Kernel]     [Linux Filesystem Development]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Security]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux