On 1/19/21 8:56 PM, Dongdong Tao wrote: > From: dongdong tao <dongdong.tao@xxxxxxxxxxxxx> > > Current way to calculate the writeback rate only considered the > dirty sectors, this usually works fine when the fragmentation > is not high, but it will give us unreasonable small rate when > we are under a situation that very few dirty sectors consumed > a lot dirty buckets. In some case, the dirty bucekts can reached > to CUTOFF_WRITEBACK_SYNC while the dirty data(sectors) not even > reached the writeback_percent, the writeback rate will still > be the minimum value (4k), thus it will cause all the writes to be > stucked in a non-writeback mode because of the slow writeback. > > We accelerate the rate in 3 stages with different aggressiveness, > the first stage starts when dirty buckets percent reach above > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW (50), the second is > BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID (57), the third is > BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH (64). By default > the first stage tries to writeback the amount of dirty data > in one bucket (on average) in (1 / (dirty_buckets_percent - 50)) second, > the second stage tries to writeback the amount of dirty data in one bucket > in (1 / (dirty_buckets_percent - 57)) * 100 millisecond, the third > stage tries to writeback the amount of dirty data in one bucket in > (1 / (dirty_buckets_percent - 64)) millisecond. It is ok to be very > aggressive in the last stage, as it is our last chance to pull it back. > > Option writeback_consider_fragment to control whether we want > this feature to be on or off, it's on by default. > > Lastly, below is the performance data for all the testing result, > including the data from production env: > https://docs.google.com/document/d/ > 1AmbIEa_2MhB9bqhC3rfga9tp7n9YX9PLn0jSUxscVW0/edit?usp=sharing > > Signed-off-by: dongdong tao <dongdong.tao@xxxxxxxxxxxxx> Hi Dongdong, Now overall the patch is OK for me, just minor comments and please check them inline. > --- > drivers/md/bcache/bcache.h | 4 ++++ > drivers/md/bcache/sysfs.c | 22 ++++++++++++++++++ > drivers/md/bcache/writeback.c | 42 +++++++++++++++++++++++++++++++++++ > drivers/md/bcache/writeback.h | 4 ++++ > 4 files changed, 72 insertions(+) > > diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h > index 1d57f48307e6..d7a84327b7f1 100644 > --- a/drivers/md/bcache/bcache.h > +++ b/drivers/md/bcache/bcache.h > @@ -373,6 +373,7 @@ struct cached_dev { > unsigned int partial_stripes_expensive:1; > unsigned int writeback_metadata:1; > unsigned int writeback_running:1; > + unsigned int writeback_consider_fragment:1; > unsigned char writeback_percent; > unsigned int writeback_delay; > > @@ -385,6 +386,9 @@ struct cached_dev { > unsigned int writeback_rate_update_seconds; > unsigned int writeback_rate_i_term_inverse; > unsigned int writeback_rate_p_term_inverse; > + unsigned int writeback_rate_fp_term_low; > + unsigned int writeback_rate_fp_term_mid; > + unsigned int writeback_rate_fp_term_high; > unsigned int writeback_rate_minimum; > > enum stop_on_failure stop_when_cache_set_failed; > diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c > index 00a520c03f41..136899beadba 100644 > --- a/drivers/md/bcache/sysfs.c > +++ b/drivers/md/bcache/sysfs.c > @@ -117,10 +117,14 @@ rw_attribute(writeback_running); > rw_attribute(writeback_percent); > rw_attribute(writeback_delay); > rw_attribute(writeback_rate); > +rw_attribute(writeback_consider_fragment); > > rw_attribute(writeback_rate_update_seconds); > rw_attribute(writeback_rate_i_term_inverse); > rw_attribute(writeback_rate_p_term_inverse); > +rw_attribute(writeback_rate_fp_term_low); > +rw_attribute(writeback_rate_fp_term_mid); > +rw_attribute(writeback_rate_fp_term_high); > rw_attribute(writeback_rate_minimum); > read_attribute(writeback_rate_debug); > > @@ -195,6 +199,7 @@ SHOW(__bch_cached_dev) > var_printf(bypass_torture_test, "%i"); > var_printf(writeback_metadata, "%i"); > var_printf(writeback_running, "%i"); > + var_printf(writeback_consider_fragment, "%i"); > var_print(writeback_delay); > var_print(writeback_percent); > sysfs_hprint(writeback_rate, > @@ -205,6 +210,9 @@ SHOW(__bch_cached_dev) > var_print(writeback_rate_update_seconds); > var_print(writeback_rate_i_term_inverse); > var_print(writeback_rate_p_term_inverse); > + var_print(writeback_rate_fp_term_low); > + var_print(writeback_rate_fp_term_mid); > + var_print(writeback_rate_fp_term_high); > var_print(writeback_rate_minimum); > > if (attr == &sysfs_writeback_rate_debug) { > @@ -303,6 +311,7 @@ STORE(__cached_dev) > sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); > sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); > sysfs_strtoul_bool(writeback_running, dc->writeback_running); > + sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment); > sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); > > sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, > @@ -331,6 +340,15 @@ STORE(__cached_dev) > sysfs_strtoul_clamp(writeback_rate_p_term_inverse, > dc->writeback_rate_p_term_inverse, > 1, UINT_MAX); > + sysfs_strtoul_clamp(writeback_rate_fp_term_low, > + dc->writeback_rate_fp_term_low, > + 1, UINT_MAX); > + sysfs_strtoul_clamp(writeback_rate_fp_term_mid, > + dc->writeback_rate_fp_term_mid, > + 1, UINT_MAX); > + sysfs_strtoul_clamp(writeback_rate_fp_term_high, > + dc->writeback_rate_fp_term_high, > + 1, UINT_MAX); Here you should make sure the input value should always be in order, low < mid < high Otherwise the calculation might be messed up. > sysfs_strtoul_clamp(writeback_rate_minimum, > dc->writeback_rate_minimum, > 1, UINT_MAX); > @@ -499,9 +517,13 @@ static struct attribute *bch_cached_dev_files[] = { > &sysfs_writeback_delay, > &sysfs_writeback_percent, > &sysfs_writeback_rate, > + &sysfs_writeback_consider_fragment, > &sysfs_writeback_rate_update_seconds, > &sysfs_writeback_rate_i_term_inverse, > &sysfs_writeback_rate_p_term_inverse, > + &sysfs_writeback_rate_fp_term_low, > + &sysfs_writeback_rate_fp_term_mid, > + &sysfs_writeback_rate_fp_term_high, > &sysfs_writeback_rate_minimum, > &sysfs_writeback_rate_debug, > &sysfs_io_errors, > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c > index a129e4d2707c..9d440166c6bf 100644 > --- a/drivers/md/bcache/writeback.c > +++ b/drivers/md/bcache/writeback.c > @@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc) > int64_t integral_scaled; > uint32_t new_rate; > > + /* > + * We need to consider the number of dirty buckets as well > + * when calculating the proportional_scaled, Otherwise we might > + * have an unreasonable small writeback rate at a highly fragmented situation > + * when very few dirty sectors consumed a lot dirty buckets, the > + * worst case is when dirty buckets reached cutoff_writeback_sync and > + * dirty data is still not even reached to writeback percent, so the rate > + * still will be at the minimum value, which will cause the write > + * stuck at a non-writeback mode. > + */ > + struct cache_set *c = dc->disk.c; > + > + int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets; > + > + if (dc->writeback_consider_fragment && > + c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) { > + int64_t fragment = > + div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty); > + int64_t fp_term; > + int64_t fps; > + > + if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) { > + fp_term = dc->writeback_rate_fp_term_low * > + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW); > + } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) { > + fp_term = dc->writeback_rate_fp_term_mid * > + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID); > + } else { > + fp_term = dc->writeback_rate_fp_term_high * > + (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH); > + } > + fps = div_s64(dirty, dirty_buckets) * fp_term; > + if (fragment > 3 && fps > proportional_scaled) { > + //Only overrite the p when fragment > 3 The above "//" style is not recommended as bcache coding style. > + proportional_scaled = fps; > + } > + } > + > if ((error < 0 && dc->writeback_rate_integral > 0) || > (error > 0 && time_before64(local_clock(), > dc->writeback_rate.next + NSEC_PER_MSEC))) { > @@ -977,6 +1015,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) > > dc->writeback_metadata = true; > dc->writeback_running = false; > + dc->writeback_consider_fragment = true; > dc->writeback_percent = 10; > dc->writeback_delay = 30; > atomic_long_set(&dc->writeback_rate.rate, 1024); > @@ -984,6 +1023,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) > > dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT; > dc->writeback_rate_p_term_inverse = 40; > + dc->writeback_rate_fp_term_low = 1; > + dc->writeback_rate_fp_term_mid = 10; > + dc->writeback_rate_fp_term_high = 1000; Could you please to explain a bit how the above 3 numbers are decided ? > dc->writeback_rate_i_term_inverse = 10000; > > WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)); > diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h > index 3f1230e22de0..02b2f9df73f6 100644 > --- a/drivers/md/bcache/writeback.h > +++ b/drivers/md/bcache/writeback.h > @@ -16,6 +16,10 @@ > > #define BCH_AUTO_GC_DIRTY_THRESHOLD 50 > > +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50 > +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57 > +#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64 > + > #define BCH_DIRTY_INIT_THRD_MAX 64 > /* > * 14 (16384ths) is chosen here as something that each backing device > Thanks in advance. Coly Li