________________________________________ 发件人: Uladzislau Rezki <urezki@xxxxxxxxx> 发送时间: 2021年2月2日 3:57 收件人: Zhang, Qiang 抄送: urezki@xxxxxxxxx; paulmck@xxxxxxxxxx; joel@xxxxxxxxxxxxxxxxx; rcu@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx 主题: Re: [PATCH v3] kvfree_rcu: Release page cache under memory pressure [Please note: This e-mail is from an EXTERNAL e-mail address] Hello, Zqiang. > From: Zqiang <qiang.zhang@xxxxxxxxxxxxx> > > Add free per-cpu existing krcp's page cache operation, when > the system is under memory pressure. > > Signed-off-by: Zqiang <qiang.zhang@xxxxxxxxxxxxx> > --- > kernel/rcu/tree.c | 26 ++++++++++++++++++++++++++ > 1 file changed, 26 insertions(+) > > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c > index c1ae1e52f638..644b0f3c7b9f 100644 > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -3571,17 +3571,41 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) > } > EXPORT_SYMBOL_GPL(kvfree_call_rcu); > > +static int free_krc_page_cache(struct kfree_rcu_cpu *krcp) > +{ > + unsigned long flags; > + struct llist_node *page_list, *pos, *n; > + int freed = 0; > + > + raw_spin_lock_irqsave(&krcp->lock, flags); > + page_list = llist_del_all(&krcp->bkvcache); > + krcp->nr_bkv_objs = 0; > + raw_spin_unlock_irqrestore(&krcp->lock, flags); > + > + llist_for_each_safe(pos, n, page_list) { > + free_page((unsigned long)pos); > + freed++; > + } > + > + return freed; > +} > + > static unsigned long > kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) > { > int cpu; > unsigned long count = 0; > + unsigned long flags; > > /* Snapshot count of all CPUs */ > for_each_possible_cpu(cpu) { > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); > > count += READ_ONCE(krcp->count); > + > + raw_spin_lock_irqsave(&krcp->lock, flags); > + count += krcp->nr_bkv_objs; > + raw_spin_unlock_irqrestore(&krcp->lock, flags); > } > > return count; > @@ -3598,6 +3622,8 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) > struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); > > count = krcp->count; > + count += free_krc_page_cache(krcp); > + > raw_spin_lock_irqsave(&krcp->lock, flags); > if (krcp->monitor_todo) > kfree_rcu_drain_unlock(krcp, flags); > -- > 2.17.1 >> >Thank you for your patch! > >I spent some time to see how the patch behaves under low memory condition. >To simulate it, i used "rcuscale" tool with below parameters: > >../rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig >CONFIG_NR_CPUS=64 \ >--bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 >rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \ >torture.disable_onoff_at_boot" --trust-make > >64 CPUs + 512 MB of memory. In general, my test system was running on edge >hitting an out of memory sometimes, but could be considered as stable in >regards to a test completion and taken time, so both were pretty solid. > >You can find a comparison on a plot, that can be downloaded following >a link: wget >ftp://vps418301.ovh.net/incoming/release_page_cache_under_low_memory.png > >In short, i see that a patched version can lead to longer test completion, >whereas the default variant is stable on almost all runs. After some analysis >and further digging i came to conclusion that a shrinker free_krc_page_cache() >concurs with run_page_cache_worker(krcp) running from kvfree_rcu() context. > >i.e. During the test a page shrinker is pretty active, because of low memory >condition. Our callback drains it whereas kvfree_rcu() part refill it right >away making kind of vicious circle. > >So, a run_page_cache_worker() should be backoff for some time when a system >runs into a low memory condition or high pressure: > >diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c >index 7077d73fcb53..446723b9646b 100644 >--- a/kernel/rcu/tree.c >+++ b/kernel/rcu/tree.c >@@ -3163,7 +3163,7 @@ struct kfree_rcu_cpu { > bool initialized; > int count; > >- struct work_struct page_cache_work; >+ struct delayed_work page_cache_work; > atomic_t work_in_progress; > struct hrtimer hrtimer; > >@@ -3419,7 +3419,7 @@ schedule_page_work_fn(struct hrtimer *t) > struct kfree_rcu_cpu *krcp = > container_of(t, struct kfree_rcu_cpu, hrtimer); > >- queue_work(system_highpri_wq, &krcp->page_cache_work); >+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0); > return HRTIMER_NORESTART; > } > >@@ -3428,7 +3428,7 @@ static void fill_page_cache_func(struct work_struct *work) > struct kvfree_rcu_bulk_data *bnode; > struct kfree_rcu_cpu *krcp = > container_of(work, struct kfree_rcu_cpu, >- page_cache_work); >+ page_cache_work.work); > unsigned long flags; > bool pushed; > int i; >@@ -3452,15 +3452,22 @@ static void fill_page_cache_func(struct work_struct >*work) > atomic_set(&krcp->work_in_progress, 0); > } > >+static bool backoff_page_cache_fill; >+ > static void > run_page_cache_worker(struct kfree_rcu_cpu *krcp) > { > if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && > !atomic_xchg(&krcp->work_in_progress, 1)) { >- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, >- HRTIMER_MODE_REL); >- krcp->hrtimer.function = schedule_page_work_fn; >- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); >+ if (READ_ONCE(backoff_page_cache_fill)) { >+ queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, >HZ); >+ WRITE_ONCE(backoff_page_cache_fill, false); >+ } else { >+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, >+ HRTIMER_MODE_REL); >+ krcp->hrtimer.function = schedule_page_work_fn; >+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); >+ } > } > } > >@@ -3644,6 +3651,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct >shrink_control *sc) > raw_spin_unlock_irqrestore(&krcp->lock, flags); > } > >+ // Low memory condition, limit a page cache worker activity. >+ WRITE_ONCE(backoff_page_cache_fill, true); > return count; > } >@@ -4634,7 +4643,7 @@ static void __init kfree_rcu_batch_init(void) > } > > INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); >- INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); >+ INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); > krcp->initialized = true; > } > if (register_shrinker(&kfree_rcu_shrinker)) > >below patch fixes it. We should backoff the page fill worker keeping it empty >until the situation with memory consumption is normalized. Maybe can cancel the timer and cancel work in the rcu_shrink_count function, after the rcu_shrink_scan function is executed and recover timer. What do you think? > >Any thoughts ideas? > >-- >Vlad Rezki