On Thu, May 07, 2020 at 05:36:47PM +0800, Hillf Danton wrote: > > Hello Paul > > On Wed, 6 May 2020 17:42:40 Paul E. McKenney wrote: > > > > This commit adds a shrinker so as to inform RCU when memory is scarce. > > A simpler hook is added in the logic of kswapd for subscribing the info > that memory pressure is high, and then on top of it make rcu a subscriber > by copying your code for the shrinker, wishing it makes a sense to you. > > What's not yet included is to make the hook per node to help make every > reviewer convinced that memory is becoming tight. Of course without the > cost of making subscribers node aware. > > Hillf I must defer to the MM folks on the MM portion of this patch, but early warning of impending memory pressure would be extremely good. A few RCU-related notes inline below, though. Thanx, Paul > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -49,6 +49,16 @@ static inline void set_max_mapnr(unsigne > static inline void set_max_mapnr(unsigned long limit) { } > #endif > > +/* subscriber of kswapd's memory_pressure_high signal */ > +struct mph_subscriber { > + struct list_head node; > + void (*info) (void *data); > + void *data; > +}; > + > +int mph_subscribe(struct mph_subscriber *ms); > +void mph_unsubscribe(struct mph_subscriber *ms); > + > extern atomic_long_t _totalram_pages; > static inline unsigned long totalram_pages(void) > { > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -3536,6 +3536,40 @@ static bool kswapd_shrink_node(pg_data_t > } > > /* > + * subscribers of kswapd's signal that memory pressure is high > + */ > +static LIST_HEAD(mph_subs); > +static DEFINE_MUTEX(mph_lock); > + > +int mph_subscribe(struct mph_subscriber *ms) > +{ > + if (!ms->info) > + return -EAGAIN; > + > + mutex_lock(&mph_lock); > + list_add_tail(&ms->node, &mph_subs); > + mutex_unlock(&mph_lock); > + return 0; > +} > + > +void mph_unsubscribe(struct mph_subscriber *ms) > +{ > + mutex_lock(&mph_lock); > + list_del(&ms->node); > + mutex_unlock(&mph_lock); > +} > + > +static void kswapd_bbc_mph(void) > +{ > + struct mph_subscriber *ms; > + > + mutex_lock(&mph_lock); > + list_for_each_entry(ms, &mph_subs, node) > + ms->info(ms->data); > + mutex_unlock(&mph_lock); > +} > + > +/* > * For kswapd, balance_pgdat() will reclaim pages across a node from zones > * that are eligible for use by the caller until at least one zone is > * balanced. > @@ -3663,8 +3697,11 @@ restart: > * If we're getting trouble reclaiming, start doing writepage > * even in laptop mode. > */ > - if (sc.priority < DEF_PRIORITY - 2) > + if (sc.priority < DEF_PRIORITY - 2) { > sc.may_writepage = 1; > + if (sc.priority == DEF_PRIORITY - 3) > + kswapd_bbc_mph(); > + } > > /* Call soft limit reclaim before calling shrink_node. */ > sc.nr_scanned = 0; > --- a/kernel/rcu/tree.h > +++ b/kernel/rcu/tree.h > @@ -325,6 +325,8 @@ struct rcu_state { > int ncpus_snap; /* # CPUs seen last time. */ > u8 cbovld; /* Callback overload now? */ > u8 cbovldnext; /* ^ ^ next time? */ > + u8 mph; /* mm pressure high signal from kswapd */ > + unsigned long mph_end; /* time stamp in jiffies */ > > unsigned long jiffies_force_qs; /* Time at which to invoke */ > /* force_quiescent_state(). */ > --- a/kernel/rcu/tree.c > +++ b/kernel/rcu/tree.c > @@ -52,6 +52,7 @@ > #include <linux/kprobes.h> > #include <linux/gfp.h> > #include <linux/oom.h> > +#include <linux/mm.h> > #include <linux/smpboot.h> > #include <linux/jiffies.h> > #include <linux/slab.h> > @@ -2314,8 +2315,15 @@ static void force_qs_rnp(int (*f)(struct > struct rcu_data *rdp; > struct rcu_node *rnp; > > - rcu_state.cbovld = rcu_state.cbovldnext; > + rcu_state.cbovld = smp_load_acquire(&rcu_state.mph) || > + rcu_state.cbovldnext; > rcu_state.cbovldnext = false; > + > + if (READ_ONCE(rcu_state.mph) && > + time_after(jiffies, READ_ONCE(rcu_state.mph_end))) { > + WRITE_ONCE(rcu_state.mph, false); > + pr_info("%s: Ending OOM-mode grace periods.\n", __func__); > + } > rcu_for_each_leaf_node(rnp) { > cond_resched_tasks_rcu_qs(); > mask = 0; > @@ -2643,6 +2651,20 @@ static void check_cb_ovld(struct rcu_dat > raw_spin_unlock_rcu_node(rnp); > } > > +static void rcu_mph_info(void *data) This pointer will always be &rcu_state, so why not ignore the pointer and use "rcu_state" below? RCU grace periods are inherently global, so I don't know of any way for RCU to focus on a given NUMA node. All or nothing. But on the other hand, speeding up RCU grace periods will also help specific NUMA nodes, so I believe that it is all good. > +{ > + struct rcu_state *state = data; > + > + WRITE_ONCE(state->mph_end, jiffies + HZ / 10); > + smp_store_release(&state->mph, true); > + rcu_force_quiescent_state(); > +} > + > +static struct mph_subscriber rcu_mph_subscriber = { > + .info = rcu_mph_info, > + .data = &rcu_state, Then this ".data" entry can be omitted, correct? > +}; > + > /* Helper function for call_rcu() and friends. */ > static void > __call_rcu(struct rcu_head *head, rcu_callback_t func) > @@ -4036,6 +4058,8 @@ void __init rcu_init(void) > qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; > else > qovld_calc = qovld; > + > + mph_subscribe(&rcu_mph_subscriber); > } > > #include "tree_stall.h" >