Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode

"Paul E. McKenney" <paulmck@xxxxxxxxxx> · Thu, 7 May 2020 08:49:10 -0700

On Thu, May 07, 2020 at 05:36:47PM +0800, Hillf Danton wrote:
> 
> Hello Paul
> 
> On Wed, 6 May 2020 17:42:40 Paul E. McKenney wrote:
> > 
> > This commit adds a shrinker so as to inform RCU when memory is scarce.
> 
> A simpler hook is added in the logic of kswapd for subscribing the info
> that memory pressure is high, and then on top of it make rcu a subscriber
> by copying your code for the shrinker, wishing it makes a sense to you.
> 
> What's not yet included is to make the hook per node to help make every
> reviewer convinced that memory is becoming tight. Of course without the
> cost of making subscribers node aware.
> 
> Hillf

I must defer to the MM folks on the MM portion of this patch, but early
warning of impending memory pressure would be extremely good.  A few
RCU-related notes inline below, though.

							Thanx, Paul

> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -49,6 +49,16 @@ static inline void set_max_mapnr(unsigne
>  static inline void set_max_mapnr(unsigned long limit) { }
>  #endif
>  
> +/* subscriber of kswapd's memory_pressure_high signal */
> +struct mph_subscriber {
> +	struct list_head node;
> +	void (*info) (void *data);
> +	void *data;
> +};
> +
> +int mph_subscribe(struct mph_subscriber *ms);
> +void mph_unsubscribe(struct mph_subscriber *ms);
> +
>  extern atomic_long_t _totalram_pages;
>  static inline unsigned long totalram_pages(void)
>  {
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3536,6 +3536,40 @@ static bool kswapd_shrink_node(pg_data_t
>  }
>  
>  /*
> + * subscribers of kswapd's signal that memory pressure is high
> + */
> +static LIST_HEAD(mph_subs);
> +static DEFINE_MUTEX(mph_lock);
> +
> +int mph_subscribe(struct mph_subscriber *ms)
> +{
> +	if (!ms->info)
> +		return -EAGAIN;
> +
> +	mutex_lock(&mph_lock);
> +	list_add_tail(&ms->node, &mph_subs);
> +	mutex_unlock(&mph_lock);
> +	return 0;
> +}
> +
> +void mph_unsubscribe(struct mph_subscriber *ms)
> +{
> +	mutex_lock(&mph_lock);
> +	list_del(&ms->node);
> +	mutex_unlock(&mph_lock);
> +}
> +
> +static void kswapd_bbc_mph(void)
> +{
> +	struct mph_subscriber *ms;
> +
> +	mutex_lock(&mph_lock);
> +	list_for_each_entry(ms, &mph_subs, node)
> +		ms->info(ms->data);
> +	mutex_unlock(&mph_lock);
> +}
> +
> +/*
>   * For kswapd, balance_pgdat() will reclaim pages across a node from zones
>   * that are eligible for use by the caller until at least one zone is
>   * balanced.
> @@ -3663,8 +3697,11 @@ restart:
>  		 * If we're getting trouble reclaiming, start doing writepage
>  		 * even in laptop mode.
>  		 */
> -		if (sc.priority < DEF_PRIORITY - 2)
> +		if (sc.priority < DEF_PRIORITY - 2) {
>  			sc.may_writepage = 1;
> +			if (sc.priority == DEF_PRIORITY - 3)
> +				kswapd_bbc_mph();
> +		}
>  
>  		/* Call soft limit reclaim before calling shrink_node. */
>  		sc.nr_scanned = 0;
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -325,6 +325,8 @@ struct rcu_state {
>  	int ncpus_snap;				/* # CPUs seen last time. */
>  	u8 cbovld;				/* Callback overload now? */
>  	u8 cbovldnext;				/* ^        ^  next time? */
> +	u8 mph;					/* mm pressure high signal from kswapd */
> +	unsigned long mph_end;			/* time stamp in jiffies */
>  
>  	unsigned long jiffies_force_qs;		/* Time at which to invoke */
>  						/*  force_quiescent_state(). */
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -52,6 +52,7 @@
>  #include <linux/kprobes.h>
>  #include <linux/gfp.h>
>  #include <linux/oom.h>
> +#include <linux/mm.h>
>  #include <linux/smpboot.h>
>  #include <linux/jiffies.h>
>  #include <linux/slab.h>
> @@ -2314,8 +2315,15 @@ static void force_qs_rnp(int (*f)(struct
>  	struct rcu_data *rdp;
>  	struct rcu_node *rnp;
>  
> -	rcu_state.cbovld = rcu_state.cbovldnext;
> +	rcu_state.cbovld = smp_load_acquire(&rcu_state.mph) ||
> +					rcu_state.cbovldnext;
>  	rcu_state.cbovldnext = false;
> +
> +	if (READ_ONCE(rcu_state.mph) &&
> +	    time_after(jiffies, READ_ONCE(rcu_state.mph_end))) {
> +		WRITE_ONCE(rcu_state.mph, false);
> +		pr_info("%s: Ending OOM-mode grace periods.\n", __func__);
> +	}
>  	rcu_for_each_leaf_node(rnp) {
>  		cond_resched_tasks_rcu_qs();
>  		mask = 0;
> @@ -2643,6 +2651,20 @@ static void check_cb_ovld(struct rcu_dat
>  	raw_spin_unlock_rcu_node(rnp);
>  }
>  
> +static void rcu_mph_info(void *data)

This pointer will always be &rcu_state, so why not ignore the pointer
and use "rcu_state" below?

RCU grace periods are inherently global, so I don't know of any way
for RCU to focus on a given NUMA node.  All or nothing.  But on the
other hand, speeding up RCU grace periods will also help specific
NUMA nodes, so I believe that it is all good.

> +{
> +	struct rcu_state *state = data;
> +
> +	WRITE_ONCE(state->mph_end, jiffies + HZ / 10);
> +	smp_store_release(&state->mph, true);
> +	rcu_force_quiescent_state();
> +}
> +
> +static struct mph_subscriber rcu_mph_subscriber = {
> +	.info = rcu_mph_info,
> +	.data = &rcu_state,

Then this ".data" entry can be omitted, correct?

> +};
> +
>  /* Helper function for call_rcu() and friends.  */
>  static void
>  __call_rcu(struct rcu_head *head, rcu_callback_t func)
> @@ -4036,6 +4058,8 @@ void __init rcu_init(void)
>  		qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
>  	else
>  		qovld_calc = qovld;
> +
> +	mph_subscribe(&rcu_mph_subscriber);
>  }
>  
>  #include "tree_stall.h"
>