Re: [PATCH 23/27] libxfs: use PSI information to detect memory pressure

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Thu, 15 Oct 2020 10:56:11 -0700

On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> The buffer cache needs to have a reliable trigger for shrinking
> the cache. Modern kernels track and report memory pressure events to
> the userspace via the Pressure Stall Interface (PSI). Create a PSI
> memory pressure monitoring thread to listen for memory pressure
> events and use that to drive buffer cache shrinking interfaces.
> 
> Add the shrinker framework that will allow us to implement LRU
> reclaim of buffers when memory pressure occues.  We also create a
> low memory detection and reclaim wait mechanism to allow use to
> throttle back new allocations while we are shrinking the buffer
> cache.
> 
> We also include malloc heap trimming callouts so that once the
> shrinker frees the memory, we trim the malloc heap to release the
> freed memory back to the system.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  libxfs/buftarg.c     | 142 ++++++++++++++++++++++++++++++++++++++++++-
>  libxfs/xfs_buftarg.h |   9 +++
>  2 files changed, 150 insertions(+), 1 deletion(-)
> 
> diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
> index 42806e433715..6c7142d41eb1 100644
> --- a/libxfs/buftarg.c
> +++ b/libxfs/buftarg.c
> @@ -62,6 +62,128 @@ xfs_buftarg_setsize_early(
>  	return xfs_buftarg_setsize(btp, bsize);
>  }
>  
> +/*
> + * Scan a chunk of the buffer cache and drop LRU reference counts. If the
> + * count goes to zero, dispose of the buffer.
> + */
> +static void
> +xfs_buftarg_shrink(
> +	struct xfs_buftarg	*btc)
> +{
> +	/*
> +	 * Make the fact we are in memory reclaim externally visible. This
> +	 * allows buffer cache allocation throttling while we are trying to
> +	 * free memory.
> +	 */
> +	atomic_inc_return(&btc->bt_low_mem);
> +
> +	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
> +
> +	/*
> +	 * Now we've free a bunch of memory, trim the heap down to release the
> +	 * freed memory back to the kernel and reduce the pressure we are
> +	 * placing on the system.
> +	 */
> +	malloc_trim(0);
> +
> +	/*
> +	 * Done, wake anyone waiting on memory reclaim to complete.
> +	 */
> +	atomic_dec_return(&btc->bt_low_mem);
> +	complete(&btc->bt_low_mem_wait);
> +}
> +
> +static void *
> +xfs_buftarg_shrinker(
> +	void			*args)
> +{
> +	struct xfs_buftarg	*btp = args;
> +	struct pollfd		 fds = {
> +		.fd = btp->bt_psi_fd,
> +		.events = POLLPRI,
> +	};
> +
> +	rcu_register_thread();
> +	while (!btp->bt_exiting) {
> +		int	n;
> +
> +		n = poll(&fds, 1, 100);
> +		if (n == 0)
> +			continue;	/* timeout */
> +		if (n < 0) {
> +			perror("poll(PSI)");
> +			break;
> +		}
> +		if (fds.revents & POLLERR) {
> +			fprintf(stderr,
> +				"poll(psi) POLLERR: event source dead?\n");
> +			break;
> +		}
> +		if (!(fds.revents & POLLPRI)) {
> +			fprintf(stderr,
> +				"poll(psi): unknown event.  Ignoring.\n");
> +			continue;
> +		}
> +
> +		/* run the shrinker here */
> +		xfs_buftarg_shrink(btp);
> +
> +	}
> +	rcu_unregister_thread();
> +	return NULL;
> +}
> +
> +/*
> + * This only picks up on global memory pressure. Maybe in future we can detect
> + * whether we are running inside a container and use the PSI information for the
> + * container.
> + *
> + * We want relatively early notification of memory pressure stalls because
> + * xfs_repair will consume lots of memory. Hence set a low trigger threshold for
> + * reclaim to run - a partial stall of 5ms over a 1s sample period will trigger

The trigger string looks like it's configuring for a partial stall of
10ms over a 1s sample period?

> + * reclaim algorithms.
> + */
> +static int
> +xfs_buftarg_mempressue_init(

xfs_buftarg_mempressure_init() ?

> +	struct xfs_buftarg	*btp)
> +{
> +	const char		*fname = "/proc/pressure/memory";
> +	const char		*trigger = "some 10000 1000000";
> +	int			error;
> +
> +	btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK);
> +	if (btp->bt_psi_fd < 0) {
> +		perror("open(PSI)");
> +		return -errno;
> +	}
> +	if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) !=
> +						strlen(trigger) + 1) {
> +		perror("write(PSI)");
> +		error = -errno;
> +		goto out_close;
> +	}
> +
> +	atomic_set(&btp->bt_low_mem, 0);
> +	init_completion(&btp->bt_low_mem_wait);
> +
> +	/*
> +	 * Now create the monitoring reclaim thread. This will run until the
> +	 * buftarg is torn down.
> +	 */
> +	error = pthread_create(&btp->bt_psi_tid, NULL,
> +				xfs_buftarg_shrinker, btp);
> +	if (error)
> +		goto out_close;
> +
> +	return 0;
> +
> +out_close:
> +	close(btp->bt_psi_fd);
> +	btp->bt_psi_fd = -1;
> +	return error;
> +}
> +
> +
>  struct xfs_buftarg *
>  xfs_buftarg_alloc(
>  	struct xfs_mount	*mp,
> @@ -74,6 +196,8 @@ xfs_buftarg_alloc(
>  	btp->bt_mount = mp;
>  	btp->bt_fd = libxfs_device_to_fd(bdev);
>  	btp->bt_bdev = bdev;
> +	btp->bt_psi_fd = -1;
> +	btp->bt_exiting = false;
>  
>  	if (xfs_buftarg_setsize_early(btp))
>  		goto error_free;
> @@ -84,8 +208,13 @@ xfs_buftarg_alloc(
>  	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
>  		goto error_lru;
>  
> +	if (xfs_buftarg_mempressue_init(btp))

So what happens if PSI isn't enabled or procfs isn't mounted yet?
xfs_repair just ... fails?  That seems disappointing, particularly if
the admin is trying to fix a dead root fs from the initramfs premount
shell and /proc isn't set up yet.

Hmm, looks like Debian actually /does/ set up procfs nowadays.  Still,
if we're going to add a hard requirement on CONFIG_PSI=y and
CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly.

(Personally, I thought that if there's no pressure stall information,
we'd just fall back to not having a shrinker and daring the system to
OOM us like it does now...)

--D

> +		goto error_pcp;
> +
>  	return btp;
>  
> +error_pcp:
> +	percpu_counter_destroy(&btp->bt_io_count);
>  error_lru:
>  	list_lru_destroy(&btp->bt_lru);
>  error_free:
> @@ -97,6 +226,12 @@ void
>  xfs_buftarg_free(
>  	struct xfs_buftarg	*btp)
>  {
> +	btp->bt_exiting = true;
> +	if (btp->bt_psi_tid)
> +		pthread_join(btp->bt_psi_tid, NULL);
> +	if (btp->bt_psi_fd >= 0)
> +		close(btp->bt_psi_fd);
> +
>  	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
>  	percpu_counter_destroy(&btp->bt_io_count);
>  	platform_flush_device(btp->bt_fd, btp->bt_bdev);
> @@ -121,10 +256,15 @@ xfs_buf_allocate_memory(
>  	struct xfs_buf		*bp,
>  	uint			flags)
>  {
> +	struct xfs_buftarg	*btp = bp->b_target;
>  	size_t			size;
>  
> +	/* Throttle allocation while dealing with low memory events */
> +	while (atomic_read(&btp->bt_low_mem))
> +		wait_for_completion(&btp->bt_low_mem_wait);
> +
>  	size = BBTOB(bp->b_length);
> -	bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size);
> +	bp->b_addr = memalign(btp->bt_meta_sectorsize, size);
>  	if (!bp->b_addr)
>  		return -ENOMEM;
>  	return 0;
> diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h
> index 798980fdafeb..d2ce47e22545 100644
> --- a/libxfs/xfs_buftarg.h
> +++ b/libxfs/xfs_buftarg.h
> @@ -41,7 +41,16 @@ struct xfs_buftarg {
>  
>  	uint32_t		bt_io_count;
>  	unsigned int		flags;
> +
> +	/*
> +	 * Memory pressure (PSI) and cache reclaim infrastructure
> +	 */
>  	struct list_lru		bt_lru;
> +	int			bt_psi_fd;
> +	pthread_t		bt_psi_tid;
> +	bool			bt_exiting;
> +	bool			bt_low_mem;
> +	struct completion	bt_low_mem_wait;
>  };
>  
>  /* We purged a dirty buffer and lost a write. */
> -- 
> 2.28.0
>