On Thu, Oct 15, 2020 at 06:21:51PM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > The buffer cache needs to have a reliable trigger for shrinking > the cache. Modern kernels track and report memory pressure events to > the userspace via the Pressure Stall Interface (PSI). Create a PSI > memory pressure monitoring thread to listen for memory pressure > events and use that to drive buffer cache shrinking interfaces. > > Add the shrinker framework that will allow us to implement LRU > reclaim of buffers when memory pressure occurs. We also create a > low memory detection and reclaim wait mechanism to allow us to > throttle back new allocations while we are shrinking the buffer > cache. > > We also include malloc heap trimming callouts so that once the > shrinker frees the memory, we trim the malloc heap to release the > freed memory back to the system. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > libxfs/buftarg.c | 142 ++++++++++++++++++++++++++++++++++++++++++- > libxfs/xfs_buftarg.h | 9 +++ > 2 files changed, 150 insertions(+), 1 deletion(-) > > diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c > index 42806e433715..6c7142d41eb1 100644 > --- a/libxfs/buftarg.c > +++ b/libxfs/buftarg.c > @@ -62,6 +62,128 @@ xfs_buftarg_setsize_early( > return xfs_buftarg_setsize(btp, bsize); > } > > +/* > + * Scan a chunk of the buffer cache and drop LRU reference counts. If the > + * count goes to zero, dispose of the buffer. > + */ > +static void > +xfs_buftarg_shrink( > + struct xfs_buftarg *btc) > +{ > + /* > + * Make the fact we are in memory reclaim externally visible. This > + * allows buffer cache allocation throttling while we are trying to > + * free memory. > + */ > + atomic_inc_return(&btc->bt_low_mem); > + > + fprintf(stderr, "Got memory pressure event. 
Shrinking caches!\n"); > + > + /* > + * Now we've freed a bunch of memory, trim the heap down to release the > + * freed memory back to the kernel and reduce the pressure we are > + * placing on the system. > + */ > + malloc_trim(0); > + > + /* > + * Done, wake anyone waiting on memory reclaim to complete. > + */ > + atomic_dec_return(&btc->bt_low_mem); > + complete(&btc->bt_low_mem_wait); > +} > + > +static void * > +xfs_buftarg_shrinker( > + void *args) > +{ > + struct xfs_buftarg *btp = args; > + struct pollfd fds = { > + .fd = btp->bt_psi_fd, > + .events = POLLPRI, > + }; > + > + rcu_register_thread(); > + while (!btp->bt_exiting) { > + int n; > + > + n = poll(&fds, 1, 100); > + if (n == 0) > + continue; /* timeout */ > + if (n < 0) { > + perror("poll(PSI)"); > + break; > + } > + if (fds.revents & POLLERR) { > + fprintf(stderr, > + "poll(psi) POLLERR: event source dead?\n"); > + break; > + } > + if (!(fds.revents & POLLPRI)) { > + fprintf(stderr, > + "poll(psi): unknown event. Ignoring.\n"); > + continue; > + } > + > + /* run the shrinker here */ > + xfs_buftarg_shrink(btp); > + > + } > + rcu_unregister_thread(); > + return NULL; > +} > + > +/* > + * This only picks up on global memory pressure. Maybe in future we can detect > + * whether we are running inside a container and use the PSI information for the > + * container. > + * > + * We want relatively early notification of memory pressure stalls because > + * xfs_repair will consume lots of memory. Hence set a low trigger threshold for > + * reclaim to run - a partial stall of 5ms over a 1s sample period will trigger The trigger string looks like it's configuring for a partial stall of 10ms over a 1s sample period? > + * reclaim algorithms. > + */ > +static int > +xfs_buftarg_mempressue_init( xfs_buftarg_mempressure_init() ? 
> + struct xfs_buftarg *btp) > +{ > + const char *fname = "/proc/pressure/memory"; > + const char *trigger = "some 10000 1000000"; > + int error; > + > + btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK); > + if (btp->bt_psi_fd < 0) { > + perror("open(PSI)"); > + return -errno; > + } > + if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) != > + strlen(trigger) + 1) { > + perror("write(PSI)"); > + error = -errno; > + goto out_close; > + } > + > + atomic_set(&btp->bt_low_mem, 0); > + init_completion(&btp->bt_low_mem_wait); > + > + /* > + * Now create the monitoring reclaim thread. This will run until the > + * buftarg is torn down. > + */ > + error = pthread_create(&btp->bt_psi_tid, NULL, > + xfs_buftarg_shrinker, btp); > + if (error) > + goto out_close; > + > + return 0; > + > +out_close: > + close(btp->bt_psi_fd); > + btp->bt_psi_fd = -1; > + return error; > +} > + > + > struct xfs_buftarg * > xfs_buftarg_alloc( > struct xfs_mount *mp, > @@ -74,6 +196,8 @@ xfs_buftarg_alloc( > btp->bt_mount = mp; > btp->bt_fd = libxfs_device_to_fd(bdev); > btp->bt_bdev = bdev; > + btp->bt_psi_fd = -1; > + btp->bt_exiting = false; > > if (xfs_buftarg_setsize_early(btp)) > goto error_free; > @@ -84,8 +208,13 @@ xfs_buftarg_alloc( > if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) > goto error_lru; > > + if (xfs_buftarg_mempressue_init(btp)) So what happens if PSI isn't enabled or procfs isn't mounted yet? xfs_repair just ... fails? That seems disappointing, particularly if the admin is trying to fix a dead root fs from the initramfs premount shell and /proc isn't set up yet. Hmm, looks like Debian actually /does/ set up procfs nowadays. Still, if we're going to add a hard requirement on CONFIG_PSI=y and CONFIG_PSI_DEFAULT_DISABLED=n, we need to advertise this kind of loudly. (Personally, I thought that if there's no pressure stall information, we'd just fall back to not having a shrinker and daring the system to OOM us like it does now...) 
--D > + goto error_pcp; > + > return btp; > > +error_pcp: > + percpu_counter_destroy(&btp->bt_io_count); > error_lru: > list_lru_destroy(&btp->bt_lru); > error_free: > @@ -97,6 +226,12 @@ void > xfs_buftarg_free( > struct xfs_buftarg *btp) > { > + btp->bt_exiting = true; > + if (btp->bt_psi_tid) > + pthread_join(btp->bt_psi_tid, NULL); > + if (btp->bt_psi_fd >= 0) > + close(btp->bt_psi_fd); > + > ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); > percpu_counter_destroy(&btp->bt_io_count); > platform_flush_device(btp->bt_fd, btp->bt_bdev); > @@ -121,10 +256,15 @@ xfs_buf_allocate_memory( > struct xfs_buf *bp, > uint flags) > { > + struct xfs_buftarg *btp = bp->b_target; > size_t size; > > + /* Throttle allocation while dealing with low memory events */ > + while (atomic_read(&btp->bt_low_mem)) > + wait_for_completion(&btp->bt_low_mem_wait); > + > size = BBTOB(bp->b_length); > - bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size); > + bp->b_addr = memalign(btp->bt_meta_sectorsize, size); > if (!bp->b_addr) > return -ENOMEM; > return 0; > diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h > index 798980fdafeb..d2ce47e22545 100644 > --- a/libxfs/xfs_buftarg.h > +++ b/libxfs/xfs_buftarg.h > @@ -41,7 +41,16 @@ struct xfs_buftarg { > > uint32_t bt_io_count; > unsigned int flags; > + > + /* > + * Memory pressure (PSI) and cache reclaim infrastructure > + */ > struct list_lru bt_lru; > + int bt_psi_fd; > + pthread_t bt_psi_tid; > + bool bt_exiting; > + bool bt_low_mem; > + struct completion bt_low_mem_wait; > }; > > /* We purged a dirty buffer and lost a write. */ > -- > 2.28.0 >