From: Dave Chinner <dchinner@xxxxxxxxxx>

The buffer cache needs to have a reliable trigger for shrinking the
cache. Modern kernels track and report memory pressure events to
userspace via the Pressure Stall Information (PSI) interface. Create a
PSI memory pressure monitoring thread to listen for memory pressure
events and use that to drive the buffer cache shrinking interfaces.

Add the shrinker framework that will allow us to implement LRU reclaim
of buffers when memory pressure occurs. We also create a low memory
detection and reclaim wait mechanism to allow us to throttle back new
allocations while we are shrinking the buffer cache.

We also include malloc heap trimming callouts so that once the shrinker
frees the memory, we trim the malloc heap to release the freed memory
back to the system.

Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
---
 libxfs/buftarg.c     | 142 ++++++++++++++++++++++++++++++++++++++++++-
 libxfs/xfs_buftarg.h |   9 +++
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/libxfs/buftarg.c b/libxfs/buftarg.c
index 42806e433715..6c7142d41eb1 100644
--- a/libxfs/buftarg.c
+++ b/libxfs/buftarg.c
@@ -62,6 +62,128 @@ xfs_buftarg_setsize_early(
 	return xfs_buftarg_setsize(btp, bsize);
 }
 
+/*
+ * Scan a chunk of the buffer cache and drop LRU reference counts. If the
+ * count goes to zero, dispose of the buffer.
+ */
+static void
+xfs_buftarg_shrink(
+	struct xfs_buftarg	*btc)
+{
+	/*
+	 * Make the fact we are in memory reclaim externally visible. This
+	 * allows buffer cache allocation throttling while we are trying to
+	 * free memory.
+	 */
+	atomic_inc_return(&btc->bt_low_mem);
+
+	fprintf(stderr, "Got memory pressure event. Shrinking caches!\n");
+
+	/*
+	 * Now we've freed a bunch of memory, trim the heap down to release
+	 * the freed memory back to the kernel and reduce the pressure we are
+	 * placing on the system.
+	 */
+	malloc_trim(0);
+
+	/*
+	 * Done, wake anyone waiting on memory reclaim to complete.
+	 */
+	atomic_dec_return(&btc->bt_low_mem);
+	complete(&btc->bt_low_mem_wait);
+}
+
+static void *
+xfs_buftarg_shrinker(
+	void			*args)
+{
+	struct xfs_buftarg	*btp = args;
+	struct pollfd		fds = {
+		.fd = btp->bt_psi_fd,
+		.events = POLLPRI,
+	};
+
+	rcu_register_thread();
+	while (!btp->bt_exiting) {
+		int	n;
+
+		n = poll(&fds, 1, 100);
+		if (n == 0)
+			continue;	/* timeout */
+		if (n < 0) {
+			perror("poll(PSI)");
+			break;
+		}
+		if (fds.revents & POLLERR) {
+			fprintf(stderr,
+				"poll(psi) POLLERR: event source dead?\n");
+			break;
+		}
+		if (!(fds.revents & POLLPRI)) {
+			fprintf(stderr,
+				"poll(psi): unknown event. Ignoring.\n");
+			continue;
+		}
+
+		/* run the shrinker here */
+		xfs_buftarg_shrink(btp);
+
+	}
+	rcu_unregister_thread();
+	return NULL;
+}
+
+/*
+ * This only picks up on global memory pressure. Maybe in future we can detect
+ * whether we are running inside a container and use the PSI information for
+ * the container.
+ *
+ * We want relatively early notification of memory pressure stalls because
+ * xfs_repair will consume lots of memory. Hence set a low trigger threshold
+ * for reclaim to run - a partial stall of 10ms over a 1s sample period will
+ * trigger reclaim algorithms.
+ */
+static int
+xfs_buftarg_mempressure_init(
+	struct xfs_buftarg	*btp)
+{
+	const char		*fname = "/proc/pressure/memory";
+	const char		*trigger = "some 10000 1000000";
+	int			error;
+
+	btp->bt_psi_fd = open(fname, O_RDWR | O_NONBLOCK);
+	if (btp->bt_psi_fd < 0) {
+		perror("open(PSI)");
+		return -errno;
+	}
+	if (write(btp->bt_psi_fd, trigger, strlen(trigger) + 1) !=
+					strlen(trigger) + 1) {
+		perror("write(PSI)");
+		error = -errno;
+		goto out_close;
+	}
+
+	atomic_set(&btp->bt_low_mem, 0);
+	init_completion(&btp->bt_low_mem_wait);
+
+	/*
+	 * Now create the monitoring reclaim thread. This will run until the
+	 * buftarg is torn down.
+	 */
+	error = pthread_create(&btp->bt_psi_tid, NULL,
+			xfs_buftarg_shrinker, btp);
+	if (error)
+		goto out_close;
+
+	return 0;
+
+out_close:
+	close(btp->bt_psi_fd);
+	btp->bt_psi_fd = -1;
+	return error;
+}
+
+
 struct xfs_buftarg *
 xfs_buftarg_alloc(
 	struct xfs_mount	*mp,
@@ -74,6 +196,8 @@ xfs_buftarg_alloc(
 	btp->bt_mount = mp;
 	btp->bt_fd = libxfs_device_to_fd(bdev);
 	btp->bt_bdev = bdev;
+	btp->bt_psi_fd = -1;
+	btp->bt_exiting = false;
 
 	if (xfs_buftarg_setsize_early(btp))
 		goto error_free;
@@ -84,8 +208,13 @@ xfs_buftarg_alloc(
 	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
 		goto error_lru;
 
+	if (xfs_buftarg_mempressure_init(btp))
+		goto error_pcp;
+
 	return btp;
 
+error_pcp:
+	percpu_counter_destroy(&btp->bt_io_count);
 error_lru:
 	list_lru_destroy(&btp->bt_lru);
 error_free:
@@ -97,6 +226,12 @@ void
 xfs_buftarg_free(
 	struct xfs_buftarg	*btp)
 {
+	btp->bt_exiting = true;
+	if (btp->bt_psi_tid)
+		pthread_join(btp->bt_psi_tid, NULL);
+	if (btp->bt_psi_fd >= 0)
+		close(btp->bt_psi_fd);
+
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
 	platform_flush_device(btp->bt_fd, btp->bt_bdev);
@@ -121,10 +256,15 @@ xfs_buf_allocate_memory(
 	struct xfs_buf		*bp,
 	uint			flags)
 {
+	struct xfs_buftarg	*btp = bp->b_target;
 	size_t			size;
 
+	/* Throttle allocation while dealing with low memory events */
+	while (atomic_read(&btp->bt_low_mem))
+		wait_for_completion(&btp->bt_low_mem_wait);
+
 	size = BBTOB(bp->b_length);
-	bp->b_addr = memalign(bp->b_target->bt_meta_sectorsize, size);
+	bp->b_addr = memalign(btp->bt_meta_sectorsize, size);
 	if (!bp->b_addr)
 		return -ENOMEM;
 	return 0;
diff --git a/libxfs/xfs_buftarg.h b/libxfs/xfs_buftarg.h
index 798980fdafeb..d2ce47e22545 100644
--- a/libxfs/xfs_buftarg.h
+++ b/libxfs/xfs_buftarg.h
@@ -41,7 +41,16 @@ struct xfs_buftarg {
 
 	uint32_t		bt_io_count;
 	unsigned int		flags;
+
+	/*
+	 * Memory pressure (PSI) and cache reclaim infrastructure
+	 */
 	struct list_lru		bt_lru;
+	int			bt_psi_fd;
+	pthread_t		bt_psi_tid;
+	bool			bt_exiting;
+	atomic_t		bt_low_mem;
+	struct completion	bt_low_mem_wait;
 };
 
 /* We purged a dirty buffer and lost a write. */
-- 
2.28.0
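
For reference, here is a minimal, standalone sketch (not part of the patch)
of the PSI trigger registration and polling sequence that the monitoring
thread above relies on. The function name demo_psi_wait_for_pressure is
hypothetical; the trigger format and the open/write/poll steps follow the
kernel's PSI documentation (Documentation/accounting/psi.rst).

#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Block until one memory pressure event fires, then return 0. */
static int
demo_psi_wait_for_pressure(void)
{
	/* "some 10000 1000000": 10ms of partial stall within a 1s window */
	const char	*trigger = "some 10000 1000000";
	struct pollfd	fds;
	int		fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0)
		return -errno;

	/* Register the trigger; the terminating NUL must be written too. */
	if (write(fd, trigger, strlen(trigger) + 1) < 0) {
		close(fd);
		return -errno;
	}

	fds.fd = fd;
	fds.events = POLLPRI;

	/* POLLPRI signals the event; POLLERR means the fd is unusable. */
	if (poll(&fds, 1, -1) < 0 || (fds.revents & POLLERR)) {
		close(fd);
		return -EIO;
	}

	fprintf(stderr, "memory pressure event received\n");
	close(fd);
	return 0;
}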