Hi, Barry, Barry Song <21cnbao@xxxxxxxxx> writes: > On Sat, Jun 15, 2024 at 2:59 PM Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> wrote: >> >> On Fri, 14 Jun 2024 19:51:11 -0700 Chris Li <chrisl@xxxxxxxxxx> wrote: >> >> > > I'm having trouble understanding the overall impact of this on users. >> > > We fail the mTHP swap allocation and fall back, but things continue to >> > > operate OK? >> > >> > Continue to operate OK in the sense that the mTHP will have to split >> > into 4K pages before the swap out, aka the fall back. The swap out and >> > swap in can continue to work as 4K pages, not as the mTHP. Due to the >> > fallback, the mTHP based zsmalloc compression with 64K buffer will not >> > happen. That is the effect of the fallback. But mTHP swap out and swap >> > in is relatively new, it is not really a regression. >> >> Sure, but it's pretty bad to merge a new feature only to have it >> ineffective after a few hours use. >> >> > > >> > > > There is some test number in the V1 thread of this series: >> > > > https://lore.kernel.org/r/20240524-swap-allocator-v1-0-47861b423b26@xxxxxxxxxx >> > > >> > > Well, please let's get the latest numbers into the latest patchset. >> > > Along with a higher-level (and quantitative) description of the user impact. >> > >> > I will need Barray's help to collect the number. I don't have the >> > setup to reproduce his test result. >> > Maybe a follow up commit message amendment for the test number when I get it? > > Although the issue may seem complex at a systemic level, even a small program can > demonstrate the problem and highlight how Chris's patch has improved the > situation. 
> > To demonstrate this, I designed a basic test program that maximally allocates > two memory blocks: > > * A memory block of up to 60MB, recommended for HUGEPAGE usage > * A memory block of up to 1MB, recommended for NOHUGEPAGE usage > > In the system configuration, I enabled 64KB mTHP and 64MB zRAM, providing more than > enough space for both the 60MB and 1MB allocations in the worst case. This setup > allows us to assess two effects: > > 1. When we don't enable mem2 (small folios), we consistently allocate and free > swap slots aligned with 64KB. whether there is a risk of failure to obtain > swap slots even though the zRAM has sufficient free space? > 2. When we enable mem2 (small folios), the presence of small folios may lead > to fragmentation of clusters, potentially impacting the swapout process for > large folios negatively. > > (2) can be enabled by "-s", without -s, small folios are disabled. > > The script to configure zRAM and mTHP: > > echo lzo > /sys/block/zram0/comp_algorithm > echo 64M > /sys/block/zram0/disksize > echo never > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled > echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled > mkswap /dev/zram0 > swapon /dev/zram0 > > The test program I made today after receiving Chris' patchset v2 > > (Andrew, Please let me know if you want this small test program to > be committed into kernel/tools/ folder. 
If yes, please let me know, > and I will cleanup and prepare a patch): > > #define _GNU_SOURCE > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > #include <string.h> > #include <sys/mman.h> > #include <errno.h> > #include <time.h> > > #define MEMSIZE_MTHP (60 * 1024 * 1024) > #define MEMSIZE_SMALLFOLIO (1 * 1024 * 1024) > #define ALIGNMENT_MTHP (64 * 1024) > #define ALIGNMENT_SMALLFOLIO (4 * 1024) > #define TOTAL_DONTNEED_MTHP (16 * 1024 * 1024) > #define TOTAL_DONTNEED_SMALLFOLIO (256 * 1024) > #define MTHP_FOLIO_SIZE (64 * 1024) > > #define SWPOUT_PATH \ > "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout" > #define SWPOUT_FALLBACK_PATH \ > "/sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback" > > static void *aligned_alloc_mem(size_t size, size_t alignment) > { > void *mem = NULL; > if (posix_memalign(&mem, alignment, size) != 0) { > perror("posix_memalign"); > return NULL; > } > return mem; > } > > static void random_madvise_dontneed(void *mem, size_t mem_size, > size_t align_size, size_t total_dontneed_size) > { > size_t num_pages = total_dontneed_size / align_size; > size_t i; > size_t offset; > void *addr; > > for (i = 0; i < num_pages; ++i) { > offset = (rand() % (mem_size / align_size)) * align_size; > addr = (char *)mem + offset; > if (madvise(addr, align_size, MADV_DONTNEED) != 0) { > perror("madvise dontneed"); > } > memset(addr, 0x11, align_size); > } > } > > static unsigned long read_stat(const char *path) > { > FILE *file; > unsigned long value; > > file = fopen(path, "r"); > if (!file) { > perror("fopen"); > return 0; > } > > if (fscanf(file, "%lu", &value) != 1) { > perror("fscanf"); > fclose(file); > return 0; > } > > fclose(file); > return value; > } > > int main(int argc, char *argv[]) > { > int use_small_folio = 0; > int i; > void *mem1 = aligned_alloc_mem(MEMSIZE_MTHP, ALIGNMENT_MTHP); > if (mem1 == NULL) { > fprintf(stderr, "Failed to allocate 60MB memory\n"); > return EXIT_FAILURE; > } > > 
if (madvise(mem1, MEMSIZE_MTHP, MADV_HUGEPAGE) != 0) { > perror("madvise hugepage for mem1"); > free(mem1); > return EXIT_FAILURE; > } > > for (i = 1; i < argc; ++i) { > if (strcmp(argv[i], "-s") == 0) { > use_small_folio = 1; > } > } > > void *mem2 = NULL; > if (use_small_folio) { > mem2 = aligned_alloc_mem(MEMSIZE_SMALLFOLIO, ALIGNMENT_MTHP); > if (mem2 == NULL) { > fprintf(stderr, "Failed to allocate 1MB memory\n"); > free(mem1); > return EXIT_FAILURE; > } > > if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_NOHUGEPAGE) != 0) { > perror("madvise nohugepage for mem2"); > free(mem1); > free(mem2); > return EXIT_FAILURE; > } > } > > for (i = 0; i < 100; ++i) { > unsigned long initial_swpout; > unsigned long initial_swpout_fallback; > unsigned long final_swpout; > unsigned long final_swpout_fallback; > unsigned long swpout_inc; > unsigned long swpout_fallback_inc; > double fallback_percentage; > > initial_swpout = read_stat(SWPOUT_PATH); > initial_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH); > > random_madvise_dontneed(mem1, MEMSIZE_MTHP, ALIGNMENT_MTHP, > TOTAL_DONTNEED_MTHP); > > if (use_small_folio) { > random_madvise_dontneed(mem2, MEMSIZE_SMALLFOLIO, > ALIGNMENT_SMALLFOLIO, > TOTAL_DONTNEED_SMALLFOLIO); > } > > if (madvise(mem1, MEMSIZE_MTHP, MADV_PAGEOUT) != 0) { > perror("madvise pageout for mem1"); > free(mem1); > if (mem2 != NULL) { > free(mem2); > } > return EXIT_FAILURE; > } > > if (use_small_folio) { > if (madvise(mem2, MEMSIZE_SMALLFOLIO, MADV_PAGEOUT) != 0) { > perror("madvise pageout for mem2"); > free(mem1); > free(mem2); > return EXIT_FAILURE; > } > } > > final_swpout = read_stat(SWPOUT_PATH); > final_swpout_fallback = read_stat(SWPOUT_FALLBACK_PATH); > > swpout_inc = final_swpout - initial_swpout; > swpout_fallback_inc = final_swpout_fallback - initial_swpout_fallback; > > fallback_percentage = (double)swpout_fallback_inc / > (swpout_fallback_inc + swpout_inc) * 100; > > printf("Iteration %d: swpout inc: %lu, swpout fallback inc: %lu, Fallback 
percentage: %.2f%%\n", > i + 1, swpout_inc, swpout_fallback_inc, fallback_percentage); > } > > free(mem1); > if (mem2 != NULL) { > free(mem2); > } > > return EXIT_SUCCESS; > } Thank you very much for your effort to write this test program. TBH, personally, I think that this test program isn't practical enough. Can we show performance difference with some normal workloads? [snip] -- Best Regards, Huang, Ying