Re: [PATCH v5 1/2] fsx: support reads/writes from buffers backed by hugepages

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Feb 2, 2025 at 6:25 AM Zorro Lang <zlang@xxxxxxxxxx> wrote:
>
> On Tue, Jan 21, 2025 at 01:56:40PM -0800, Joanne Koong wrote:
> > Add support for reads/writes from buffers backed by hugepages.
> > This can be enabled through the '-h' flag. This flag should only be used
> > on systems where THP capabilities are enabled.
> >
> > This is motivated by a recent bug that was due to faulty handling of
> > userspace buffers backed by hugepages. This patch is a mitigation
> > against problems like this in the future.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx>
> > Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx>
> > Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>
> > ---
>
> Those two test cases fail on old system which doesn't support
> MADV_COLLAPSE:
>
>    fsx -N 10000 -l 500000 -h
>   -fsx -N 10000 -o 8192 -l 500000 -h
>   -fsx -N 10000 -o 128000 -l 500000 -h
>   +MADV_COLLAPSE not supported. Can't support -h
>
> and
>
>    fsx -N 10000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   -fsx -N 10000 -o 8192 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   -fsx -N 10000 -o 128000 -l 500000 -r PSIZE -t BSIZE -w BSIZE -Z -R -W -h
>   +mapped writes DISABLED
>   +MADV_COLLAPSE not supported. Can't support -h
>
> I'm wondering ...
>
> >  ltp/fsx.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> >  1 file changed, 153 insertions(+), 13 deletions(-)
> >
> > diff --git a/ltp/fsx.c b/ltp/fsx.c
> > index 41933354..3be383c6 100644
> > --- a/ltp/fsx.c
> > +++ b/ltp/fsx.c
> > @@ -190,6 +190,16 @@ int      o_direct;                       /* -Z */
> >  int  aio = 0;
> >  int  uring = 0;
> >  int  mark_nr = 0;
> > +int  hugepages = 0;                  /* -h flag */
> > +
> > +/* Stores info needed to periodically collapse hugepages */
> > +struct hugepages_collapse_info {
> > +     void *orig_good_buf;
> > +     long good_buf_size;
> > +     void *orig_temp_buf;
> > +     long temp_buf_size;
> > +};
> > +struct hugepages_collapse_info hugepages_info;
> >
> >  int page_size;
> >  int page_mask;
> > @@ -2471,7 +2481,7 @@ void
> >  usage(void)
> >  {
> >       fprintf(stdout, "usage: %s",
> > -             "fsx [-dfknqxyzBEFHIJKLORWXZ0]\n\
> > +             "fsx [-dfhknqxyzBEFHIJKLORWXZ0]\n\
> >          [-b opnum] [-c Prob] [-g filldata] [-i logdev] [-j logid]\n\
> >          [-l flen] [-m start:end] [-o oplen] [-p progressinterval]\n\
> >          [-r readbdy] [-s style] [-t truncbdy] [-w writebdy]\n\
> > @@ -2483,8 +2493,11 @@ usage(void)
> >       -d: debug output for all operations\n\
> >       -e: pollute post-eof on size changes (default 0)\n\
> >       -f: flush and invalidate cache after I/O\n\
> > -     -g X: write character X instead of random generated data\n\
> > -     -i logdev: do integrity testing, logdev is the dm log writes device\n\
> > +     -g X: write character X instead of random generated data\n"
> > +#ifdef MADV_COLLAPSE
> > +"    -h hugepages: use buffers backed by hugepages for reads/writes\n"
> > +#endif
> > +"    -i logdev: do integrity testing, logdev is the dm log writes device\n\
> >       -j logid: prefix debug log messsages with this id\n\
> >       -k: do not truncate existing file and use its size as upper bound on file size\n\
> >       -l flen: the upper bound on file size (default 262144)\n\
> > @@ -2833,11 +2846,41 @@ __test_fallocate(int mode, const char *mode_str)
> >  #endif
> >  }
> >
> > +/*
> > + * Reclaim may break up hugepages, so do a best-effort collapse every once in
> > + * a while.
> > + */
> > +static void
> > +collapse_hugepages(void)
> > +{
> > +#ifdef MADV_COLLAPSE
> > +     int ret;
> > +
> > +     /* re-collapse every 16k fsxops after we start */
> > +     if (!numops || (numops & ((1U << 14) - 1)))
> > +             return;
> > +
> > +     ret = madvise(hugepages_info.orig_good_buf,
> > +                   hugepages_info.good_buf_size, MADV_COLLAPSE);
> > +     if (ret)
> > +             prt("collapsing hugepages for good_buf failed (numops=%llu): %s\n",
> > +                  numops, strerror(errno));
> > +     ret = madvise(hugepages_info.orig_temp_buf,
> > +                   hugepages_info.temp_buf_size, MADV_COLLAPSE);
> > +     if (ret)
> > +             prt("collapsing hugepages for temp_buf failed (numops=%llu): %s\n",
> > +                  numops, strerror(errno));
> > +#endif
> > +}
> > +
> >  bool
> >  keep_running(void)
> >  {
> >       int ret;
> >
> > +     if (hugepages)
> > +             collapse_hugepages();
> > +
> >       if (deadline.tv_nsec) {
> >               struct timespec now;
> >
> > @@ -2856,6 +2899,103 @@ keep_running(void)
> >       return numops-- != 0;
> >  }
> >
> > +static long
> > +get_hugepage_size(void)
> > +{
> > +     const char str[] = "Hugepagesize:";
> > +     size_t str_len =  sizeof(str) - 1;
> > +     unsigned int hugepage_size = 0;
> > +     char buffer[64];
> > +     FILE *file;
> > +
> > +     file = fopen("/proc/meminfo", "r");
> > +     if (!file) {
> > +             prterr("get_hugepage_size: fopen /proc/meminfo");
> > +             return -1;
> > +     }
> > +     while (fgets(buffer, sizeof(buffer), file)) {
> > +             if (strncmp(buffer, str, str_len) == 0) {
> > +                     sscanf(buffer + str_len, "%u", &hugepage_size);
> > +                     break;
> > +             }
> > +     }
> > +     fclose(file);
> > +     if (!hugepage_size) {
> > +             prterr("get_hugepage_size: failed to find "
> > +                     "hugepage size in /proc/meminfo\n");
> > +             return -1;
> > +     }
> > +
> > +     /* convert from KiB to bytes */
> > +     return hugepage_size << 10;
> > +}
> > +
> > +static void *
> > +init_hugepages_buf(unsigned len, int hugepage_size, int alignment, long *buf_size)
> > +{
> > +     void *buf = NULL;
> > +#ifdef MADV_COLLAPSE
> > +     int ret;
> > +     long size = roundup(len, hugepage_size) + alignment;
> > +
> > +     ret = posix_memalign(&buf, hugepage_size, size);
> > +     if (ret) {
> > +             prterr("posix_memalign for buf");
> > +             return NULL;
> > +     }
> > +     memset(buf, '\0', size);
> > +     ret = madvise(buf, size, MADV_COLLAPSE);
> > +     if (ret) {
> > +             prterr("madvise collapse for buf");
> > +             free(buf);
> > +             return NULL;
> > +     }
> > +
> > +     *buf_size = size;
> > +#endif
> > +     return buf;
> > +}
> > +
> > +static void
> > +init_buffers(void)
> > +{
> > +     int i;
> > +
> > +     original_buf = (char *) malloc(maxfilelen);
> > +     for (i = 0; i < maxfilelen; i++)
> > +             original_buf[i] = random() % 256;
> > +     if (hugepages) {
> > +             long hugepage_size = get_hugepage_size();
> > +             if (hugepage_size == -1) {
> > +                     prterr("get_hugepage_size()");
> > +                     exit(102);
> > +             }
> > +             good_buf = init_hugepages_buf(maxfilelen, hugepage_size, writebdy,
> > +                                           &hugepages_info.good_buf_size);
> > +             if (!good_buf) {
> > +                     prterr("init_hugepages_buf failed for good_buf");
> > +                     exit(103);
> > +             }
> > +             hugepages_info.orig_good_buf = good_buf;
> > +
> > +             temp_buf = init_hugepages_buf(maxoplen, hugepage_size, readbdy,
> > +                                           &hugepages_info.temp_buf_size);
> > +             if (!temp_buf) {
> > +                     prterr("init_hugepages_buf failed for temp_buf");
> > +                     exit(103);
> > +             }
> > +             hugepages_info.orig_temp_buf = temp_buf;
> > +     } else {
> > +             unsigned long good_buf_len = maxfilelen + writebdy;
> > +             unsigned long temp_buf_len = maxoplen + readbdy;
> > +
> > +             good_buf = calloc(1, good_buf_len);
> > +             temp_buf = calloc(1, temp_buf_len);
> > +     }
> > +     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > +     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > +}
> > +
> >  static struct option longopts[] = {
> >       {"replay-ops", required_argument, 0, 256},
> >       {"record-ops", optional_argument, 0, 255},
> > @@ -2883,7 +3023,7 @@ main(int argc, char **argv)
> >       setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
> >
> >       while ((ch = getopt_long(argc, argv,
> > -                              "0b:c:de:fg:i:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> > +                              "0b:c:de:fg:hi:j:kl:m:no:p:qr:s:t:uw:xyABD:EFJKHzCILN:OP:RS:UWXZ",
> >                                longopts, NULL)) != EOF)
> >               switch (ch) {
> >               case 'b':
> > @@ -2916,6 +3056,14 @@ main(int argc, char **argv)
> >               case 'g':
> >                       filldata = *optarg;
> >                       break;
> > +             case 'h':
> > +#ifndef MADV_COLLAPSE
> > +                     fprintf(stderr, "MADV_COLLAPSE not supported. "
> > +                             "Can't support -h\n");
> > +                     exit(86);
> > +#endif
> > +                     hugepages = 1;
> > +                     break;
>
> ...
> if we could change this part to:
>
> #ifdef MADV_COLLAPSE
>         hugepages = 1;
> #endif
>         break;
>
> to avoid the test failures on old systems.
>
> Or any better ideas from you :)

Hi Zorro,

It looks like MADV_COLLAPSE was introduced in kernel version 6.1. What
do you think about skipping generic/758 and generic/759 if the kernel
version is older than 6.1? That to me seems more preferable than the
paste above, as the paste above would execute the test as if it did
test hugepages when in reality it didn't, which would be misleading.


Thanks,
Joanne

>
> Thanks,
> Zorro
>
> >               case 'i':
> >                       integrity = 1;
> >                       logdev = strdup(optarg);
> > @@ -3229,15 +3377,7 @@ main(int argc, char **argv)
> >                       exit(95);
> >               }
> >       }
> > -     original_buf = (char *) malloc(maxfilelen);
> > -     for (i = 0; i < maxfilelen; i++)
> > -             original_buf[i] = random() % 256;
> > -     good_buf = (char *) malloc(maxfilelen + writebdy);
> > -     good_buf = round_ptr_up(good_buf, writebdy, 0);
> > -     memset(good_buf, '\0', maxfilelen);
> > -     temp_buf = (char *) malloc(maxoplen + readbdy);
> > -     temp_buf = round_ptr_up(temp_buf, readbdy, 0);
> > -     memset(temp_buf, '\0', maxoplen);
> > +     init_buffers();
> >       if (lite) {     /* zero entire existing file */
> >               ssize_t written;
> >
> > --
> > 2.47.1
> >
>





[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux