On Tue, 2021-08-03 at 10:07 -0300, Thiago Rafael Becker wrote: > Linux kernel defined the default read ahead to 128KiB, and this is > making read perform poorly. To mitigate it, add readahead as a mount > option that is handled by userspace, with some refactoring included. > > Signed-off-by: Thiago Rafael Becker <trbecker@xxxxxxxxx> > --- > utils/mount/mount.c | 114 ++++++++++++++++++++++++++++++++++++++++-- > -- > utils/mount/nfs.man | 3 ++ > 2 files changed, 108 insertions(+), 9 deletions(-) > > diff --git a/utils/mount/mount.c b/utils/mount/mount.c > index b98f9e00..15076ca8 100644 > --- a/utils/mount/mount.c > +++ b/utils/mount/mount.c > @@ -56,9 +56,11 @@ int nomtab; > int verbose; > int sloppy; > int string; > +static int readahead_kb = 0; > > #define FOREGROUND (0) > #define BACKGROUND (1) > +#define READAHEAD_VALUE_LEN 24 > > static struct option longopts[] = { > { "fake", 0, 0, 'f' }, > @@ -292,6 +294,16 @@ static int add_mtab(char *spec, char > *mount_point, char *fstype, > return result; > } > > +static void append_extra_opt(const char *opt, char *extra_opts, > size_t len) { > + len -= strlen(extra_opts); > + > + if (*extra_opts && --len > 0) > + strcat(extra_opts, ","); > + > + if ((len -= strlen(opt)) > 0) > + strcat(extra_opts, opt); > +} > + > static void parse_opt(const char *opt, int *mask, char *extra_opts, > size_t len) > { > const struct opt_map *om; > @@ -306,13 +318,37 @@ static void parse_opt(const char *opt, int > *mask, char *extra_opts, size_t len) > } > } > > - len -= strlen(extra_opts); > + append_extra_opt(opt, extra_opts, len); > +} > > - if (*extra_opts && --len > 0) > - strcat(extra_opts, ","); > +static void parse_opt_val(const char *opt, const char *val, char > *extra_opts, size_t len) > +{ > + size_t ov_len; > + char *opt_val; > > - if ((len -= strlen(opt)) > 0) > - strcat(extra_opts, opt); > + /* readahead is a special value that is handled by userspace > */ > + if (!strcmp(opt, "readahead")) { > + char *endptr = NULL; > + const char *expected_endptr = val + strlen(val); > + > + readahead_kb = strtol(val, &endptr, 10); > + > + if (endptr != expected_endptr) { > + nfs_error(_("%s: invalid readahead value > %s"), > + progname, val); > + readahead_kb = 0; > + } > + return; > + } > + > + /* Send the option to the kernel. */ > + ov_len = strlen(opt) + strlen(val) + 3; > + opt_val = malloc(sizeof(char) * ov_len); > + snprintf(opt_val, ov_len, ",%s=%s", opt, val); > + > + append_extra_opt(opt_val, extra_opts, len); > + > + free(opt_val); > } > > /* > @@ -325,7 +361,7 @@ static void parse_opts(const char *options, int > *flags, char **extra_opts) > { > if (options != NULL) { > char *opts = xstrdup(options); > - char *opt, *p; > + char *opt, *p, *val = NULL; > size_t len = strlen(opts) + 1; /* include room for a > null */ > int open_quote = 0; > > @@ -341,17 +377,75 @@ static void parse_opts(const char *options, int > *flags, char **extra_opts) > continue; /* still in a quoted > block */ > if (*p == ',') > *p = '\0'; /* terminate the > option item */ > + if (*p == '=') { /* key=val option */ > + if (!val) { > + *p = '\0'; /* terminate > key */ > + val = ++p; /* start the > value */ > + } > + } > > /* end of option item or last item */ > if (*p == '\0' || *(p + 1) == '\0') { > - parse_opt(opt, flags, *extra_opts, > len); > - opt = NULL; > + if (val) { > + parse_opt_val(opt, val, > *extra_opts, len); > + } else > + parse_opt(opt, flags, > *extra_opts, len); > + opt = val = NULL; > } > } > free(opts); > } > } > > +/* > + * Set the read ahead value for the mount point. On failure, uses > the default kernel > + * read ahead value (for new mounts) or the current value (for > remounts). > + */ > +static void set_readahead(const char *mount_point) { > + int error; > + struct statx mp_stat; > + char *mount_point_readahead_file, value[READAHEAD_VALUE_LEN]; > + size_t len; > + int fp; > + > + /* If readahead_kb is unset, or set to 0, do not change the > value */ > + if (!readahead_kb) > + return; > + > + if ((error = statx(0, mount_point, 0, 0, &mp_stat)) != 0) { > + goto out_error; > + } > + > + if (!(mount_point_readahead_file = malloc(PATH_MAX))) { > + error = -ENOMEM; > + goto out_error; > + } > + > + snprintf(mount_point_readahead_file, PATH_MAX, > "/sys/class/bdi/%d:%d/read_ahead_kb", > + mp_stat.stx_dev_major, > mp_stat.stx_dev_minor); > + > + len = snprintf(value, READAHEAD_VALUE_LEN, "%d", > readahead_kb); > + > + if ((fp = open(mount_point_readahead_file, O_WRONLY)) < 0) { > + error = errno; > + goto out_free; > + } > + > + if ((error = write(fp, value, len)) < 0) > + goto out_close; > + > + close(fp); > + return; > + > +out_close: > + close(fp); > +out_free: > + free(mount_point_readahead_file); > +out_error: > + nfs_error(_("%s: unable to set readahead value, using default > kernel value (error = %d)\n"), > + progname, error); > +} > + > static int try_mount(char *spec, char *mount_point, int flags, > char *fs_type, char **extra_opts, char > *mount_opts, > int fake, int bg) > @@ -373,8 +467,10 @@ static int try_mount(char *spec, char > *mount_point, int flags, > if (ret) > return ret; > > - if (!fake) > + if (!fake) { > + set_readahead(mount_point); > print_one(spec, mount_point, fs_type, mount_opts); > + } > > return add_mtab(spec, mount_point, fs_type, flags, > *extra_opts); > } > diff --git a/utils/mount/nfs.man b/utils/mount/nfs.man > index f1b76936..9832a377 100644 > --- a/utils/mount/nfs.man > +++ b/utils/mount/nfs.man > @@ -561,6 +561,9 @@ The > .B sloppy > option is an alternative to specifying > .BR mount.nfs " -s " option. > +.TP 1.5i > +.B readahead=n > +Set the read ahead of the mount to n KiB. This is handled entirely > in userspace and will not appear on mountinfo. If unset or set to 0, > it will not set the a value, using the current value (for a remount) > or the kernel default for a new mount. > > .SS "Options for NFS versions 2 and 3 only" > Use these options, along with the options in the above subsection, There is already a method for doing this: you set up an appropriate entry in /etc/udev/rules.d/. Adding a mount option, particularly one that is NFS only and is handled in userspace rather than by the kernel parser, just causes fragmentation and confusion. If you want to help users configure readahead, I'd suggest focussing on a utility to help them set up the udev rule correctly. -- Trond Myklebust Linux NFS client maintainer, Hammerspace trond.myklebust@xxxxxxxxxxxxxxx