On Wed, 2023-09-20 at 10:41 +0200, Christian Brauner wrote: > > > f1 was last written to *after* f2 was last written to. If the timestamp of f1 > > > is then lower than the timestamp of f2, timestamps are fundamentally broken. > > > > > > Many things in user-space depend on timestamps, such as build systems > > > centered around 'make', but also 'find ... -newer ...'. > > > > > > > > > What does breakage with make look like in this situation? The "fuzz" > > here is going to be on the order of a jiffy. The typical case for make > > timestamp comparisons is comparing source files vs. a build target. If > > those are being written nearly simultaneously, then that could be an > > issue, but is that a typical behavior? It seems like it would be hard to > > rely on that anyway, esp. given filesystems like NFS that can do lazy > > writeback. > > > > One of the operating principles with this series is that timestamps can > > be of varying granularity between different files. Note that Linux > > already violates this assumption when you're working across filesystems > > of different types. > > > > As to potential fixes if this is a real problem: > > > > I don't really want to put this behind a mount or mkfs option (à la > > relatime, etc.), but that is one possibility. > > > > I wonder if it would be feasible to just advance the coarse-grained > > current_time whenever we end up updating a ctime with a fine-grained > > timestamp? It might produce some inode write amplification. Files that > > Less than ideal imho. > > If this risks breaking existing workloads by enabling it unconditionally > and there isn't a clear way to detect and handle these situations > without risk of regression then we should move this behind a mount > option. 
> > So how about the following: > > From cb14add421967f6e374eb77c36cc4a0526b10d17 Mon Sep 17 00:00:00 2001 > From: Christian Brauner <brauner@xxxxxxxxxx> > Date: Wed, 20 Sep 2023 10:00:08 +0200 > Subject: [PATCH] vfs: move multi-grain timestamps behind a mount option > > While we initially thought we can do this unconditionally it turns out > that this might break existing workloads that rely on timestamps in very > specific ways and we always knew this was a possibility. Move > multi-grain timestamps behind a vfs mount option. > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > fs/fs_context.c | 18 ++++++++++++++++++ > fs/inode.c | 4 ++-- > fs/proc_namespace.c | 1 + > fs/stat.c | 2 +- > include/linux/fs.h | 4 +++- > 5 files changed, 25 insertions(+), 4 deletions(-) > > diff --git a/fs/fs_context.c b/fs/fs_context.c > index a0ad7a0c4680..dd4dade0bb9e 100644 > --- a/fs/fs_context.c > +++ b/fs/fs_context.c > @@ -44,6 +44,7 @@ static const struct constant_table common_set_sb_flag[] = { > { "mand", SB_MANDLOCK }, > { "ro", SB_RDONLY }, > { "sync", SB_SYNCHRONOUS }, > + { "mgtime", SB_MGTIME }, > { }, > }; > > > @@ -52,18 +53,32 @@ static const struct constant_table common_clear_sb_flag[] = { > { "nolazytime", SB_LAZYTIME }, > { "nomand", SB_MANDLOCK }, > { "rw", SB_RDONLY }, > + { "nomgtime", SB_MGTIME }, > { }, > }; > > > +static inline int check_mgtime(unsigned int token, const struct fs_context *fc) > +{ > + if (token != SB_MGTIME) > + return 0; > + if (!(fc->fs_type->fs_flags & FS_MGTIME)) > + return invalf(fc, "Filesystem doesn't support multi-grain timestamps"); > + return 0; > +} > + > /* > * Check for a common mount option that manipulates s_flags. 
> */ > static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) > { > unsigned int token; > + int ret; > > > token = lookup_constant(common_set_sb_flag, key, 0); > if (token) { > + ret = check_mgtime(token, fc); > + if (ret) > + return ret; > fc->sb_flags |= token; > fc->sb_flags_mask |= token; > return 0; > @@ -71,6 +86,9 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) > > > token = lookup_constant(common_clear_sb_flag, key, 0); > if (token) { > + ret = check_mgtime(token, fc); > + if (ret) > + return ret; > fc->sb_flags &= ~token; > fc->sb_flags_mask |= token; > return 0; > diff --git a/fs/inode.c b/fs/inode.c > index 54237f4242ff..fd1a2390aaa3 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -2141,7 +2141,7 @@ EXPORT_SYMBOL(current_mgtime); > > > static struct timespec64 current_ctime(struct inode *inode) > { > - if (is_mgtime(inode)) > + if (IS_MGTIME(inode)) > return current_mgtime(inode); > return current_time(inode); > } > @@ -2588,7 +2588,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) > now = current_time(inode); > > > /* Just copy it into place if it's not multigrain */ > - if (!is_mgtime(inode)) { > + if (!IS_MGTIME(inode)) { > inode_set_ctime_to_ts(inode, now); > return now; > } > diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c > index 250eb5bf7b52..08f5bf4d2c6c 100644 > --- a/fs/proc_namespace.c > +++ b/fs/proc_namespace.c > @@ -49,6 +49,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) > { SB_DIRSYNC, ",dirsync" }, > { SB_MANDLOCK, ",mand" }, > { SB_LAZYTIME, ",lazytime" }, > + { SB_MGTIME, ",mgtime" }, > { 0, NULL } > }; > const struct proc_fs_opts *fs_infop; > diff --git a/fs/stat.c b/fs/stat.c > index 6e60389d6a15..2f18dd5de18b 100644 > --- a/fs/stat.c > +++ b/fs/stat.c > @@ -90,7 +90,7 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, > stat->size = i_size_read(inode); > stat->atime = inode->i_atime; > > > - if (is_mgtime(inode)) { > + if 
(IS_MGTIME(inode)) { > fill_mg_cmtime(stat, request_mask, inode); > } else { > stat->mtime = inode->i_mtime; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 4aeb3fa11927..03e415fb3a7c 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1114,6 +1114,7 @@ extern int send_sigurg(struct fown_struct *fown); > #define SB_NODEV BIT(2) /* Disallow access to device special files */ > #define SB_NOEXEC BIT(3) /* Disallow program execution */ > #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ > +#define SB_MGTIME BIT(5) /* Use multi-grain timestamps */ > #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ > #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ > #define SB_NOATIME BIT(10) /* Do not update access times. */ > @@ -2105,6 +2106,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags > ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) > #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) > #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) > +#define IS_MGTIME(inode) __IS_FLG(inode, SB_MGTIME) > #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) > > > #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) > @@ -2366,7 +2368,7 @@ struct file_system_type { > */ > static inline bool is_mgtime(const struct inode *inode) > { > - return inode->i_sb->s_type->fs_flags & FS_MGTIME; > + return inode->i_sb->s_flags & SB_MGTIME; > } > > > extern struct dentry *mount_bdev(struct file_system_type *fs_type, The mount option looks reasonable. Thanks for throwing together the patch. Maybe in the future we can come up with a way to mitigate the problems and do this unconditionally? Reviewed-by: Jeff Layton <jlayton@xxxxxxxxxx>