On 17. 05. 24, 22:07, Linus Torvalds wrote:
On Fri, 17 May 2024 at 00:54, Jiri Slaby <jirislaby@xxxxxxxxxx> wrote:
inode->i_private = data;
inode->i_flags |= S_PRIVATE;
+ inode->i_mode &= ~S_IFREG;
That is not a sensible operation. S_IFREG isn't a bit mask.
Oh, sure. I just unmasked what libfs' prepare_anon_dentry() set by default.
But it looks like 'anon_inode' traditionally had *no* type bytes at
all. That's literally crazy.
Doing a 'stat -L' on one in /proc/X/fd/Y will correctly say "weird
file" about them.
What a crock. That's horrible, and we apparently never noticed how
broken anon_inodes were because nobody really cared. But then lsof
seems to have done the *opposite* and just said (for unfathomable
reasons) "this can't be a normal regular file".
But I can't actually find that code in lsof. I see
if (rest && rest[0] == '[' && rest[1] == 'p')
fdinfo_mask |= FDINFO_PID;
which only checks that the name starts with '[p'. Hmm.
lsof has just received a fix in the form of:
else if (Lf->ntype == N_REGLR && rest && *rest && strcmp(pbuf,
"pidfd") == 0) {
https://github.com/lsof-org/lsof/pull/319/commits/c1678e3f6e4b4d984cb3078b7bf0c9e24bedb8ca
[ Time passes, I go looking ]
Oh Christ. It's process_proc_node:
Yes, didn't I note it? Hmm, apparently not (or maybe it's hidden in all
those pulls/issues/bugs). But definitely been there, seen that. Sorry.
type = s->st_mode & S_IFMT;
switch (type) {
...
case 0:
if (!strcmp(p, "anon_inode"))
Lf->ntype = Ntype = N_ANON_INODE;
break;
so yes, process_proc_node() really seems to have intentionally noticed
that our anon inodes forgot to put a file type in the st_mode, and
together with the path from readlink matching 'anon_inode' is how lsof
determines it's one of the special inodes.
So yeah, we made a mistake, and then lsof decided that mistake was a feature.
Yes, but we can schedule a removal of this compat handling after some
years...
But that does mean that we probably just have to live in the bed we made.
But that
+ inode->i_mode &= ~S_IFREG;
is still very very wrong. It should use the proper bit mask: S_IFMT.
Either way, I don't like removing that WARN_ON_ONCE() from libfs'
prepare_anon_dentry(). Is it OK to remove this S_IFREG after
path_from_stashed() in pidfs' pidfs_alloc_file()? I.e. after
d_alloc_anon(), d_instantiate(), stash_dentry(), but before dentry_open()?
That looks weird.
Instead, add a sort of LEGACY_DONT_WARN_ABOUT_IFMT to path_from_stashed()?
Dirty, I think.
So what about LEGACY_NO_MODE, which would set "i_mode = 0" and mangle the
WARN_ON appropriately, like in the attached patch? It works (when
applied together with the anon_inode name fix).
And we'd have to add a big comment about our historical stupidity that
we are perpetuating.
And immediately add it to Documentation/ABI/obsolete/?
thanks,
--
js
suse labs
From b005cd96c97684adfabf07c56bd91fabe45c8cb7 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@xxxxxxxxxx>
Date: Mon, 20 May 2024 10:13:44 +0200
Subject: [PATCH] add LEGACY_NO_MODE
Signed-off-by: Jiri Slaby (SUSE) <jirislaby@xxxxxxxxxx>
---
Documentation/ABI/obsolete/libfs-LEGACY_NO_MODE | 8 ++++++++
fs/internal.h | 5 ++++-
fs/libfs.c | 11 ++++++-----
fs/nsfs.c | 4 ++--
fs/pidfs.c | 3 ++-
5 files changed, 22 insertions(+), 9 deletions(-)
create mode 100644 Documentation/ABI/obsolete/libfs-LEGACY_NO_MODE
diff --git a/Documentation/ABI/obsolete/libfs-LEGACY_NO_MODE b/Documentation/ABI/obsolete/libfs-LEGACY_NO_MODE
new file mode 100644
index 000000000000..37ad036b18b2
--- /dev/null
+++ b/Documentation/ABI/obsolete/libfs-LEGACY_NO_MODE
@@ -0,0 +1,8 @@
+What: libfs' LEGACY_NO_MODE
+Date: May 2024
+KernelVersion: 6.10
+Contact: linux-fsdevel@xxxxxxxxxxxxxxx
+Description: LEGACY_NO_MODE was added to mimic the old (wrong) i_mode (zero)
+ of anon_inode when pidfs was moved away from anon_inode to
+ libfs helpers.
+Users: lsof
diff --git a/fs/internal.h b/fs/internal.h
index 7ca738904e34..a7cd0eecc266 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -315,6 +315,9 @@ struct stashed_operations {
void (*put_data)(void *data);
int (*init_inode)(struct inode *inode, void *data);
};
+
+#define LEGACY_NO_MODE BIT(0)
+
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
- struct path *path);
+ struct path *path, unsigned int flags);
void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index b635ee5adbcc..c047aa4f4dac 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2045,11 +2045,12 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
static struct dentry *prepare_anon_dentry(struct dentry **stashed,
struct super_block *sb,
- void *data)
+ void *data, unsigned int flags)
{
struct dentry *dentry;
struct inode *inode;
const struct stashed_operations *sops = sb->s_fs_info;
+ umode_t i_mode;
int ret;
inode = new_inode_pseudo(sb);
@@ -2059,7 +2060,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
}
inode->i_flags |= S_IMMUTABLE;
- inode->i_mode = S_IFREG;
+ inode->i_mode = i_mode = (flags & LEGACY_NO_MODE) ? 0 : S_IFREG;
simple_inode_init_ts(inode);
ret = sops->init_inode(inode, data);
@@ -2069,7 +2070,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
}
/* Notice when this is changed. */
- WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+ WARN_ON_ONCE((inode->i_mode & S_IFMT) != i_mode);
WARN_ON_ONCE(!IS_IMMUTABLE(inode));
dentry = d_alloc_anon(sb);
@@ -2126,7 +2127,7 @@ static struct dentry *stash_dentry(struct dentry **stashed,
* Return: On success zero and on failure a negative error is returned.
*/
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
- struct path *path)
+ struct path *path, unsigned int flags)
{
struct dentry *dentry;
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
@@ -2139,7 +2140,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
}
/* Allocate a new dentry. */
- dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
+ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data, flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 07e22a15ef02..11b8ef2aeaed 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -56,7 +56,7 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
if (!ns)
return -ENOENT;
- return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
+ return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path, 0);
}
struct ns_get_path_task_args {
@@ -101,7 +101,7 @@ int open_related_ns(struct ns_common *ns,
return PTR_ERR(relative);
}
- err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
+ err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path, 0);
if (err < 0) {
put_unused_fd(fd);
return err;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index a63d5d24aa02..cb894e9024ed 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -266,7 +266,8 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
struct path path;
int ret;
- ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+ ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path,
+ LEGACY_NO_MODE);
if (ret < 0)
return ERR_PTR(ret);
--
2.45.1