On Thu, Nov 28, 2024 at 1:34 PM Christian Brauner <brauner@xxxxxxxxxx> wrote: > > Recently we received a patchset that aims to enable file handle encoding > and decoding via name_to_handle_at(2) and open_by_handle_at(2). > > A crucical step in the patch series is how to go from inode number to > struct pid without leaking information into unprivileged contexts. The > issue is that in order to find a struct pid the pid number in the > initial pid namespace must be encoded into the file handle via > name_to_handle_at(2). This can be used by containers using a separate > pid namespace to learn what the pid number of a given process in the > initial pid namespace is. While this is a weak information leak it could > be used in various exploits and in general is an ugly wart in the design. > > To solve this problem a new way is needed to lookup a struct pid based > on the inode number allocated for that struct pid. The other part is to > remove the custom inode number allocation on 32bit systems that is also > an ugly wart that should go away. > > So, a new scheme is used that I was discusssing with Tejun some time > back. A cyclic ida is used for the lower 32 bits and a the high 32 bits > are used for the generation number. This gives a 64 bit inode number > that is unique on both 32 bit and 64 bit. The lower 32 bit number is > recycled slowly and can be used to lookup struct pids. > > Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx> > --- > fs/pidfs.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/pidfs.h | 2 ++ > kernel/pid.c | 11 +++--- > 3 files changed, 98 insertions(+), 7 deletions(-) > > diff --git a/fs/pidfs.c b/fs/pidfs.c > index 618abb1fa1b84cf31282c922374e28d60cd49d00..09a0c8ac805301927a94758b3f7d1e513826daf9 100644 > --- a/fs/pidfs.c > +++ b/fs/pidfs.c > @@ -23,6 +23,88 @@ > #include "internal.h" > #include "mount.h" > > +static u32 pidfs_ino_highbits; > +static u32 pidfs_ino_last_ino_lowbits; > + > +static DEFINE_IDR(pidfs_ino_idr); > + > +static inline ino_t pidfs_ino(u64 ino) > +{ > + /* On 32 bit low 32 bits are the inode. */ > + if (sizeof(ino_t) < sizeof(u64)) > + return (u32)ino; > + > + /* On 64 bit simply return ino. */ > + return ino; > +} > + > +static inline u32 pidfs_gen(u64 ino) > +{ > + /* On 32 bit the generation number are the upper 32 bits. */ > + if (sizeof(ino_t) < sizeof(u64)) > + return ino >> 32; > + > + /* On 64 bit the generation number is 1. */ > + return 1; > +} > + > +/* > + * Construct an inode number for struct pid in a way that we can use the > + * lower 32bit to lookup struct pid independent of any pid numbers that > + * could be leaked into userspace (e.g., via file handle encoding). > + */ > +int pidfs_add_pid(struct pid *pid) > +{ > + u32 ino_highbits; > + int ret; > + > + ret = idr_alloc_cyclic(&pidfs_ino_idr, pid, 1, 0, GFP_ATOMIC); > + if (ret >= 0 && ret < pidfs_ino_last_ino_lowbits) > + pidfs_ino_highbits++; > + ino_highbits = pidfs_ino_highbits; > + pidfs_ino_last_ino_lowbits = ret; > + if (ret < 0) > + return ret; > + This code looks "too useful" to be in a random filesystem. Maybe work a generation counter into idr_alloc_cyclic or just let it let the caller know that the range has been cycled? Only if this is not too complicated to do. > + pid->ino = (u64)ino_highbits << 32 | ret; > + pid->stashed = NULL; > + return 0; > +} > + > +void pidfs_remove_pid(struct pid *pid) > +{ > + idr_remove(&pidfs_ino_idr, (u32)pidfs_ino(pid->ino)); > +} > + > +/* Find a struct pid based on the inode number. */ > +static __maybe_unused struct pid *pidfs_ino_get_pid(u64 ino) > +{ > + ino_t pid_ino = pidfs_ino(ino); > + u32 gen = pidfs_gen(ino); > + struct pid *pid; > + > + guard(rcu)(); > + > + /* Handle @pid lookup carefully so there's no risk of UAF. */ > + pid = idr_find(&pidfs_ino_idr, (u32)ino); > + if (!pid) > + return NULL; > + > + if (sizeof(ino_t) < sizeof(u64)) { > + if (gen && pidfs_gen(pid->ino) != gen) > + pid = NULL; > + } else { > + if (pidfs_ino(pid->ino) != pid_ino) > + pid = NULL; > + } > + > + /* Within our pid namespace hierarchy? */ > + if (pid_vnr(pid) == 0) > + pid = NULL; > + > + return get_pid(pid); > +} > + > #ifdef CONFIG_PROC_FS > /** > * pidfd_show_fdinfo - print information about a pidfd > @@ -491,6 +573,16 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) > > void __init pidfs_init(void) > { > + /* > + * On 32 bit systems the lower 32 bits are the inode number and > + * the higher 32 bits are the generation number. The starting > + * value for the inode number and the generation number is one. > + */ > + if (sizeof(ino_t) < sizeof(u64)) > + pidfs_ino_highbits = 1; > + else > + pidfs_ino_highbits = 0; > + > pidfs_mnt = kern_mount(&pidfs_type); > if (IS_ERR(pidfs_mnt)) > panic("Failed to mount pidfs pseudo filesystem"); > diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h > index 75bdf9807802a5d1a9699c99aa42648c2bd34170..2958652bb108b8a2e02128e17317be4545b40a01 100644 > --- a/include/linux/pidfs.h > +++ b/include/linux/pidfs.h > @@ -4,5 +4,7 @@ > > struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); > void __init pidfs_init(void); > +int pidfs_add_pid(struct pid *pid); > +void pidfs_remove_pid(struct pid *pid); > > #endif /* _LINUX_PID_FS_H */ > diff --git a/kernel/pid.c b/kernel/pid.c > index 115448e89c3e9e664d0d51c8d853e8167ba0540c..9f321c6456d24af705c28f0256ca4de771f5e681 100644 > --- a/kernel/pid.c > +++ b/kernel/pid.c > @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT; > > int pid_max_min = RESERVED_PIDS + 1; > int pid_max_max = PID_MAX_LIMIT; > -/* > - * Pseudo filesystems start inode numbering after one. We use Reserved > - * PIDs as a natural offset. > - */ > -static u64 pidfs_ino = RESERVED_PIDS; > > /* > * PID-map pages start out as NULL, they get allocated upon > @@ -157,6 +152,7 @@ void free_pid(struct pid *pid) > } > > idr_remove(&ns->idr, upid->nr); > + pidfs_remove_pid(pid); > } > spin_unlock_irqrestore(&pidmap_lock, flags); > > @@ -276,8 +272,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, > spin_lock_irq(&pidmap_lock); > if (!(ns->pid_allocated & PIDNS_ADDING)) > goto out_unlock; > - pid->stashed = NULL; > - pid->ino = ++pidfs_ino; > + retval = pidfs_add_pid(pid); I hope this does not create a scalability issue compared to the prev method? > + if (retval) > + goto out_unlock; > for ( ; upid >= pid->numbers; --upid) { > /* Make the PID visible to find_pid_ns. */ > idr_replace(&upid->ns->idr, pid, upid->nr); > > -- > 2.45.2 > Nice idea! If there is something wrong with it, then I could not find it ;) Thanks, Amir.