Re: [PATCH RFC v2 1/3] pidfs: rework inode number allocation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri 29-11-24 14:02:23, Christian Brauner wrote:
> Recently we received a patchset that aims to enable file handle encoding
> and decoding via name_to_handle_at(2) and open_by_handle_at(2).
> 
> A crucial step in the patch series is how to go from inode number to
> struct pid without leaking information into unprivileged contexts. The
> issue is that in order to find a struct pid the pid number in the
> initial pid namespace must be encoded into the file handle via
> name_to_handle_at(2). This can be used by containers using a separate
> pid namespace to learn what the pid number of a given process in the
> initial pid namespace is. While this is a weak information leak it could
> be used in various exploits and in general is an ugly wart in the design.
> 
> To solve this problem a new way is needed to look up a struct pid based
> on the inode number allocated for that struct pid. The other part is to
> remove the custom inode number allocation on 32bit systems that is also
> an ugly wart that should go away.
> 
> So, a new scheme is used that I was discussing with Tejun some time
> back. A cyclic ida is used for the lower 32 bits and the high 32 bits
> are used for the generation number. This gives a 64 bit inode number
> that is unique on both 32 bit and 64 bit. The lower 32 bit number is
> recycled slowly and can be used to look up struct pids.
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

								Honza

> ---
>  fs/pidfs.c            | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/pidfs.h |  2 ++
>  kernel/pid.c          | 14 ++++++------
>  3 files changed, 72 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index 618abb1fa1b84cf31282c922374e28d60cd49d00..0bdd9c525b80895d33f2eae5e8e375788580072f 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -23,6 +23,59 @@
>  #include "internal.h"
>  #include "mount.h"
>  
> +static u32 pidfs_ino_highbits;
> +static u32 pidfs_ino_last_ino_lowbits;
> +
> +static DEFINE_IDR(pidfs_ino_idr);
> +
> +static inline ino_t pidfs_ino(u64 ino)
> +{
> +	/* On 32 bit low 32 bits are the inode. */
> +	if (sizeof(ino_t) < sizeof(u64))
> +		return (u32)ino;
> +
> +	/* On 64 bit simply return ino. */
> +	return ino;
> +}
> +
> +static inline u32 pidfs_gen(u64 ino)
> +{
> +	/* On 32 bit the generation number are the upper 32 bits. */
> +	if (sizeof(ino_t) < sizeof(u64))
> +		return ino >> 32;
> +
> +	/* On 64 bit the generation number is 1. */
> +	return 1;
> +}
> +
> +/*
> + * Construct an inode number for struct pid in a way that we can use the
> + * lower 32bit to lookup struct pid independent of any pid numbers that
> + * could be leaked into userspace (e.g., via file handle encoding).
> + */
> +int pidfs_add_pid(struct pid *pid)
> +{
> +	u32 ino_highbits;
> +	int ret;
> +
> +	ret = idr_alloc_cyclic(&pidfs_ino_idr, pid, 1, 0, GFP_ATOMIC);
> +	if (ret >= 0 && ret < pidfs_ino_last_ino_lowbits)
> +		pidfs_ino_highbits++;
> +	ino_highbits = pidfs_ino_highbits;
> +	pidfs_ino_last_ino_lowbits = ret;
> +	if (ret < 0)
> +		return ret;
> +
> +	pid->ino = (u64)ino_highbits << 32 | ret;
> +	pid->stashed = NULL;
> +	return 0;
> +}
> +
> +void pidfs_remove_pid(struct pid *pid)
> +{
> +	idr_remove(&pidfs_ino_idr, (u32)pidfs_ino(pid->ino));
> +}
> +
>  #ifdef CONFIG_PROC_FS
>  /**
>   * pidfd_show_fdinfo - print information about a pidfd
> @@ -491,6 +544,16 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
>  
>  void __init pidfs_init(void)
>  {
> +	/*
> +	 * On 32 bit systems the lower 32 bits are the inode number and
> +	 * the higher 32 bits are the generation number. The starting
> +	 * value for the inode number and the generation number is one.
> +	 */
> +	if (sizeof(ino_t) < sizeof(u64))
> +		pidfs_ino_highbits = 1;
> +	else
> +		pidfs_ino_highbits = 0;
> +
>  	pidfs_mnt = kern_mount(&pidfs_type);
>  	if (IS_ERR(pidfs_mnt))
>  		panic("Failed to mount pidfs pseudo filesystem");
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 75bdf9807802a5d1a9699c99aa42648c2bd34170..2958652bb108b8a2e02128e17317be4545b40a01 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -4,5 +4,7 @@
>  
>  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
>  void __init pidfs_init(void);
> +int pidfs_add_pid(struct pid *pid);
> +void pidfs_remove_pid(struct pid *pid);
>  
>  #endif /* _LINUX_PID_FS_H */
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 115448e89c3e9e664d0d51c8d853e8167ba0540c..6131543e7c090c164a2bac014f8eeee61926b13d 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -64,11 +64,6 @@ int pid_max = PID_MAX_DEFAULT;
>  
>  int pid_max_min = RESERVED_PIDS + 1;
>  int pid_max_max = PID_MAX_LIMIT;
> -/*
> - * Pseudo filesystems start inode numbering after one. We use Reserved
> - * PIDs as a natural offset.
> - */
> -static u64 pidfs_ino = RESERVED_PIDS;
>  
>  /*
>   * PID-map pages start out as NULL, they get allocated upon
> @@ -157,6 +152,7 @@ void free_pid(struct pid *pid)
>  		}
>  
>  		idr_remove(&ns->idr, upid->nr);
> +		pidfs_remove_pid(pid);
>  	}
>  	spin_unlock_irqrestore(&pidmap_lock, flags);
>  
> @@ -273,22 +269,26 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
>  	INIT_HLIST_HEAD(&pid->inodes);
>  
>  	upid = pid->numbers + ns->level;
> +	idr_preload(GFP_KERNEL);
>  	spin_lock_irq(&pidmap_lock);
>  	if (!(ns->pid_allocated & PIDNS_ADDING))
>  		goto out_unlock;
> -	pid->stashed = NULL;
> -	pid->ino = ++pidfs_ino;
> +	retval = pidfs_add_pid(pid);
> +	if (retval)
> +		goto out_unlock;
>  	for ( ; upid >= pid->numbers; --upid) {
>  		/* Make the PID visible to find_pid_ns. */
>  		idr_replace(&upid->ns->idr, pid, upid->nr);
>  		upid->ns->pid_allocated++;
>  	}
>  	spin_unlock_irq(&pidmap_lock);
> +	idr_preload_end();
>  
>  	return pid;
>  
>  out_unlock:
>  	spin_unlock_irq(&pidmap_lock);
> +	idr_preload_end();
>  	put_pid_ns(ns);
>  
>  out_free:
> 
> -- 
> 2.45.2
> 
-- 
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux