> > > * Make fd count access O(1) and expose it in /proc/pid/status > > This is doable, next to FDSize. It feels like a better solution, but maybe I'm missing some context here. Let me know whether this is preferred. That said, I've tried doing it, but failed. There's a noticeable mismatch in the numbers: * systemd: ivan@vm:~$ sudo ls -l /proc/1/fd | wc -l 66 ivan@vm:~$ cat /proc/1/status | fgrep FD FDSize: 256 FDUsed: 71 * journald: ivan@vm:~$ sudo ls -l /proc/803/fd | wc -l 29 ivan@vm:~$ cat /proc/803/status | fgrep FD FDSize: 128 FDUsed: 37 I'll see if I can make it work next week. I'm happy to receive tips as well. Below is my attempt (link in case gmail breaks patch formatting): * https://gist.githubusercontent.com/bobrik/acce40881d629d8cce2e55966b31a0a2/raw/716eb4724a8fe3afeeb76fd2a7a47ee13790a9e9/fdused.patch diff --git a/fs/file.c b/fs/file.c index 3bcc1ecc314a..8bc0741cabf1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -85,6 +85,8 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + + atomic_set(&nfdt->count, atomic_read(&ofdt->count)); } /* @@ -105,6 +107,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; + atomic_t count = ATOMIC_INIT(0); void *data; /* @@ -148,6 +151,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; + fdt->count = count; return fdt; @@ -399,6 +403,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); + atomic_set(&new_fdt->count, atomic_read(&old_fdt->count)); + rcu_assign_pointer(newf->fdt, new_fdt); return newf; @@ -474,6 +480,7 @@ struct files_struct init_files = { .close_on_exec = init_files.close_on_exec_init, 
.open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, + .count = ATOMIC_INIT(0), }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), @@ -613,6 +620,7 @@ void fd_install(unsigned int fd, struct file *file) BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); + atomic_inc(&fdt->count); return; } /* coupled with smp_wmb() in expand_fdtable() */ @@ -621,6 +629,7 @@ void fd_install(unsigned int fd, struct file *file) BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); + atomic_inc(&fdt->count); } EXPORT_SYMBOL(fd_install); @@ -646,6 +655,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd) if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); + atomic_dec(&fdt->count); } return file; } @@ -844,6 +854,7 @@ void do_close_on_exec(struct files_struct *files) filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); + atomic_dec(&fdt->count); } } @@ -1108,6 +1119,7 @@ __releases(&files->file_lock) else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); + atomic_inc(&fdt->count); if (tofree) filp_close(tofree, files); diff --git a/fs/proc/array.c b/fs/proc/array.c index 99fcbfda8e25..5847f077bfc3 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -153,7 +153,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; - unsigned int max_fds = 0; + struct fdtable *fdt; + unsigned int max_fds = 0, open_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? 
@@ -170,8 +171,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, task_lock(p); if (p->fs) umask = p->fs->umask; - if (p->files) - max_fds = files_fdtable(p->files)->max_fds; + if (p->files) { + fdt = files_fdtable(p->files); + max_fds = fdt->max_fds; + open_fds = atomic_read(&fdt->count); + } task_unlock(p); rcu_read_unlock(); @@ -194,6 +198,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid)); seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid)); seq_put_decimal_ull(m, "\nFDSize:\t", max_fds); + seq_put_decimal_ull(m, "\nFDUsed:\t", open_fds); seq_puts(m, "\nGroups:\t"); group_info = cred->group_info; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index e066816f3519..59aceb1e4bc6 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -31,6 +31,7 @@ struct fdtable { unsigned long *open_fds; unsigned long *full_fds_bits; struct rcu_head rcu; + atomic_t count; }; static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt) > > > + > > > + generic_fillattr(&init_user_ns, inode, stat); > ^^^^^^^^^^^^^ > > Is this correct? I'm not a userns guy at all. I mostly copied from here: * https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/generic.c#L150 Maybe it can be simplified even further to match this one: * https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/root.c#L317