Re: [RFC] proc: report open files as size in stat() for /proc/pid/fd

Ivan Babrou <ivan@xxxxxxxxxxxxxx> · Sat, 17 Sep 2022 11:32:02 -0700

> > > * Make fd count access O(1) and expose it in /proc/pid/status
>
> This is doable, next to FDSize.

It feels like a better solution, but maybe I'm missing some context
here. Let me know whether this is preferred.

That said, I've tried doing it, but failed. There's a noticeable
mismatch between the number of entries listed in /proc/<pid>/fd and
the new FDUsed value:

* systemd:

ivan@vm:~$ sudo ls -l /proc/1/fd | wc -l
66
ivan@vm:~$ cat /proc/1/status | fgrep FD
FDSize: 256
FDUsed: 71

* journald:

ivan@vm:~$ sudo ls -l /proc/803/fd | wc -l
29
ivan@vm:~$ cat /proc/803/status | fgrep FD
FDSize: 128
FDUsed: 37

I'll see if I can make it work next week. I'm happy to receive tips as well.

Below is my attempt (link in case gmail breaks patch formatting):

* https://gist.githubusercontent.com/bobrik/acce40881d629d8cce2e55966b31a0a2/raw/716eb4724a8fe3afeeb76fd2a7a47ee13790a9e9/fdused.patch

diff --git a/fs/file.c b/fs/file.c
index 3bcc1ecc314a..8bc0741cabf1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -85,6 +85,8 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
  memset((char *)nfdt->fd + cpy, 0, set);

  copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+
+ atomic_set(&nfdt->count, atomic_read(&ofdt->count));
 }

 /*
@@ -105,6 +107,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 static struct fdtable * alloc_fdtable(unsigned int nr)
 {
  struct fdtable *fdt;
+ atomic_t count = ATOMIC_INIT(0);
  void *data;

  /*
@@ -148,6 +151,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
  fdt->close_on_exec = data;
  data += nr / BITS_PER_BYTE;
  fdt->full_fds_bits = data;
+ fdt->count = count;

  return fdt;

@@ -399,6 +403,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
  /* clear the remainder */
  memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

+ atomic_set(&new_fdt->count, atomic_read(&old_fdt->count));
+
  rcu_assign_pointer(newf->fdt, new_fdt);

  return newf;
@@ -474,6 +480,7 @@ struct files_struct init_files = {
  .close_on_exec = init_files.close_on_exec_init,
  .open_fds = init_files.open_fds_init,
  .full_fds_bits = init_files.full_fds_bits_init,
+ .count = ATOMIC_INIT(0),
  },
  .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
  .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
@@ -613,6 +620,7 @@ void fd_install(unsigned int fd, struct file *file)
  BUG_ON(fdt->fd[fd] != NULL);
  rcu_assign_pointer(fdt->fd[fd], file);
  spin_unlock(&files->file_lock);
+ atomic_inc(&fdt->count);
  return;
  }
  /* coupled with smp_wmb() in expand_fdtable() */
@@ -621,6 +629,7 @@ void fd_install(unsigned int fd, struct file *file)
  BUG_ON(fdt->fd[fd] != NULL);
  rcu_assign_pointer(fdt->fd[fd], file);
  rcu_read_unlock_sched();
+ atomic_inc(&fdt->count);
 }

 EXPORT_SYMBOL(fd_install);
@@ -646,6 +655,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
  if (file) {
  rcu_assign_pointer(fdt->fd[fd], NULL);
  __put_unused_fd(files, fd);
+ atomic_dec(&fdt->count);
  }
  return file;
 }
@@ -844,6 +854,7 @@ void do_close_on_exec(struct files_struct *files)
  filp_close(file, files);
  cond_resched();
  spin_lock(&files->file_lock);
+ atomic_dec(&fdt->count);
  }

  }
@@ -1108,6 +1119,7 @@ __releases(&files->file_lock)
  else
  __clear_close_on_exec(fd, fdt);
  spin_unlock(&files->file_lock);
+ atomic_inc(&fdt->count);

  if (tofree)
  filp_close(tofree, files);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 99fcbfda8e25..5847f077bfc3 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -153,7 +153,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
  struct task_struct *tracer;
  const struct cred *cred;
  pid_t ppid, tpid = 0, tgid, ngid;
- unsigned int max_fds = 0;
+ struct fdtable *fdt;
+ unsigned int max_fds = 0, open_fds = 0;

  rcu_read_lock();
  ppid = pid_alive(p) ?
@@ -170,8 +171,11 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
  task_lock(p);
  if (p->fs)
  umask = p->fs->umask;
- if (p->files)
- max_fds = files_fdtable(p->files)->max_fds;
+ if (p->files) {
+ fdt = files_fdtable(p->files);
+ max_fds = fdt->max_fds;
+ open_fds = atomic_read(&fdt->count);
+ }
  task_unlock(p);
  rcu_read_unlock();

@@ -194,6 +198,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
  seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->sgid));
  seq_put_decimal_ull(m, "\t", from_kgid_munged(user_ns, cred->fsgid));
  seq_put_decimal_ull(m, "\nFDSize:\t", max_fds);
+ seq_put_decimal_ull(m, "\nFDUsed:\t", open_fds);

  seq_puts(m, "\nGroups:\t");
  group_info = cred->group_info;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..59aceb1e4bc6 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,6 +31,7 @@ struct fdtable {
  unsigned long *open_fds;
  unsigned long *full_fds_bits;
  struct rcu_head rcu;
+ atomic_t count;
 };

 static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)


> > > +
> > > +   generic_fillattr(&init_user_ns, inode, stat);
>                          ^^^^^^^^^^^^^
>
> Is this correct? I'm not userns guy at all.

I mostly copied from here:

* https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/generic.c#L150

Maybe it can be simplified even further to match this one:

* https://elixir.bootlin.com/linux/v6.0-rc5/source/fs/proc/root.c#L317