On Mon, Oct 12, 2020 at 1:49 PM Jann Horn <jannh@xxxxxxxxxx> wrote: > Since 34e55232e59f7b19050267a05ff1226e5cd122a5 (introduced back in > v2.6.34), Linux uses per-thread RSS counters to reduce cache contention on > the per-mm counters. With a 4K page size, that means that you can end up > with the counters off by up to 252KiB per thread. Actually, as Mark Mossberg pointed out to me off-thread, the counters can actually be off by many times more... can be reproduced with e.g. the following: #include <stdlib.h> #include <err.h> #include <stdio.h> #include <signal.h> #include <unistd.h> #include <sys/mman.h> #include <sys/eventfd.h> #include <sys/prctl.h> void dump(int pid) { char cmd[1000]; sprintf(cmd, "grep '^VmRSS' /proc/%d/status;" "grep '^Rss:' /proc/%d/smaps_rollup;" "echo", pid, pid ); system(cmd); } int main(void) { eventfd_t dummy; int child_wait = eventfd(0, EFD_SEMAPHORE|EFD_CLOEXEC); int child_resume = eventfd(0, EFD_SEMAPHORE|EFD_CLOEXEC); if (child_wait == -1 || child_resume == -1) err(1, "eventfd"); pid_t child = fork(); if (child == -1) err(1, "fork"); if (child == 0) { if (prctl(PR_SET_PDEATHSIG, SIGKILL)) err(1, "PDEATHSIG"); if (getppid() == 1) exit(0); char *mapping = mmap(NULL, 80 * 0x1000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); for (int i=0; 1; i++) { eventfd_write(child_wait, 1); eventfd_read(child_resume, &dummy); if (i == 80) break; mapping[0x1000 * i] = 1; } exit(0); } for (int i=0; i<81; i++) { eventfd_read(child_wait, &dummy); dump(child); eventfd_write(child_resume, 1); } exit(0); } I'm not entirely sure why though.