On Jun 22, 2022, at 11:50 AM, Nadav Amit <nadav.amit@xxxxxxxxx> wrote: > From: Nadav Amit <namit@xxxxxxxxxx> > > Using a PTE on x86 with cleared access-bit (aka young-bit) > takes ~600 cycles more than when the access bit is set. At the same > time, setting the access-bit for memory that is not used (e.g., > prefetched) can introduce greater overheads, as the prefetched memory is > reclaimed later than it should be. > > Userfaultfd currently does not set the access-bit (excluding the > huge-pages case). Arguably, it is best to let the user control whether > the access bit should be set or not. The expected use is to request > userfaultfd to set the access-bit when the copy/wp operation is done to > resolve a page-fault, and not to set the access-bit when the memory is > prefetched. > > Introduce UFFDIO_[op]_ACCESS_LIKELY to enable userspace to request the > young bit to be set. I reply to my own email, but this mostly addresses the concerns that Peter has raised. So I ran the test below on my Haswell (x86), which showed two things: 1. Accessing an address using a clean PTE or old PTE takes ~500 cycles more than with dirty+young (depending on the access, of course: dirty does not matter for read, dirty+young both matter for write). 2. I made a mistake in my implementation. PTEs are - at least on x86 - created as young with mk_pte(). So the logic should be similar to do_set_pte(): if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); else entry = pte_sw_mkyoung(entry); Based on these results, I will send another version for both young and dirty. Let me know if these results are not convincing. I will add, as we discussed (well, I think I raised these things, so hopefully you agree): 1. On x86, avoid flush if changing WP->RO and PTE is clean. 2. When write-unprotecting entry, if PTE is exclusive, set it as writable. [ I considered not setting it as writable if write-hint is not provided, but with the change in (1), it does not provide any real value. ] --- #define _GNU_SOURCE #include <sys/types.h> #include <stdio.h> #include <linux/userfaultfd.h> #include <pthread.h> #include <errno.h> #include <unistd.h> #include <stdlib.h> #include <fcntl.h> #include <signal.h> #include <poll.h> #include <string.h> #include <sys/mman.h> #include <sys/syscall.h> #include <sys/ioctl.h> #include <stdint.h> #include <stdbool.h> #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \ } while (0) static inline uint64_t rdtscp(void) { uint64_t rax, rdx; uint32_t aux; asm volatile ("rdtscp" : "=a" (rax), "=d" (rdx), "=c" (aux):: "memory"); } int main(int argc, char *argv[]) { long uffd; /* userfaultfd file descriptor */ char *addr; /* Start of region handled by userfaultfd */ unsigned long len; /* Length of region handled by userfaultfd */ pthread_t thr; /* ID of thread that handles page faults */ bool young, dirty, write; struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; int l; static char *page = NULL; struct uffdio_copy uffdio_copy; ssize_t nread; int page_size; if (argc != 5) { fprintf(stderr, "Usage: %s [num-pages] [write] [young] [dirty]\n", argv[0]); exit(EXIT_FAILURE); } page_size = sysconf(_SC_PAGE_SIZE); len = strtoul(argv[1], NULL, 0) * page_size; write = !!strtoul(argv[2], NULL, 0); young = !!strtoul(argv[3], NULL, 0); dirty = !!strtoul(argv[4], NULL, 0); page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (page == MAP_FAILED) errExit("mmap"); uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); if (uffd == -1) errExit("userfaultfd"); uffdio_api.api = UFFD_API; uffdio_api.features = (1<<11); //UFFD_FEATURE_EXACT_ADDRESS; if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) errExit("ioctl-UFFDIO_API"); addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (addr == MAP_FAILED) errExit("mmap"); uffdio_register.range.start = (unsigned long) addr; uffdio_register.range.len = len; uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) errExit("ioctl-UFFDIO_REGISTER"); uffdio_copy.src = (unsigned long) page; uffdio_copy.mode = 0; if (young) uffdio_copy.mode |= (1ul << 2); if (dirty) uffdio_copy.mode |= (1ul << 3); uffdio_copy.len = page_size; uffdio_copy.copy = 0; for (l = 0; l < len; l += page_size) { uffdio_copy.dst = (unsigned long)(&addr[l]); if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) errExit("ioctl-UFFDIO_COPY"); } for (l = 0; l < len; l += page_size) { char c; uint64_t start; start = rdtscp(); if (write) addr[l] = 5; else c = *(volatile char *)(&addr[l]); printf("%ld\n", rdtscp() - start); } exit(EXIT_SUCCESS); }