David, would you please repost a new patch for this one and copy Ives to
make sure it'll work for him in his systems?
Yes, will do. I think has was already cc'ed.
I'd suggest to drop the mprotect example, I'll reply later on that to the
other email but I still don't know whether it's a good example for a reader
to understand the problem.
Yes, can do.
No reproducer needed for numa I think - I guess Ives's test case would be
far enough to verify it if possible. I also hope what Ives saw was the
numa balancing issue you reported, so maybe it'll resolve all problem he
has. Then with that verified and queued we can drop the mm/migrate patch.
I tried writing a numa hinting reproducer, but so far I assume that it's
with current code not (easily) possible for shmem.
We'd have to call change_prot_numa() in order to protnone these PTEs.
That can either happen via
a) mbind() with MPOL_MF_LAZY. However, user space is currently not able
to set that flag (dead code ...).
b) task_numa_work(). However, vma_policy_mof() seems to always fail on
shmem and prevent us from reaching that code. Reason is that shmem
implements vm_ops->get_policy, and it seems to be impossible to get
MPOL_F_MOF set. At least I haven't found an easy way or I am missing
something.
So numa hinting might not immediately explain the lost write faults.
... but are there other ways to reach do_numa_page(), even without
active NUMA hinting? I found at least one:
map = mmap(NULL, size, PROT_WRITE, MAP_SHARED|MAP_ANON, -1, 0);
memset(map, 0, size);
uffd_wp_range(map, size);
On upstream during the next write fault, we'll end up in do_numa_page()
and simply remap the page writable due to vm_page_prot, not triggering a
write fault. I can see the "numa_hint_faults" counter in /proc/vmstat
increasing accordingly, so we're really in do_numa_page().
PROT_WRITE on shmem with uffd-wp is completely non-functional as it
seems. It seems to work with anon memory. And with my patch, it also
works with shmem.
Attaching a simple reproducer (uffd-wp-prot-write.c).
Independent of uffd-wp on shmem, we seem to be missing propagating the
uffd-wp bit when splitting the huge zeropage. So uffd-wp'ing the huge
zeropage and then splitting it loses the uffd-wp markers. :/
Fix seems easy and I just tested my possible fix. Attaching a simple
reproducer (uffd-wp-huge-zero.c).
--
Thanks,
David / dhildenb
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/memfd.h>
#include <linux/userfaultfd.h>
static size_t pagesize;
static int uffd;
static int pagemap_fd;
#define SIZE (1024*1024*1024ull)
#define barrier() __asm__ __volatile__("": : :"memory")
volatile bool uffd_triggered;
static void uffd_wp_range(char *start, size_t size, bool wp)
{
struct uffdio_writeprotect uffd_writeprotect;
uffd_writeprotect.range.start = (unsigned long) start;
uffd_writeprotect.range.len = size;
if (wp) {
uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
} else {
uffd_writeprotect.mode = 0;
}
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
exit(1);
}
}
static void *uffd_thread_fn(void *arg)
{
static struct uffd_msg msg;
ssize_t nread;
while (1) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1) {
fprintf(stderr, "poll() failed: %d\n", errno);
exit(1);
}
nread = read(uffd, &msg, sizeof(msg));
if (nread <= 0)
continue;
if (msg.event != UFFD_EVENT_PAGEFAULT ||
!(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
printf("FAIL: wrong uffd-wp event fired\n");
exit(1);
}
/* un-protect */
uffd_triggered = true;
uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address, pagesize, false);
}
return arg;
}
static int setup_uffd(char *map, size_t size)
{
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
pthread_t thread;
uffd = syscall(__NR_userfaultfd,
O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
if (uffd < 0) {
fprintf(stderr, "syscall() failed: %d\n", errno);
return -errno;
}
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
return -errno;
}
if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
return -ENOSYS;
}
uffdio_register.range.start = (unsigned long) map;
uffdio_register.range.len = size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
return -errno;
}
pthread_create(&thread, NULL, uffd_thread_fn, NULL);
return 0;
}
int main(void)
{
const size_t size = SIZE;
char *map, *cur;
pagesize = getpagesize();
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0) {
fprintf(stderr, "Opening /proc/self/pagemap failed\n");
return 1;
}
map = mmap(NULL, size, PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
if (madvise(map, size, MADV_NOHUGEPAGE)) {
fprintf(stderr, "MADV_HUGEPAGE failed\n");
return -errno;
}
/* Populate all pages ... */
memset(map, 0, size);
if (setup_uffd(map, size))
return 1;
/* ... and write-protect them using uffd-wp. */
uffd_wp_range(map, size, true);
/* Test if all faults trigger. */
for (cur = map; cur < map + size; cur += pagesize) {
uffd_triggered = false;
barrier();
/* Trigger a write fault. */
*cur = 1;
barrier();
if (!uffd_triggered) {
printf("FAIL: uffd-wp did not trigger\n");
return 1;
}
}
printf("PASS: uffd-wp triggered\n");
return 0;
}
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <inttypes.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <linux/memfd.h>
#include <linux/userfaultfd.h>
static size_t pagesize;
static int uffd;
static int pagemap_fd;
#define SIZE (128*1024*1024ull)
#define barrier() __asm__ __volatile__("": : :"memory")
volatile bool uffd_triggered;
static void uffd_wp_range(char *start, size_t size, bool wp)
{
struct uffdio_writeprotect uffd_writeprotect;
uffd_writeprotect.range.start = (unsigned long) start;
uffd_writeprotect.range.len = size;
if (wp) {
uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP;
} else {
uffd_writeprotect.mode = 0;
}
if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno);
exit(1);
}
}
static void *uffd_thread_fn(void *arg)
{
static struct uffd_msg msg;
ssize_t nread;
while (1) {
struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1) {
fprintf(stderr, "poll() failed: %d\n", errno);
exit(1);
}
nread = read(uffd, &msg, sizeof(msg));
if (nread <= 0)
continue;
if (msg.event != UFFD_EVENT_PAGEFAULT ||
!(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP)) {
printf("FAIL: wrong uffd-wp event fired\n");
exit(1);
}
/* un-protect */
uffd_triggered = true;
uffd_wp_range((char *)(uintptr_t)msg.arg.pagefault.address, pagesize, false);
}
return arg;
}
static int setup_uffd(char *map, size_t size)
{
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
pthread_t thread;
uffd = syscall(__NR_userfaultfd,
O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
if (uffd < 0) {
fprintf(stderr, "syscall() failed: %d\n", errno);
return -errno;
}
uffdio_api.api = UFFD_API;
uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) {
fprintf(stderr, "UFFDIO_API failed: %d\n", errno);
return -errno;
}
if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n");
return -ENOSYS;
}
uffdio_register.range.start = (unsigned long) map;
uffdio_register.range.len = size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) {
fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno);
return -errno;
}
pthread_create(&thread, NULL, uffd_thread_fn, NULL);
return 0;
}
int main(void)
{
const size_t size = SIZE;
char *map, *cur;
pagesize = getpagesize();
pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
if (pagemap_fd < 0) {
fprintf(stderr, "Opening /proc/self/pagemap failed\n");
return 1;
}
map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
if (map == MAP_FAILED) {
fprintf(stderr, "mmap() failed\n");
return -errno;
}
if (madvise(map, size, MADV_HUGEPAGE)) {
fprintf(stderr, "MADV_HUGEPAGE failed\n");
return -errno;
}
if (setup_uffd(map, size))
return 1;
/* Populate the shared zeropage, hopefullt also the huge one.*/
madvise(map, size, MADV_POPULATE_READ);
/* ... and write-protect all pages using uffd-wp. */
uffd_wp_range(map, size, true);
/*
* Discard every second odd page, this will split any huge zero
* THP and will hopefully keep the PTE protected using uffd-wp.
*
* Any mechanism to PTE-map the THP would do.
*/
for (cur = map + pagesize; cur < map + size; cur += 2 * pagesize)
madvise(cur, pagesize, MADV_DONTNEED);
/*
* Test every second even page (-> all remaining ones) if we get our
* uffd-wp events.
*/
for (cur = map; cur < map + size; cur += 2 * pagesize) {
uffd_triggered = false;
barrier();
/* Trigger a write fault. */
*cur = 1;
barrier();
if (!uffd_triggered) {
printf("FAIL: uffd-wp did not trigger\n");
return 1;
}
}
printf("PASS: uffd-wp triggered\n");
return 0;
}