We're experimenting with userfaultfd write protect implementation on
Andrea's tree and it looks like there is a problem if we combine
MODE_MISSING and MODE_WP in one region.
You can find a test case below together with detailed problem
description. Please take a look, maybe we're doing something wrong?
Will be happy to provide any additional info if needed.
/*
* This testcase reproduces a problem with userfaultfd writeprotect
feature on
* http://git.kernel.org/pub/scm/linux/kernel/git/andrea/aa.git, HEAD
a22d71c
* gcc ufdtest.c -std=gnu99 -lpthread -o ufdtest
*
* 1. Allocate a private RW region and register it with MODE_MISSING |
MODE_WP.
* 2. Fork a UFD thread and begin writing to memory from main thread.
*
* Expected behavior:
* Recv pagefaults with UFFD_PAGEFAULT_FLAG_WRITE set, handle them with
zeropage
*
* Actual behavior:
* We recv to pagefaults for each page:
*
* 1. First fault is expected UFFD_PAGEFAULT_FLAG_WRITE set which we
resolve
* with zeropage
*
* 2. Second fault immediately follows the first one with the same address
* and has UFFD_PAGEFAULT_FLAG_WRITE | UFFD_PAGEFAULT_FLAG_WP set.
* If we ignore this second fault then main thread never wakes up
* If we try to resolve it with !WP then main thread received SIGBUS.
*
* If we register that region only with MODE_MISSING _or_ MODE_WP then
we get
* no problems, i.e. only missing faults or WP faults are seen.
*/
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <assert.h>
#define _GNU_SOURCE
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/user.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <asm/types.h>
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#if !(defined(__linux__) && defined(__NR_userfaultfd))
# error Need userfaultfd
#endif
#define DIE(fmt, ...) do { \
fprintf(stderr, fmt, ##__VA_ARGS__); \
fprintf(stderr, "\n"); \
assert(0); \
} while(0);
#define DPRINTF(fmt, ...) do { \
printf("%s: " fmt, __func__, ##__VA_ARGS__); \
printf("\n"); \
} while(0);
static int g_ufd = -1;
static bool ufd_version_check(void)
{
struct uffdio_api api_struct;
uint64_t ioctl_mask;
api_struct.api = UFFD_API;
api_struct.features = 0;
if (ioctl(g_ufd, UFFDIO_API, &api_struct)) {
DIE("UFFDIO_API failed: %s", strerror(errno));
}
ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
(__u64)1 << _UFFDIO_UNREGISTER;
if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
DIE("Missing features: %llx", ~api_struct.ioctls & ioctl_mask);
}
return true;
}
static void ufd_zeropage(__u64 page)
{
struct uffdio_zeropage zero_struct;
zero_struct.range.start = page;
zero_struct.range.len = getpagesize();
zero_struct.mode = 0;
if (ioctl(g_ufd, UFFDIO_ZEROPAGE, &zero_struct)) {
DIE("zeropage ioctl failed");
}
}
static void ufd_writeprotect(__u64 page, bool readonly)
{
struct uffdio_writeprotect wp_struct;
wp_struct.range.start = page;
wp_struct.range.len = PAGE_SIZE;
if (readonly) {
wp_struct.mode = UFFDIO_WRITEPROTECT_MODE_WP;
} else {
wp_struct.mode = 0;
}
if (ioctl(g_ufd, UFFDIO_WRITEPROTECT, &wp_struct)) {
DIE("ioctl failed: %s", strerror(errno));
}
}
static void* ufd_worker(void* arg)
{
while(1) {
DPRINTF("Reading from ufd");
struct uffd_msg msg;
int ret = read(g_ufd, &msg, sizeof(msg));
if (ret != sizeof(msg)) {
if (errno == EAGAIN) {
continue;
}
if (ret < 0) {
DIE("Failed to read full message: %s", strerror(errno));
} else {
DIE("Read %d bytes, expected %zd", ret, sizeof(msg));
}
}
if (msg.event != UFFD_EVENT_PAGEFAULT) {
DIE("unexpected event 0x%x", msg.event);
}
__u64 page = msg.arg.pagefault.address & ~(PAGE_SIZE - 1ull);
DPRINTF("Pagefault @ 0x%llx, flags 0x%llx",
page, msg.arg.pagefault.flags);
bool is_write_fault =
(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) != 0;
bool is_wp_fault =
(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) != 0;
if (!is_write_fault || (is_write_fault && !is_wp_fault)) {
ufd_zeropage(page);
DPRINTF("0x%llx zeropaged", page);
} else if (is_wp_fault) {
DPRINTF("unexpected WP fault on 0x%llx", page);
// If you remove this main thread will sleep forever
ufd_writeprotect(page, false);
}
}
DIE("Unreachable");
return NULL;
}
int main(void)
{
int res = 0;
g_ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
if (g_ufd < 0) {
DIE("userfaultfd not available: %s", strerror(errno));
}
if (!ufd_version_check()) {
DIE("UFFDIO_API not supported");
}
size_t len = 1024 * 1024 * 1024;
void* mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (mem == MAP_FAILED) {
DIE("mmap failed: %s", strerror(errno));
}
struct uffdio_register reg_struct;
reg_struct.range.start = (uintptr_t)mem;
reg_struct.range.len = len;
reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING |
UFFDIO_REGISTER_MODE_WP;
if (ioctl(g_ufd, UFFDIO_REGISTER, ®_struct)) {
DIE("userfault register: %s", strerror(errno));
}
uint64_t feature_mask = 1ull << _UFFDIO_WAKE |
1ull << _UFFDIO_ZEROPAGE |
1ull << _UFFDIO_WRITEPROTECT;
if ((reg_struct.ioctls & feature_mask) != feature_mask) {
DIE("Missing range features: %llx", ~reg_struct.ioctls &
feature_mask);
}
DPRINTF("Registered range %p:%zu", mem, len);
DPRINTF("UFD features: 0x%x", reg_struct.ioctls);
pthread_t worker;
if (0 != pthread_create(&worker, NULL, ufd_worker, NULL)) {
DIE("Failed to start ufd worker thread");
}
volatile uint8_t* pdata = (uint8_t*)mem;
for (int i = 0; i < (len / PAGE_SIZE); ++i) {
pdata[0] = (uint8_t)rand();
}
DPRINTF("done!");
pthread_join(worker, NULL);
return EXIT_SUCCESS;
}
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@xxxxxxxxx. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>