A failed attempt to detect improper dropping of Writable and/or Dirty
bits.  Doesn't work because the primary MMU write-protects its PTEs when
file writeback occurs, i.e. KVM's dirty bits are meaningless as far as
file-backed guest memory is concerned.

Not-signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
---
 tools/testing/selftests/kvm/.gitignore        |   1 +
 tools/testing/selftests/kvm/Makefile          |   4 +
 .../selftests/kvm/volatile_spte_test.c        | 208 ++++++++++++++++++
 3 files changed, 213 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/volatile_spte_test.c

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 56140068b763..3307444d9fda 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -70,3 +70,4 @@
 /steal_time
 /kvm_binary_stats_test
 /system_counter_offset_test
+/volatile_spte_test
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index af582d168621..bc0907de6638 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -103,6 +103,7 @@ TEST_GEN_PROGS_x86_64 += set_memory_region_test
 TEST_GEN_PROGS_x86_64 += steal_time
 TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test
 TEST_GEN_PROGS_x86_64 += system_counter_offset_test
+TEST_GEN_PROGS_x86_64 += volatile_spte_test
 
 TEST_GEN_PROGS_aarch64 += aarch64/arch_timer
 TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions
@@ -122,6 +123,7 @@ TEST_GEN_PROGS_aarch64 += rseq_test
 TEST_GEN_PROGS_aarch64 += set_memory_region_test
 TEST_GEN_PROGS_aarch64 += steal_time
 TEST_GEN_PROGS_aarch64 += kvm_binary_stats_test
+TEST_GEN_PROGS_aarch64 += volatile_spte_test
 
 TEST_GEN_PROGS_s390x = s390x/memop
 TEST_GEN_PROGS_s390x += s390x/resets
@@ -134,6 +136,7 @@ TEST_GEN_PROGS_s390x += kvm_page_table_test
 TEST_GEN_PROGS_s390x += rseq_test
 TEST_GEN_PROGS_s390x += set_memory_region_test
 TEST_GEN_PROGS_s390x += kvm_binary_stats_test
+TEST_GEN_PROGS_s390x += volatile_spte_test
 
 TEST_GEN_PROGS_riscv += demand_paging_test
 TEST_GEN_PROGS_riscv += dirty_log_test
@@ -141,6 +144,7 @@ TEST_GEN_PROGS_riscv += kvm_create_max_vcpus
 TEST_GEN_PROGS_riscv += kvm_page_table_test
 TEST_GEN_PROGS_riscv += set_memory_region_test
 TEST_GEN_PROGS_riscv += kvm_binary_stats_test
+TEST_GEN_PROGS_riscv += volatile_spte_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))
diff --git a/tools/testing/selftests/kvm/volatile_spte_test.c b/tools/testing/selftests/kvm/volatile_spte_test.c
new file mode 100644
index 000000000000..a4277216eb3d
--- /dev/null
+++ b/tools/testing/selftests/kvm/volatile_spte_test.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/sysinfo.h>
+#include <asm/barrier.h>
+#include <linux/atomic.h>
+#include <linux/rseq.h>
+#include <linux/unistd.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#define VCPU_ID 0
+
+#define PAGE_SIZE 4096
+
+#define NR_ITERATIONS 1000
+
+#define MEM_FILE_NAME "volatile_spte_test_mem"
+#define MEM_FILE_MEMSLOT 1
+#define MEM_FILE_DATA_PATTERN 0xa5a5a5a5a5a5a5a5ul
+
+static const uint64_t gpa = (4ull * (1 << 30));
+
+static uint64_t *hva;
+
+static pthread_t mprotect_thread;
+static atomic_t rendezvous;
+static bool done;
+
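+/*
+ * Guest: ping-pong between writing zero and the data pattern to the test
+ * page, with a GUEST_SYNC after each write so the host can verify what
+ * actually landed in the backing file.
+ */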
+static void guest_code(void)
+{
+	uint64_t *gva = (uint64_t *)gpa;
+
+	while (!READ_ONCE(done)) {
+		WRITE_ONCE(*gva, 0);
+		GUEST_SYNC(0);
+
+		WRITE_ONCE(*gva, MEM_FILE_DATA_PATTERN);
+		GUEST_SYNC(1);
+	}
+}
+
+static void *mprotect_worker(void *ign)
+{
+	int i, r;
+
+	i = 0;
+	while (!READ_ONCE(done)) {
+		for ( ; atomic_read(&rendezvous) != 1; i++)
+			cpu_relax();
+
+		usleep((i % 10) + 1);
+
+		r = mprotect(hva, PAGE_SIZE, PROT_NONE);
+		TEST_ASSERT(!r, "Failed to mprotect file (hva = %lx), errno = %d (%s)",
+			    (unsigned long)hva, errno, strerror(errno));
+
+		atomic_inc(&rendezvous);
+	}
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	uint64_t bitmap = -1ull, val;
+	int i, r, fd, nr_writes;
+	struct kvm_regs regs;
+	struct ucall ucall;
+	struct kvm_vm *vm;
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	vcpu_regs_get(vm, VCPU_ID, &regs);
+	ucall_init(vm, NULL);
+
+	pthread_create(&mprotect_thread, NULL, mprotect_worker, NULL);
+
+	fd = open(MEM_FILE_NAME, O_RDWR | O_CREAT, 0644);
+	TEST_ASSERT(fd >= 0, "Failed to open '%s', errno = %d (%s)",
+		    MEM_FILE_NAME, errno, strerror(errno));
+
+	r = ftruncate(fd, PAGE_SIZE);
+	TEST_ASSERT(!r, "Failed to ftruncate '%s', errno = %d (%s)",
+		    MEM_FILE_NAME, errno, strerror(errno));
+
+	hva = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	TEST_ASSERT(hva != MAP_FAILED, "Failed to map file, errno = %d (%s)",
+		    errno, strerror(errno));
+
+	vm_set_user_memory_region(vm, MEM_FILE_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES,
+				  gpa, PAGE_SIZE, hva);
+	virt_pg_map(vm, gpa, gpa);
+
+	for (i = 0, nr_writes = 0; i < NR_ITERATIONS; i++) {
+		fdatasync(fd);
+
+		vcpu_run(vm, VCPU_ID);
+		ASSERT_EQ(*hva, 0);
+		ASSERT_EQ(get_ucall(vm, VCPU_ID, &ucall), UCALL_SYNC);
+		ASSERT_EQ(ucall.args[1], 0);
+
+		/*
+		 * The original hope/intent was to detect dropped Dirty bits by
+		 * checking for missed file writeback.  Sadly, the kernel is
+		 * too smart and write-protects the primary MMU's PTEs, which
+		 * zaps KVM's SPTEs and ultimately causes the folio/page to get
+		 * marked dirty by the primary MMU when KVM re-faults on the
+		 * page.
+		 *
+		 * Triggering swap _might_ be a way to detect failure, as swap
+		 * is treated differently than "normal" files.
+		 *
+		 * RIP: 0010:kvm_unmap_gfn_range+0xf1/0x100 [kvm]
+		 * Call Trace:
+		 *  <TASK>
+		 *  kvm_mmu_notifier_invalidate_range_start+0x11c/0x2c0 [kvm]
+		 *  __mmu_notifier_invalidate_range_start+0x7e/0x190
+		 *  page_mkclean_one+0x226/0x250
+		 *  rmap_walk_file+0x213/0x430
+		 *  folio_mkclean+0x95/0xb0
+		 *  folio_clear_dirty_for_io+0x5d/0x1c0
+		 *  mpage_submit_page+0x1f/0x70
+		 *  mpage_process_page_bufs+0xf8/0x110
+		 *  mpage_prepare_extent_to_map+0x1e3/0x420
+		 *  ext4_writepages+0x277/0xca0
+		 *  do_writepages+0xd1/0x190
+		 *  filemap_fdatawrite_wbc+0x62/0x90
+		 *  file_write_and_wait_range+0xa3/0xe0
+		 *  ext4_sync_file+0xdb/0x340
+		 *  do_fsync+0x38/0x70
+		 *  __x64_sys_fdatasync+0x13/0x20
+		 *  do_syscall_64+0x31/0x50
+		 *  entry_SYSCALL_64_after_hwframe+0x44/0xae
+		 *  </TASK>
+		 *
+		 * RIP: 0010:__folio_mark_dirty+0x266/0x310
+		 * Call Trace:
+		 *  <TASK>
+		 *  mark_buffer_dirty+0xe7/0x140
+		 *  __block_commit_write.isra.0+0x59/0xc0
+		 *  block_page_mkwrite+0x15a/0x170
+		 *  ext4_page_mkwrite+0x485/0x620
+		 *  do_page_mkwrite+0x54/0x150
+		 *  __handle_mm_fault+0xe2a/0x1600
+		 *  handle_mm_fault+0xbd/0x280
+		 *  do_user_addr_fault+0x192/0x600
+		 *  exc_page_fault+0x6c/0x140
+		 *  asm_exc_page_fault+0x1e/0x30
+		 *  </TASK>
+		 */
+		/* fdatasync(fd); */
+
+		/*
+		 * Clear the dirty log to coerce KVM into write-protecting the
+		 * SPTE (or into clearing dirty bits when using PML).
+		 */
+		kvm_vm_clear_dirty_log(vm, MEM_FILE_MEMSLOT, &bitmap, 0, 1);
+
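+		/*
+		 * Kick the mprotect worker (rendezvous 0 => 1) and re-enter
+		 * the guest, staggering the two threads with small sleeps so
+		 * that the worker's mprotect(PROT_NONE) races with the
+		 * guest's write to the just write-protected SPTE.
+		 */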
+		atomic_inc(&rendezvous);
+
+		usleep(i % 10);
+
+		r = _vcpu_run(vm, VCPU_ID);
+
+		while (atomic_read(&rendezvous) != 2)
+			cpu_relax();
+
+		atomic_set(&rendezvous, 0);
+
+		fdatasync(fd);
+		mprotect(hva, PAGE_SIZE, PROT_READ | PROT_WRITE);
+
+		val = READ_ONCE(*hva);
+		if (r) {
+			TEST_ASSERT(!val, "Memory should be zero, write faulted");
+			vcpu_regs_set(vm, VCPU_ID, &regs);
+			continue;
+		}
+		nr_writes++;
+		TEST_ASSERT(val == MEM_FILE_DATA_PATTERN,
+			    "Memory doesn't match data pattern, want 0x%lx, got 0x%lx",
+			    MEM_FILE_DATA_PATTERN, val);
+		ASSERT_EQ(get_ucall(vm, VCPU_ID, &ucall), UCALL_SYNC);
+		ASSERT_EQ(ucall.args[1], 1);
+	}
+
+	printf("%d of %d iterations wrote memory\n", nr_writes, NR_ITERATIONS);
+
+	atomic_inc(&rendezvous);
+	WRITE_ONCE(done, true);
+
+	pthread_join(mprotect_thread, NULL);
+
+	kvm_vm_free(vm);
+
+	return 0;
+}
-- 
2.36.0.rc2.479.g8af0fa9b8e-goog
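
P.S. For anyone who wants to chase the swap angle mentioned in the comment:
below is a rough, untested sketch of backing the memslot with a memfd and
forcibly reclaiming the page between vcpu runs.  The helper names are made
up for illustration, and using MADV_PAGEOUT (v5.4+, swap must be enabled,
needs <sys/mman.h>) is my assumption, not something this patch exercises.

/*
 * Hypothetical alternative: back guest memory with shmem via memfd and use
 * MADV_PAGEOUT to push the page out to swap between vcpu runs, since swap
 * is handled differently than writeback to a regular file.
 */
static uint64_t *map_shmem_page(void)
{
	int memfd = memfd_create("volatile_spte_test_mem", 0);
	uint64_t *mem;

	TEST_ASSERT(memfd >= 0, "memfd_create() failed, errno = %d (%s)",
		    errno, strerror(errno));
	TEST_ASSERT(!ftruncate(memfd, PAGE_SIZE), "ftruncate() failed");

	mem = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
		   memfd, 0);
	TEST_ASSERT(mem != MAP_FAILED, "mmap() failed, errno = %d (%s)",
		    errno, strerror(errno));
	return mem;
}

static void force_swap_out(void *mem)
{
	/* madvise() fails with EINVAL on pre-v5.4 kernels. */
	int r = madvise(mem, PAGE_SIZE, MADV_PAGEOUT);

	TEST_ASSERT(!r, "MADV_PAGEOUT failed, errno = %d (%s)",
		    errno, strerror(errno));
}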