On Wed, Sep 11, 2024 at 1:42 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> Add two phases to mmu_stress_test to verify that KVM correctly handles
> guest memory that was writable, and then made read-only in the primary MMU,
> and then made writable again.
>
> Add bonus coverage for x86 and arm64 to verify that all of guest memory was
> marked read-only. Making forward progress (without making memory writable)
> requires arch specific code to skip over the faulting instruction, but the
> test can at least verify each vCPU's starting page was made read-only for
> other architectures.
>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  tools/testing/selftests/kvm/mmu_stress_test.c | 104 +++++++++++++++++-
>  1 file changed, 101 insertions(+), 3 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
> index 50c3a17418c4..c07c15d7cc9a 100644
> --- a/tools/testing/selftests/kvm/mmu_stress_test.c
> +++ b/tools/testing/selftests/kvm/mmu_stress_test.c
> @@ -16,6 +16,8 @@
>  #include "guest_modes.h"
>  #include "processor.h"
>
> +static bool mprotect_ro_done;
> +
>  static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
>  {
>          uint64_t gpa;
> @@ -31,6 +33,42 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
>                  *((volatile uint64_t *)gpa);
>          GUEST_SYNC(2);
>
> +        /*
> +         * Write to the region while mprotect(PROT_READ) is underway. Keep
> +         * looping until the memory is guaranteed to be read-only, otherwise
> +         * vCPUs may complete their writes and advance to the next stage
> +         * prematurely.
> +         *
> +         * For architectures that support skipping the faulting instruction,
> +         * generate the store via inline assembly to ensure the exact length
> +         * of the instruction is known and stable (vcpu_arch_put_guest() on
> +         * fixed-length architectures should work, but the cost of paranoia
> +         * is low in this case). For x86, hand-code the exact opcode so that
> +         * there is no room for variability in the generated instruction.
> +         */
> +        do {
> +                for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
> +#ifdef __x86_64__
> +                        asm volatile(".byte 0x48,0x89,0x00" :: "a"(gpa) : "memory"); /* mov %rax, (%rax) */

I'm curious what you think about using labels (in asm, but perhaps also
in C) and *setting* the PC instead of incrementing the PC.  Diff attached
(tested on x86).

It might even be safe/okay to always use vcpu_arch_put_guest(), just set
the PC to a label immediately following it (rough sketch of that idea
below, after the attached diff).

I don't feel strongly, so feel free to ignore.

> +#elif defined(__aarch64__)
> +                        asm volatile("str %0, [%0]" :: "r" (gpa) : "memory");
> +#else
> +                        vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
> +#endif
> +        } while (!READ_ONCE(mprotect_ro_done));
> +
> +        /*
> +         * Only architectures that write the entire range can explicitly sync,
> +         * as other architectures will be stuck on the write fault.
> +         */
> +#if defined(__x86_64__) || defined(__aarch64__)
> +        GUEST_SYNC(3);
> +#endif
diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index c07c15d7cc9a..a7cced5c3e67 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -16,8 +16,13 @@
 #include "guest_modes.h"
 #include "processor.h"
 
+#if defined(__x86_64__) || defined(__aarch64__)
+extern void *skip_page;
+#endif
+
 static bool mprotect_ro_done;
+
 
 static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
 {
         uint64_t gpa;
@@ -40,18 +45,21 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
          * prematurely.
          *
          * For architectures that support skipping the faulting instruction,
-         * generate the store via inline assembly to ensure the exact length
-         * of the instruction is known and stable (vcpu_arch_put_guest() on
-         * fixed-length architectures should work, but the cost of paranoia
-         * is low in this case). For x86, hand-code the exact opcode so that
-         * there is no room for variability in the generated instruction.
+         * generate the store via inline assembly to ensure we can correctly
+         * adjust the PC upon faulting.
          */
         do {
                 for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
 #ifdef __x86_64__
-                        asm volatile(".byte 0x48,0x89,0x00" :: "a"(gpa) : "memory"); /* mov %rax, (%rax) */
+                        asm volatile(".global skip_page;"
+                                     "mov %0, (%0);"
+                                     "skip_page:;"
+                                     :: "r" (gpa) : "memory");
 #elif defined(__aarch64__)
-                        asm volatile("str %0, [%0]" :: "r" (gpa) : "memory");
+                        asm volatile(".global skip_page;"
+                                     "str %0, [%0];"
+                                     "skip_page:;"
+                                     :: "r" (gpa) : "memory");
 #else
                         vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
 #endif
@@ -170,10 +178,10 @@ static void *vcpu_worker(void *data)
                 TEST_ASSERT_EQ(errno, EFAULT);
 #if defined(__x86_64__)
                 WRITE_ONCE(vcpu->run->kvm_dirty_regs, KVM_SYNC_X86_REGS);
-                vcpu->run->s.regs.regs.rip += 3;
+                vcpu->run->s.regs.regs.rip = (vm_vaddr_t)&skip_page;
 #elif defined(__aarch64__)
                 vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc),
-                             vcpu_get_reg(vcpu, ARM64_CORE_REG(regs.pc)) + 4);
+                             (vm_vaddr_t)&skip_page);
 #endif
         }
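
And purely for illustration, here's roughly what I had in mind for the
vcpu_arch_put_guest() variant.  This is an untested sketch, and the
"skip_store" label name is made up (nothing in the selftests defines it
today): the guest drops the per-arch stores and just plants a global
label immediately after the put_guest, and the host sets the PC to that
label exactly like the diff above does with skip_page:

        /* Hypothetical, untested sketch -- guest side. */
        extern void *skip_store;
        ...
        do {
                for (gpa = start_gpa; gpa < end_gpa; gpa += stride) {
                        vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
                        /*
                         * Label immediately after the store; the host sets
                         * the PC here to skip the faulting instruction,
                         * regardless of how vcpu_arch_put_guest() compiles.
                         */
                        asm volatile(".global skip_store\n"
                                     "skip_store:" ::: "memory");
                }
        } while (!READ_ONCE(mprotect_ro_done));

        /* Hypothetical, untested sketch -- host side, mirroring the diff. */
#if defined(__x86_64__)
                vcpu->run->s.regs.regs.rip = (vm_vaddr_t)&skip_store;
#elif defined(__aarch64__)
                vcpu_set_reg(vcpu, ARM64_CORE_REG(regs.pc),
                             (vm_vaddr_t)&skip_store);
#endif

Same caveat as skip_page in the attached diff: the label has to be unique
in the final object, so this only holds together as long as the compiler
doesn't duplicate the loop body.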